# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import copy
import re


def deal_isolate_span(thead_part):
"""
Deal with isolate span cases in this function.
It causes by wrong prediction in structure recognition model.
eg. predict
| to | rowspan="2">.
:param thead_part:
:return:
"""
# 1. find out isolate span tokens.
isolate_pattern = (
' | rowspan="(\d)+" colspan="(\d)+">|'
' | colspan="(\d)+" rowspan="(\d)+">|'
' | rowspan="(\d)+">|'
' | colspan="(\d)+">'
)
isolate_iter = re.finditer(isolate_pattern, thead_part)
isolate_list = [i.group() for i in isolate_iter]
    # 2. find out the span number from the step 1 results.
    span_pattern = (
        r' rowspan="(\d)+" colspan="(\d)+"|'
        r' colspan="(\d)+" rowspan="(\d)+"|'
        r' rowspan="(\d)+"|'
        r' colspan="(\d)+"'
    )
    corrected_list = []
    for isolate_item in isolate_list:
        span_part = re.search(span_pattern, isolate_item)
        # 3. merge the span number into the span token format string.
        if span_part is not None:
            corrected_item = f"<td{span_part.group()}></td>"
            corrected_list.append(corrected_item)
        else:
            corrected_list.append(None)
    # 4. replace original isolated token.
    for corrected_item, isolate_item in zip(corrected_list, isolate_list):
        if corrected_item is not None:
            thead_part = thead_part.replace(isolate_item, corrected_item)
return thead_part
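

# Illustrative example (hypothetical token string, assuming the broken-span format
# described in the docstring above):
#   deal_isolate_span('<thead><tr><td></td> rowspan="2"></b></td></tr></thead>')
#   returns '<thead><tr><td rowspan="2"></td></tr></thead>'
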
def deal_duplicate_bb(thead_part):
"""
Deal duplicate or after replace.
Keep one in a | token.
:param thead_part:
:return:
"""
# 1. find out | in .
td_pattern = (
'(.+?) | |'
'(.+?) | |'
'(.+?) | |'
'(.+?) | |'
"(.*?) | "
)
td_iter = re.finditer(td_pattern, thead_part)
td_list = [t.group() for t in td_iter]
    # 2. does a <td></td> token contain multiple <b></b> pairs or not?
    new_td_list = []
    for td_item in td_list:
        if td_item.count("<b>") > 1 or td_item.count("</b>") > 1:
            # multiple <b></b> in one <td></td> case.
            # 1. remove all <b> and </b>
            td_item = td_item.replace("<b>", "").replace("</b>", "")
            # 2. replace <td> -> <td><b>, </td> -> </b></td>.
            td_item = td_item.replace("<td>", "<td><b>").replace("</td>", "</b></td>")
            new_td_list.append(td_item)
        else:
            new_td_list.append(td_item)
# 3. replace original thead part.
for td_item, new_td_item in zip(td_list, new_td_list):
thead_part = thead_part.replace(td_item, new_td_item)
return thead_part
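

# Illustrative example (hypothetical cell content, assuming duplicated <b></b> pairs
# were introduced by the earlier replacements in deal_bb):
#   deal_duplicate_bb('<thead><tr><td><b>col</b><b>1</b></td></tr></thead>')
#   returns '<thead><tr><td><b>col1</b></td></tr></thead>'
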
def deal_bb(result_token):
"""
In our opinion, always occurs in text's context.
This function will find out all tokens in and insert by manual.
:param result_token:
:return:
"""
# find out parts.
thead_pattern = "(.*?)"
if re.search(thead_pattern, result_token) is None:
return result_token
thead_part = re.search(thead_pattern, result_token).group()
origin_thead_part = copy.deepcopy(thead_part)
# check "rowspan" or "colspan" occur in parts or not .
span_pattern = '| | | | | | '
span_iter = re.finditer(span_pattern, thead_part)
span_list = [s.group() for s in span_iter]
has_span_in_head = True if len(span_list) > 0 else False
    if not has_span_in_head:
        # <thead></thead> does not include "rowspan" or "colspan": branch 1.
        # 1. replace <td> with <td><b>, and </td> with </b></td>
        # 2. the text-line recognition may already predict text containing <b> or </b>,
        #    so replace <b><b> with <b>, and </b></b> with </b>
        thead_part = (
            thead_part.replace("<td>", "<td><b>")
            .replace("</td>", "</b></td>")
            .replace("<b><b>", "<b>")
            .replace("</b></b>", "</b>")
        )
    else:
        # <thead></thead> includes "rowspan" or "colspan": branch 2.
        # Firstly, deal with the rowspan/colspan cases:
        # 1. replace ">" with "><b>"
        # 2. replace "</td>" with "</b></td>"
        # 3. the text-line recognition may already predict text containing <b> or </b>,
        #    so collapse duplicated <b> and </b> tags
        # Secondly, deal with the ordinary cases like branch 1.

        # replace ">" with "><b>"
        replaced_span_list = []
        for sp in span_list:
            replaced_span_list.append(sp.replace(">", "><b>"))
        for sp, rsp in zip(span_list, replaced_span_list):
            thead_part = thead_part.replace(sp, rsp)

        # replace "</td>" with "</b></td>"
        thead_part = thead_part.replace("</td>", "</b></td>")

        # remove duplicated <b> and </b> with re.sub
        mb_pattern = "(<b>)+"
        single_b_string = "<b>"
        thead_part = re.sub(mb_pattern, single_b_string, thead_part)

        mgb_pattern = "(</b>)+"
        single_gb_string = "</b>"
        thead_part = re.sub(mgb_pattern, single_gb_string, thead_part)

        # ordinary cases like branch 1
        thead_part = thead_part.replace("<td>", "<td><b>").replace("<b><b>", "<b>")
    # convert <td><b></b></td> back to <td></td>; an empty cell has no <b></b>,
    # but a space cell (<td> </td>) does become <td><b> </b></td>.
    thead_part = thead_part.replace("<td><b></b></td>", "<td></td>")
    # deal with duplicated <b></b>
    thead_part = deal_duplicate_bb(thead_part)
    # deal with isolated span tokens, which are caused by wrong predictions of the structure model.
    # e.g. PMC5994107_011_00.png
    thead_part = deal_isolate_span(thead_part)
# replace original result with new thead part.
result_token = result_token.replace(origin_thead_part, thead_part)
return result_token
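

# Illustrative example (hypothetical structure tokens; branch 1, i.e. no rowspan/colspan
# in the header):
#   deal_bb('<table><thead><tr><td>Name</td><td>Age</td></tr></thead></table>')
#   returns '<table><thead><tr><td><b>Name</b></td><td><b>Age</b></td></tr></thead></table>'
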
def deal_eb_token(master_token):
"""
post process with , , ...
emptyBboxTokenDict = {
"[]": '',
"[' ']": '',
"['', ' ', '']": '',
"['\\u2028', '\\u2028']": '',
"['', ' ', '']": '',
"['', '']": '',
"['', ' ', '']": '',
"['', '', '', '']": '',
"['', '', ' ', '', '']": '',
"['', '']": '',
"['', ' ', '\\u2028', ' ', '\\u2028', ' ', '']": '',
}
:param master_token:
:return:
"""
    master_token = master_token.replace("<eb></eb>", "<td></td>")
    master_token = master_token.replace("<eb1></eb1>", "<td> </td>")
    master_token = master_token.replace("<eb2></eb2>", "<td><b> </b></td>")
    master_token = master_token.replace("<eb3></eb3>", "<td>\u2028\u2028</td>")
    master_token = master_token.replace("<eb4></eb4>", "<td><sup> </sup></td>")
    master_token = master_token.replace("<eb5></eb5>", "<td><b></b></td>")
    master_token = master_token.replace("<eb6></eb6>", "<td><i> </i></td>")
    master_token = master_token.replace("<eb7></eb7>", "<td><b><i></i></b></td>")
    master_token = master_token.replace("<eb8></eb8>", "<td><b><i> </i></b></td>")
    master_token = master_token.replace("<eb9></eb9>", "<td><i></i></td>")
    master_token = master_token.replace(
        "<eb10></eb10>", "<td><b> \u2028 \u2028 </b></td>"
    )
return master_token
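

# Illustrative example (hypothetical master tokens emitted by the structure model):
#   deal_eb_token('<thead><eb></eb><eb2></eb2></thead>')
#   returns '<thead><td></td><td><b> </b></td></thead>'
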
def distance(box_1, box_2):
    """
    Compute a distance between two boxes given as (x1, y1, x2, y2):
    the summed L1 distance over all four coordinates, plus the smaller of
    the two per-corner L1 distances.
    """
    x1, y1, x2, y2 = box_1
    x3, y3, x4, y4 = box_2
    dis = abs(x3 - x1) + abs(y3 - y1) + abs(x4 - x2) + abs(y4 - y2)
    dis_2 = abs(x3 - x1) + abs(y3 - y1)
    dis_3 = abs(x4 - x2) + abs(y4 - y2)
    return dis + min(dis_2, dis_3)
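

# Worked example (assumed (x1, y1, x2, y2) box layout, as in the function above):
#   distance((0, 0, 10, 10), (1, 1, 10, 10))
#   = (1 + 1 + 0 + 0) + min(1 + 1, 0 + 0) = 2
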
def compute_iou(rec1, rec2):
"""
computing IoU
:param rec1: (y0, x0, y1, x1), which reflects
(top, left, bottom, right)
:param rec2: (y0, x0, y1, x1)
    :return: scalar value of IoU
"""
    # compute the area of each rectangle
    S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1])
    S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1])
    # compute the sum of the two areas
    sum_area = S_rec1 + S_rec2
    # find each edge of the intersection rectangle
left_line = max(rec1[1], rec2[1])
right_line = min(rec1[3], rec2[3])
top_line = max(rec1[0], rec2[0])
bottom_line = min(rec1[2], rec2[2])
    # check whether there is an intersection
if left_line >= right_line or top_line >= bottom_line:
return 0.0
    intersect = (right_line - left_line) * (bottom_line - top_line)
    return intersect / (sum_area - intersect)
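

if __name__ == "__main__":
    # Minimal smoke test (hypothetical boxes, not taken from any dataset):
    # two partially overlapping squares share one unit of area, so IoU = 1 / 7.
    print(compute_iou((0, 0, 2, 2), (1, 1, 3, 3)))  # ~0.142857
    # identical boxes give IoU = 1.0.
    print(compute_iou((0, 0, 10, 10), (0, 0, 10, 10)))  # 1.0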