You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
250 lines
9.4 KiB
Python
250 lines
9.4 KiB
Python
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# -*- encoding: utf-8 -*-
|
|
# @Author: SWHL
|
|
# @Contact: liekkaskono@163.com
|
|
import copy
|
|
import re
|
|
|
|
|
|
def deal_isolate_span(thead_part):
    """
    Repair "isolated" span attributes in a table-head token string.

    The structure recognition model sometimes emits the span attributes
    after an already closed cell, e.g. ``<td></td> rowspan="2"></b></td>``
    instead of ``<td rowspan="2"></td>``. Every such fragment is rewritten so
    that the attributes move back into the opening ``<td`` tag and the stray
    ``</b>`` disappears.

    :param thead_part: token string to repair (deal_bb passes the
        ``<thead>...</thead>`` part).
    :return: the string with every isolated span fragment repaired; a string
        without such fragments is returned unchanged.
    """
    # Raw strings keep ``\d`` a regex escape instead of an invalid string
    # escape. The single capture group holds the misplaced attribute text
    # (including its leading space); the combined rowspan/colspan forms come
    # first so they win over the single-attribute forms.
    isolate_pattern = (
        r"<td></td>("
        r' rowspan="\d+" colspan="\d+"|'
        r' colspan="\d+" rowspan="\d+"|'
        r' rowspan="\d+"|'
        r' colspan="\d+"'
        r")></b></td>"
    )
    # Re-attach the captured attributes to the opening tag in one pass.
    return re.sub(isolate_pattern, r"<td\1></td>", thead_part)
|
|
|
|
|
|
def deal_duplicate_bb(thead_part):
    """
    Collapse duplicated ``<b>`` / ``</b>`` tags so each cell keeps one pair.

    Every ``<td ...>...</td>`` cell (plain, or carrying ``rowspan`` and/or
    ``colspan``) that contains more than one ``<b>`` or more than one
    ``</b>`` has all of its bold tags removed and its content re-wrapped in a
    single ``<b>...</b>`` pair. Cells with at most one of each tag are left
    exactly as they are.

    :param thead_part: token string to clean (deal_bb passes the
        ``<thead>...</thead>`` part).
    :return: the string with duplicated bold tags collapsed per cell.
    """
    # Group 1: opening tag with optional span attributes, group 2: cell
    # content, group 3: closing tag. ``.*?`` (not ``.+?``) lets an empty span
    # cell match on its own instead of swallowing the cell that follows it.
    td_pattern = (
        r'(<td(?: rowspan="\d+" colspan="\d+"'
        r'| colspan="\d+" rowspan="\d+"'
        r'| rowspan="\d+"'
        r'| colspan="\d+")?>)'
        r"(.*?)(</td>)"
    )

    def _single_bold(match):
        # Return the cell untouched unless it holds duplicated bold tags.
        cell = match.group(0)
        if cell.count("<b>") <= 1 and cell.count("</b>") <= 1:
            return cell
        open_tag, content, close_tag = match.groups()
        content = content.replace("<b>", "").replace("</b>", "")
        # Re-wrap after the real opening tag so cells with span attributes
        # get a balanced <b>...</b> pair too, not just a trailing </b>.
        return f"{open_tag}<b>{content}</b>{close_tag}"

    return re.sub(td_pattern, _single_bold, thead_part)
|
|
|
|
|
|
def deal_bb(result_token):
    """
    Insert ``<b></b>`` around the content of every cell in ``<thead>``.

    The working assumption is that bold text always occurs inside
    ``<thead></thead>``. The first ``<thead>...</thead>`` span found in
    ``result_token`` is extracted, each of its cells is wrapped in a single
    ``<b>...</b>`` pair (empty cells stay empty), and the rewritten span is
    substituted back into the token string.

    :param result_token: token string of the whole predicted table.
    :return: the token string with the bold tags inserted; when it has no
        ``<thead>...</thead>`` span it is returned unchanged.
    """
    # Search once and reuse the match; a str is immutable, so keeping the
    # original text for the final replace needs no copy.
    thead_match = re.search("<thead>(.*?)</thead>", result_token)
    if thead_match is None:
        return result_token
    origin_thead_part = thead_match.group()
    thead_part = origin_thead_part

    # Opening tags that carry "rowspan" and/or "colspan". Raw strings keep
    # ``\d`` a regex escape rather than an invalid string escape.
    span_pattern = (
        r'<td rowspan="\d+" colspan="\d+">|'
        r'<td colspan="\d+" rowspan="\d+">|'
        r'<td rowspan="\d+">|'
        r'<td colspan="\d+">'
    )

    if re.search(span_pattern, thead_part) is None:
        # Branch 1: no span attributes in <thead>.
        # Wrap every cell, then fold the doubled tags that appear when the
        # text-line recognition already predicted <b> or </b> itself.
        thead_part = (
            thead_part.replace("<td>", "<td><b>")
            .replace("</td>", "</b></td>")
            .replace("<b><b>", "<b>")
            .replace("</b></b>", "</b>")
        )
    else:
        # Branch 2: at least one cell has span attributes.
        # Open the bold tag right after every span-carrying <td ...> tag.
        thead_part = re.sub(span_pattern, r"\g<0><b>", thead_part)

        # Close the bold tag in front of every </td>.
        thead_part = thead_part.replace("</td>", "</b></td>")

        # Collapse runs of repeated <b> / </b> into a single tag.
        thead_part = re.sub("(<b>)+", "<b>", thead_part)
        thead_part = re.sub("(</b>)+", "</b>", thead_part)

        # Ordinary cells without attributes are handled as in branch 1.
        thead_part = thead_part.replace("<td>", "<td><b>").replace("<b><b>", "<b>")

    # An empty cell carries no <b></b>; a cell holding only a space keeps
    # its <td><b> </b></td> form because it does not match this literal.
    thead_part = thead_part.replace("<td><b></b></td>", "<td></td>")
    # Keep a single <b></b> pair per cell.
    thead_part = deal_duplicate_bb(thead_part)
    # Repair span attributes the structure model placed after a closed cell
    # (eg. PMC5994107_011_00.png).
    thead_part = deal_isolate_span(thead_part)
    # Put the rewritten <thead> back in place of the original one.
    return result_token.replace(origin_thead_part, thead_part)
|
|
|
|
|
|
def deal_eb_token(master_token):
    """
    Expand the empty-bbox placeholder tokens into literal ``<td>`` cells.

    Each ``<ebN></ebN>`` placeholder (``<eb></eb>`` and ``<eb1></eb1>``
    through ``<eb10></eb10>``) stands for one fixed "empty" cell layout:
    completely empty, a single space, U+2028 line separators, or empty /
    space-only ``<b>``, ``<i>`` and ``<sup>`` wrappers. The table below pairs
    every placeholder with the cell it is replaced by.

    :param master_token: token string that may contain placeholders.
    :return: the token string with every placeholder expanded.
    """
    eb_to_cell = (
        ("<eb></eb>", "<td></td>"),
        ("<eb1></eb1>", "<td> </td>"),
        ("<eb2></eb2>", "<td><b> </b></td>"),
        ("<eb3></eb3>", "<td>\u2028\u2028</td>"),
        ("<eb4></eb4>", "<td><sup> </sup></td>"),
        ("<eb5></eb5>", "<td><b></b></td>"),
        ("<eb6></eb6>", "<td><i> </i></td>"),
        ("<eb7></eb7>", "<td><b><i></i></b></td>"),
        ("<eb8></eb8>", "<td><b><i> </i></b></td>"),
        ("<eb9></eb9>", "<td><i></i></td>"),
        ("<eb10></eb10>", "<td><b> \u2028 \u2028 </b></td>"),
    )
    expanded = master_token
    for placeholder, cell in eb_to_cell:
        expanded = expanded.replace(placeholder, cell)
    return expanded
|
|
|
|
|
|
def distance(box_1, box_2):
    """
    Return a corner-based distance between two four-value boxes.

    The result is the sum of the absolute differences of all four
    coordinates, plus the smaller of the two per-corner sums (first
    coordinate pair versus second coordinate pair).

    :param box_1: sequence of exactly four numbers.
    :param box_2: sequence of exactly four numbers.
    :return: the distance value; 0 for identical boxes.
    """
    a_x1, a_y1, a_x2, a_y2 = box_1
    b_x1, b_y1, b_x2, b_y2 = box_2

    # Offset of the first corner and of the second corner, taken separately.
    first_corner_gap = abs(b_x1 - a_x1) + abs(b_y1 - a_y1)
    second_corner_gap = abs(b_x2 - a_x2) + abs(b_y2 - a_y2)

    # Total over all four coordinates, accumulated left to right.
    total_gap = first_corner_gap + abs(b_x2 - a_x2) + abs(b_y2 - a_y2)
    return total_gap + min(first_corner_gap, second_corner_gap)
|
|
|
|
|
|
def compute_iou(rec1, rec2):
    """
    Compute the intersection-over-union of two rectangles.

    :param rec1: (y0, x0, y1, x1), i.e. (top, left, bottom, right).
    :param rec2: (y0, x0, y1, x1), same layout as ``rec1``.
    :return: the IoU value; 0.0 when the rectangles do not overlap (merely
        touching edges count as no overlap).
    """
    # Areas of the two rectangles and their plain sum.
    area_first = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1])
    area_second = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1])
    combined_area = area_first + area_second

    # Edges of the candidate intersection rectangle.
    inter_left = max(rec1[1], rec2[1])
    inter_right = min(rec1[3], rec2[3])
    inter_top = max(rec1[0], rec2[0])
    inter_bottom = min(rec1[2], rec2[2])

    # No overlap along either axis means an IoU of zero.
    if inter_left >= inter_right or inter_top >= inter_bottom:
        return 0.0

    overlap = (inter_right - inter_left) * (inter_bottom - inter_top)
    # Union = sum of both areas minus the part counted twice.
    union = combined_area - overlap
    return (overlap / union) * 1.0
|