# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import copy  # noqa: F401  (no longer used below; kept so the import surface is unchanged)
import re

# The four rowspan/colspan attribute combinations this module recognises.
# Order matters: the two-attribute forms must be tried before the
# single-attribute ones so that the longest form wins.
_SPAN_ATTRS = (
    r' rowspan="\d+" colspan="\d+"|'
    r' colspan="\d+" rowspan="\d+"|'
    r' rowspan="\d+"|'
    r' colspan="\d+"'
)

# An empty <td></td> followed by span attributes that ended up *outside* the
# opening tag, e.g. `<td></td> rowspan="2"></b></td>`.  Group 1 is the span text.
_ISOLATE_SPAN_RE = re.compile(r"<td></td>(" + _SPAN_ATTRS + r")></b></td>")

# Opening <td ...> tag that carries a rowspan and/or colspan attribute.
_SPAN_TD_OPEN_RE = re.compile(r"<td(?:" + _SPAN_ATTRS + r")>")

# One complete cell: group 1 is the opening tag (with or without span
# attributes), group 2 is the cell content up to the first </td>.
_TD_CELL_RE = re.compile(r"(<td(?:" + _SPAN_ATTRS + r")?>)(.*?)</td>")

# The first <thead>...</thead> section of a token string.
_THEAD_RE = re.compile(r"<thead>(.*?)</thead>")

# Empty-bbox placeholder tokens and the <td> markup each one expands to.
_EB_TOKEN_TO_TD = (
    ("<eb></eb>", "<td></td>"),
    ("<eb1></eb1>", "<td> </td>"),
    ("<eb2></eb2>", "<td><b> </b></td>"),
    ("<eb3></eb3>", "<td>\u2028\u2028</td>"),
    ("<eb4></eb4>", "<td><sup> </sup></td>"),
    ("<eb5></eb5>", "<td><b></b></td>"),
    ("<eb6></eb6>", "<td><i> </i></td>"),
    ("<eb7></eb7>", "<td><b><i></i></b></td>"),
    ("<eb8></eb8>", "<td><b><i> </i></b></td>"),
    ("<eb9></eb9>", "<td><i></i></td>"),
    ("<eb10></eb10>", "<td><b> \u2028 \u2028 </b></td>"),
)


def deal_isolate_span(thead_part):
    """
    Deal with isolate span cases in this function.
    It is caused by a wrong prediction of the structure recognition model,
    eg. predicting <td rowspan="2"></td> as <td></td> rowspan="2"></b></td>.

    Every such isolated token is rewritten to a well-formed
    <td rowspan=".." colspan=".."></td>; everything else is left untouched.

    :param thead_part: the <thead>...</thead> string to repair.
    :return: the repaired string.
    """
    # Group 1 holds the stray span attributes; move them into the opening tag.
    return _ISOLATE_SPAN_RE.sub(r"<td\1></td>", thead_part)


def _keep_single_bold(cell_match):
    """Rebuild one matched cell so that it holds at most one <b></b> pair."""
    open_tag, content = cell_match.group(1), cell_match.group(2)
    if content.count("<b>") <= 1 and content.count("</b>") <= 1:
        return cell_match.group(0)
    # Drop every bold tag, then wrap the whole content once.  The opening <b>
    # is added after the opening tag whatever its attributes are, so spanned
    # cells stay balanced too.
    content = content.replace("<b>", "").replace("</b>", "")
    return f"{open_tag}<b>{content}</b></td>"


def deal_duplicate_bb(thead_part):
    """
    Deal duplicate <b> or </b> after replace.
    Keep one <b></b> in a <td></td> token.

    A cell whose content holds more than one <b> or more than one </b> has
    all of them removed and is then wrapped in a single <b>...</b>.  Cells
    with at most one of each are returned unchanged.

    :param thead_part: the <thead>...</thead> string to clean.
    :return: the cleaned string.
    """
    return _TD_CELL_RE.sub(_keep_single_bold, thead_part)


def deal_bb(result_token):
    """
    In our opinion, <b></b> always occurs in <thead></thead> text's context.
    This function finds all cells in the first <thead></thead> section and
    inserts <b></b> around their content.

    :param result_token: the full token string of a table.
    :return: result_token with its <thead> part rewritten, or result_token
        unchanged when it has no <thead>...</thead> section.
    """
    thead_match = _THEAD_RE.search(result_token)
    if thead_match is None:
        return result_token
    origin_thead_part = thead_match.group()
    thead_part = origin_thead_part

    if _SPAN_TD_OPEN_RE.search(thead_part) is None:
        # Branch 1: no "rowspan"/"colspan" in <thead></thead>.
        # 1. replace <td> with <td><b>, and </td> with </b></td>
        # 2. the text-line recogniser may itself have produced <b> or </b>,
        #    so collapse <b><b> to <b> and </b></b> to </b>
        thead_part = (
            thead_part.replace("<td>", "<td><b>")
            .replace("</td>", "</b></td>")
            .replace("<b><b>", "<b>")
            .replace("</b></b>", "</b>")
        )
    else:
        # Branch 2: at least one cell carries "rowspan"/"colspan".
        # Spanned cells first: append <b> right after each spanned opening tag.
        thead_part = _SPAN_TD_OPEN_RE.sub(r"\g<0><b>", thead_part)
        # Close the bold tag in front of every </td>.
        thead_part = thead_part.replace("</td>", "</b></td>")
        # Collapse runs of adjacent <b> / </b> into a single tag.
        thead_part = re.sub(r"(<b>)+", "<b>", thead_part)
        thead_part = re.sub(r"(</b>)+", "</b>", thead_part)
        # Ordinary cells, handled like branch 1.
        thead_part = thead_part.replace("<td>", "<td><b>").replace("<b><b>", "<b>")

    # Convert <td><b></b></td> back to <td></td>: an empty cell has no <b></b>.
    # A cell holding only a space keeps its <b> </b>.
    thead_part = thead_part.replace("<td><b></b></td>", "<td></td>")
    # Deal with duplicated <b></b>.
    thead_part = deal_duplicate_bb(thead_part)
    # Deal with isolate span tokens caused by a wrong structure prediction,
    # eg. PMC5994107_011_00.png
    thead_part = deal_isolate_span(thead_part)
    # Replace the original <thead> part with the new one.
    return result_token.replace(origin_thead_part, thead_part)


def deal_eb_token(master_token):
    """
    Post process with <eb></eb>, <eb1></eb1>, ...
    emptyBboxTokenDict = {
        "[]": '<eb></eb>',
        "[' ']": '<eb1></eb1>',
        "['<b>', ' ', '</b>']": '<eb2></eb2>',
        "['\\u2028', '\\u2028']": '<eb3></eb3>',
        "['<sup>', ' ', '</sup>']": '<eb4></eb4>',
        "['<b>', '</b>']": '<eb5></eb5>',
        "['<i>', ' ', '</i>']": '<eb6></eb6>',
        "['<b>', '<i>', '</i>', '</b>']": '<eb7></eb7>',
        "['<b>', '<i>', ' ', '</i>', '</b>']": '<eb8></eb8>',
        "['<i>', '</i>']": '<eb9></eb9>',
        "['<b>', ' ', '\\u2028', ' ', '\\u2028', ' ', '</b>']": '<eb10></eb10>',
    }

    Each placeholder token is replaced by the <td>...</td> markup it stands for.

    :param master_token: token string that may contain <ebN></ebN> placeholders.
    :return: the string with every placeholder expanded.
    """
    for eb_token, td_markup in _EB_TOKEN_TO_TD:
        master_token = master_token.replace(eb_token, td_markup)
    return master_token


def distance(box_1, box_2):
    """
    Compute a corner-based L1 distance between two boxes.

    The result is the summed absolute difference of all four coordinates,
    plus the smaller of the first-corner and second-corner differences.

    :param box_1: (x1, y1, x2, y2)
    :param box_2: (x3, y3, x4, y4)
    :return: the distance value.
    """
    x1, y1, x2, y2 = box_1
    x3, y3, x4, y4 = box_2
    first_corner = abs(x3 - x1) + abs(y3 - y1)
    second_corner = abs(x4 - x2) + abs(y4 - y2)
    return first_corner + second_corner + min(first_corner, second_corner)


def compute_iou(rec1, rec2):
    """
    computing IoU
    :param rec1: (y0, x0, y1, x1), which reflects
            (top, left, bottom, right)
    :param rec2: (y0, x0, y1, x1)
    :return: scalar value of IoU; 0.0 when the rectangles do not overlap.
    """
    # computing area of each rectangle
    S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1])
    S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1])

    # computing the sum_area
    sum_area = S_rec1 + S_rec2

    # find each edge of the intersection rectangle
    left_line = max(rec1[1], rec2[1])
    right_line = min(rec1[3], rec2[3])
    top_line = max(rec1[0], rec2[0])
    bottom_line = min(rec1[2], rec2[2])

    # judge if there is an intersection (touching edges count as none)
    if left_line >= right_line or top_line >= bottom_line:
        return 0.0

    intersect = (right_line - left_line) * (bottom_line - top_line)
    return intersect / (sum_area - intersect)