You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
103 lines
3.0 KiB
Python
103 lines
3.0 KiB
Python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
Convert table labels to HTML.
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
from tqdm import tqdm
|
|
|
|
|
|
def save_pred_txt(key, val, tmp_file_path):
    """
    Append a single ``key<TAB>val`` record, terminated by a newline, to a file.

    @param key: value written before the tab
    @param val: value written after the tab
    @param tmp_file_path: path of the file to append to (opened as UTF-8)
    @return: None
    """
    with open(tmp_file_path, mode="a+", encoding="utf-8") as out_file:
        out_file.write(f"{key}\t{val}\n")
|
|
|
|
|
|
def skip_char(text, sp_char_list):
    """
    Remove every occurrence of the given substrings from a piece of text.

    The substrings are removed one after another, in list order, so a later
    entry can match text that only became adjacent after an earlier removal.

    @param text: text in cell
    @param sp_char_list: substrings to delete (style tags and special characters)
    @return: the text with all listed substrings removed
    """
    cleaned = text
    for marker in sp_char_list:
        if marker in cleaned:
            cleaned = cleaned.replace(marker, "")
    return cleaned
|
|
|
|
|
|
def gen_html(img):
    """
    Build a complete HTML table string from one tokenized annotation.

    The structure tokens are copied, the joined text of each cell is inserted
    right after its "<td>" or ">" structure token, and the result is wrapped
    in "<html><body><table>...</table></body></html>".

    @param img: annotation dict; reads img["html"]["structure"]["tokens"] and
                img["html"]["cells"] (each cell providing a "tokens" list)
    @return: the HTML string
    """
    # Substrings that do not count as content when deciding if a cell is empty.
    style_marks = ["<b>", "</b>", "\u2028", " ", "<i>", "</i>"]

    structure = img["html"]["structure"]["tokens"].copy()
    slots = []
    for pos, token in enumerate(structure):
        if token in ("<td>", ">"):
            slots.append(pos)

    # Walk back-to-front so earlier insert positions stay valid after each insert.
    for pos, cell in zip(reversed(slots), img["html"]["cells"][::-1]):
        pieces = cell["tokens"]
        if not pieces:
            continue
        content = "".join(pieces)
        # A cell holding only style tags / whitespace is left empty.
        if not skip_char(content, style_marks):
            continue
        structure.insert(pos + 1, content)

    table_body = "".join(structure)
    return "<html><body><table>{}</table></body></html>".format(table_body)
|
|
|
|
|
|
def load_gt_data(gt_path):
    """
    Load a ground-truth label file that holds one JSON object per line.

    Each non-blank line is decoded as UTF-8 and parsed with ``json.loads``;
    the parsed object is stored under its "filename" value. If the same
    filename appears more than once, the last record wins. Blank or
    whitespace-only lines are skipped instead of raising a JSON decode error.

    @param gt_path: path of the label file
    @return: dict mapping each record's "filename" to the parsed record
    """
    records = {}
    with open(gt_path, "rb") as f:
        lines = f.readlines()
    # The file is fully read above, so the handle is released before parsing.
    for line in tqdm(lines):
        data_line = line.decode("utf-8").strip()
        if not data_line:
            # e.g. a trailing empty line at EOF; json.loads("") would raise.
            continue
        info = json.loads(data_line)
        records[info["filename"]] = info
    return records
|
|
|
|
|
|
def convert(origin_gt_path, save_path):
    """
    Generate HTML for every record of a label file and append it to a text file.

    Records are loaded with load_gt_data, rendered with gen_html, and written
    one per line via save_pred_txt as "<image name><TAB><html>".

    @param origin_gt_path: path of the ground-truth label file to read
    @param save_path: path of the output file (records are appended)
    @return: None
    """
    labels = load_gt_data(origin_gt_path)
    for entry in tqdm(labels.items()):
        name, record = entry
        save_pred_txt(name, gen_html(record), save_path)
    print("conver finish")
|
|
|
|
|
|
def parse_args():
    """
    Parse the command-line options of this script.

    Both options are required strings:
      --ori_gt_path  label gt path
      --save_path    path to save file

    @return: argparse namespace with ``ori_gt_path`` and ``save_path``
    """
    parser = argparse.ArgumentParser(description="args for paddleserving")
    required_options = (
        ("--ori_gt_path", "label gt path"),
        ("--save_path", "path to save file"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)
    return parser.parse_args()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the two paths from the command line and convert.
    args = parse_args()
    convert(origin_gt_path=args.ori_gt_path, save_path=args.save_path)
|