You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
154 lines
5.3 KiB
Python
154 lines
5.3 KiB
Python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
import os
|
|
from copy import deepcopy
|
|
|
|
from docx import Document
|
|
from docx import shared
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.enum.section import WD_SECTION
|
|
from docx.oxml.ns import qn
|
|
from docx.enum.table import WD_TABLE_ALIGNMENT
|
|
|
|
from ppstructure.recovery.table_process import HtmlToDocx
|
|
|
|
from ppocr.utils.logging import get_logger
|
|
|
|
logger = get_logger()
|
|
|
|
|
|
def convert_info_docx(img, res, save_folder, img_name):
    """
    Write layout-analysis regions into a Word document.

    Regions with an empty "res" are skipped. Each remaining region is emitted
    according to its "type" (case-insensitive): "figure" inserts a picture,
    "title" adds a heading, "table" converts the region's HTML, and anything
    else becomes a text paragraph. A continuous section break is inserted
    whenever a region's "layout" switches between "single" and "double".

    args:
        img: not used by this function
        res(list): region dicts; the keys read here are "res", "img_idx",
            "layout", "type" and, for figures, "bbox"
        save_folder(str): directory that receives "<img_name>_ocr.docx";
            figure images are read from "<save_folder>/<img_name>/"
        img_name(str): base name used for the output file and figure folder
    return:
        None
    """
    document = Document()
    normal_style = document.styles["Normal"]
    normal_style.font.name = "Times New Roman"
    normal_style._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
    normal_style.font.size = shared.Pt(6.5)

    # Number of columns in the section currently being written (1 or 2).
    column_count = 1
    for region in res:
        if len(region["res"]) == 0:
            continue
        img_idx = region["img_idx"]
        layout = region["layout"]

        # Decide whether this region needs a different column count.
        if column_count == 2 and layout == "single":
            wanted_columns = 1
        elif column_count == 1 and layout == "double":
            wanted_columns = 2
        else:
            wanted_columns = column_count

        if wanted_columns != column_count:
            section = document.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath("./w:cols")[0].set(
                qn("w:num"), str(wanted_columns)
            )
            column_count = wanted_columns

        region_type = region["type"].lower()
        if region_type == "figure":
            figure_dir = os.path.join(save_folder, img_name)
            figure_path = os.path.join(
                figure_dir, "{}_{}.jpg".format(region["bbox"], img_idx)
            )
            picture_paragraph = document.add_paragraph()
            picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            picture_run = picture_paragraph.add_run("")
            # Pictures are narrower when the section has two columns.
            picture_width = shared.Inches(5 if column_count == 1 else 2)
            picture_run.add_picture(figure_path, width=picture_width)
        elif region_type == "title":
            document.add_heading(region["res"][0]["text"])
        elif region_type == "table":
            table_parser = HtmlToDocx()
            table_parser.table_style = "TableGrid"
            table_parser.handle_table(region["res"]["html"], document)
        else:
            text_paragraph = document.add_paragraph()
            # "res" is non-empty here, so the indent is always applied.
            text_paragraph.paragraph_format.first_line_indent = shared.Inches(0.25)
            for line in region["res"]:
                text_run = text_paragraph.add_run(line["text"] + " ")
                text_run.font.size = shared.Pt(10)

    # save to docx
    docx_path = os.path.join(save_folder, "{}_ocr.docx".format(img_name))
    document.save(docx_path)
    logger.info("docx save to {}".format(docx_path))
|
|
|
|
|
|
def sorted_layout_boxes(res, w):
    """
    Sort text boxes in order from top to bottom, left to right, and tag each
    one with a "layout" of "single" or "double".

    Boxes are sorted by (bbox[1], bbox[0]). Boxes classified as belonging to
    a left or right column are buffered; whenever a "single" box is met, the
    buffered left column is emitted first, then the right column, then the
    single box. Remaining buffered boxes are emitted at the end in the same
    left-then-right order.

    args:
        res(list): ppstructure results; each item is a dict whose "bbox" is
            indexed as [x0, y0, x1, y1]. A "layout" key is written into every
            item in place.
        w(int|float): width against which the x-coordinates are compared
            (thresholds at w/4, w/2 and 3*w/4)
    return:
        sorted results(list); when res holds exactly one item, res itself is
        returned
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]["layout"] = "single"
        return res

    boxes = sorted(res, key=lambda x: (x["bbox"][1], x["bbox"][0]))
    if not boxes:
        return []

    quarter_w = w / 4
    half_w = w / 2
    three_quarter_w = 3 * w / 4

    new_res = []
    res_left = []
    res_right = []

    # Every box except the last one is classified purely by its x-extent.
    for box in boxes[:-1]:
        x0 = box["bbox"][0]
        x1 = box["bbox"][2]
        if x0 < quarter_w and x1 < three_quarter_w:
            box["layout"] = "double"
            res_left.append(box)
        elif x0 > quarter_w and x1 > half_w:
            box["layout"] = "double"
            res_right.append(box)
        else:
            # A single-column box closes the two-column run gathered so far.
            new_res += res_left
            new_res += res_right
            box["layout"] = "single"
            new_res.append(box)
            res_left = []
            res_right = []

    # The last box additionally looks at the box sorted just before it.
    last = boxes[-1]
    x0 = last["bbox"][0]
    x1 = last["bbox"][2]
    starts_below_previous = last["bbox"][1] > boxes[-2]["bbox"][3]
    if starts_below_previous and x0 < half_w and x1 > half_w:
        # Straddles the midline below the previous box: single-column.
        new_res += res_left
        new_res += res_right
        last["layout"] = "single"
        new_res.append(last)
    else:
        last["layout"] = "double"
        # A box that neither reaches past the midline nor starts before it
        # (zero-width on the midline, or inverted x-coordinates) used to be
        # dropped together with all buffered boxes. It is now kept in the
        # right column because it starts at or beyond the midline.
        if x1 > half_w or x0 >= half_w:
            res_right.append(last)
        else:
            res_left.append(last)
        new_res += res_left
        new_res += res_right
    return new_res
|