You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
129 lines
4.1 KiB
Python
129 lines
4.1 KiB
Python
"""
|
|
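Clean downloaded Markdown files.

Command-line script: for the tool given by --tool_name, every ``.md`` file in
``<download_dir>/<tool_name>`` is copied to the ``cleaned`` sub-directory with
its Markdown image references removed.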
|
|
"""
|
|
import argparse
|
|
import os
|
|
import re
|
|
import htmltabletomd # type: ignore
|
|
import pypandoc
|
|
import argparse
|
|
|
|
# Command-line interface: both options are mandatory strings.
parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument("--tool_name", required=True, type=str, help="input tool name")
parser.add_argument("--download_dir", required=True, type=str, help="input download dir")
# NOTE: arguments are parsed at module level, so this runs on import as well.
args = parser.parse_args()
|
|
|
|
def clean_markdown_images(content):
    """
    Remove Markdown image references from a string.

    Every ``![alt](target)`` occurrence is deleted; all other text,
    including ordinary ``[text](target)`` links, is returned unchanged.
    """
    image_reference = r'!\[[^\]]*\]\([^)]*\)'
    return re.sub(image_reference, '', content, flags=re.IGNORECASE)
|
|
|
|
def clean_ocrmath_photo(content):
    """
    Remove LaTeX includegraphics commands from a string.

    Only commands that carry a bracketed option block followed by a braced
    argument are matched (case-insensitively); a command without the
    option block is left in place.
    """
    graphics_command = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
    return graphics_command.sub('', content)
|
|
|
|
def convert_html_table_to_md(html_table):
    """
    Turn a line-oriented HTML table into pipe-delimited Markdown rows.

    The input is processed one line at a time, and only when its first line
    contains ``<tr>``. A line holding ``<th>`` cells becomes one row; a line
    holding ``<td>`` cells becomes a row only after a ``<th>`` line has been
    seen. No header separator row is emitted.
    """
    source_lines = html_table.strip().split('\n')
    # NOTE(review): the indentation of the original was lost in this copy; the
    # empty-string result for input without a leading <tr> assumes the final
    # newline normalisation sat inside the `if` - confirm against the real file.
    if not source_lines or '<tr>' not in source_lines[0]:
        return ''

    rows = []
    header_seen = False
    for text in source_lines:
        if '<th>' in text:
            cells = re.findall(r'<th>(.*?)</th>', text)
            header_seen = True
        elif header_seen and '<td>' in text:
            cells = re.findall(r'<td>(.*?)</td>', text)
        else:
            continue
        rows.append('| ' + ' | '.join(cells) + ' |\n')

    # Collapse trailing whitespace to exactly one newline.
    return ''.join(rows).rstrip() + '\n'
|
|
|
|
def convert_latext_to_md(content):
    """
    Convert LaTeX ``tabular`` environments embedded in text to Markdown.

    Each tabular environment is first swapped for a unique HTML-comment
    placeholder, converted with pandoc through ``pypandoc``, and the
    placeholders are then replaced by the conversion results. A table that
    cannot be converted is put back verbatim.

    Args:
        content: Text that may contain tabular environments.

    Returns:
        The text with every convertible table replaced by its Markdown form.
    """
    table_bodies = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
    placeholders = []
    for body in table_bodies:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        # Rebuild exactly the text the regex matched so it can be found in
        # content (a stray "cl" suffix previously made this never match).
        latex_table = f"\\begin{{tabular}}{body}\\end{{tabular}}"
        content = content.replace(latex_table, placeholder)
        try:
            # Without an outputfile, convert_text returns the converted string,
            # so no scratch file is left in the working directory.
            markdown_string = pypandoc.convert_text(
                latex_table, to="md", format="latex", encoding="utf-8"
            )
        except Exception:
            # Best effort: keep the original LaTeX when pandoc fails.
            markdown_string = latex_table
        placeholders.append((placeholder, markdown_string))

    # Substitute in a second pass so converted output is never re-scanned.
    for placeholder, md_table in placeholders:
        content = content.replace(placeholder, md_table)
    return content
|
|
|
|
|
|
def convert_htmltale_to_md(content):
    """
    Convert HTML tables embedded in text to Markdown tables.

    Every ``<table>...</table>`` span (bare ``<table>`` tag only, no
    attributes) is swapped for a unique HTML-comment placeholder, its inner
    markup is converted with ``htmltabletomd.convert_table``, and the
    placeholders are then replaced by the conversion results.

    Args:
        content: Text that may contain HTML tables.

    Returns:
        The text with each table replaced by its Markdown form. If a
        conversion fails, the table's inner markup is kept instead, without
        the surrounding ``<table>`` tags.
    """
    table_bodies = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
    placeholders = []
    for body in table_bodies:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        content = content.replace(f"<table>{body}</table>", placeholder)
        try:
            converted = htmltabletomd.convert_table(body)
        except Exception:
            # Best effort: fall back to the raw inner markup, but let
            # KeyboardInterrupt / SystemExit propagate.
            converted = body
        placeholders.append((placeholder, converted))

    # Substitute in a second pass so converted output is never re-scanned.
    for placeholder, md_table in placeholders:
        content = content.replace(placeholder, md_table)
    return content
|
|
|
|
def clean_data(prod_type, download_dir):
    """
    Write image-free copies of the Markdown files produced by one tool.

    Every ``*.md`` file directly inside ``<download_dir>/<prod_type>`` is
    read, passed through ``clean_markdown_images`` and saved as
    ``<download_dir>/<prod_type>/cleaned/cleaned_<name>``.

    Args:
        prod_type: Name of the tool sub-directory under ``download_dir``.
        download_dir: Root directory holding the per-tool output.

    Raises:
        OSError: If the source directory cannot be listed, or a file cannot
            be read or written.
    """
    source_dir = os.path.join(download_dir, prod_type)
    tgt_dir = os.path.join(source_dir, "cleaned")
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(tgt_dir, exist_ok=True)

    for filename in os.listdir(source_dir):
        if not filename.endswith('.md'):
            continue
        input_file = os.path.join(source_dir, filename)
        output_file = os.path.join(tgt_dir, "cleaned_" + filename)
        with open(input_file, 'r', encoding='utf-8') as fr:
            content = fr.read()
        # The input is closed before the output is opened.
        with open(output_file, 'w', encoding='utf-8') as fw:
            fw.write(clean_markdown_images(content))
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: clean the output of the tool named on the command line.
    tool_type, download_dir = args.tool_name, args.download_dir
    clean_data(tool_type, download_dir)
|