You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

48 lines
1.5 KiB
Python

10 months ago
import os
import re
import json
def extract_qa_from_txt(txt_path):
    """Extract question/answer pairs from a UTF-8 text file.

    The text is scanned for segments of the form ``问:<question> 答:<answer>``.
    Each answer runs until the next ``问:`` marker or the end of the text.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        A list of ``{'question': ..., 'answer': ...}`` dicts in file order,
        with surrounding whitespace stripped from both values. The list is
        empty when no pair is found.
    """
    with open(txt_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Regular expression matching Q&A pairs. re.S lets '.' span newlines so
    # multi-line questions and answers are captured.
    # NOTE(review): only the ASCII colon is accepted after 问 / 答; confirm
    # whether the input can also contain the full-width colon.
    pattern = re.compile(r'(问[:](.*?))\s*答[:](.*?)(?=(问[:]|$))', re.S)

    # Group 2 holds the question text and group 3 the answer text.
    # The two dict keys must be distinct: with a repeated key the answer
    # silently overwrites the question and only one value is kept.
    return [
        {'question': m.group(2).strip(), 'answer': m.group(3).strip()}
        for m in pattern.finditer(text)
    ]
def save_as_json(data, json_path):
    """Serialize *data* as JSON into the file at *json_path*.

    The file is opened for writing as UTF-8 (existing content is replaced),
    non-ASCII characters are written as-is rather than escaped, and the
    output is indented by four spaces.
    """
    with open(json_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
def process_folder(input_folder_path, output_folder_path):
    """Convert every ``.txt`` file in a folder into a JSON file of Q&A pairs.

    For each directory entry of ``input_folder_path`` whose name ends with
    ``.txt``, the Q&A pairs are extracted with ``extract_qa_from_txt`` and
    written with ``save_as_json`` to a file of the same base name and a
    ``.json`` extension inside ``output_folder_path``. A confirmation line
    is printed for each file written.

    Args:
        input_folder_path: Folder containing the source ``.txt`` files.
        output_folder_path: Folder that receives the ``.json`` files; it is
            created if it does not exist yet.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    os.makedirs(output_folder_path, exist_ok=True)

    suffix = '.txt'
    for filename in os.listdir(input_folder_path):
        if not filename.endswith(suffix):
            continue
        txt_path = os.path.join(input_folder_path, filename)
        # Swap only the trailing extension: str.replace would also rewrite
        # any '.txt' that occurs earlier in the file name.
        json_filename = filename[:-len(suffix)] + '.json'
        json_path = os.path.join(output_folder_path, json_filename)
        qa_pairs = extract_qa_from_txt(txt_path)
        save_as_json(qa_pairs, json_path)
        print(f"问答对已成功提取并保存到 {json_path}")
# Set your input and output folder paths here, then run the conversion.
input_folder_path = r'E:\Project\PaddleOCR\bilu_txt_dispose\OCR\OCR_txt'
output_folder_path = r'E:\Project\PaddleOCR\bilu_txt_dispose\OCR\txt_json'
process_folder(
    input_folder_path=input_folder_path,
    output_folder_path=output_folder_path,
)