|
|
import os
|
|
|
import re
|
|
|
import json
|
|
|
|
|
|
|
|
|
def extract_qa_from_txt(txt_path):
    """Extract question/answer pairs from a UTF-8 text file.

    A pair is a question introduced by the marker '问' followed by a colon,
    then an answer introduced by '答' followed by a colon.  Either the ASCII
    colon ':' or the full-width colon (U+FF1A) is accepted after a marker.
    An answer runs until the next question marker or the end of the text.

    Args:
        txt_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of dicts, one per pair in file order, each with the keys
        '问' (question) and '答' (answer); surrounding whitespace is stripped
        from both values.  The list is empty when nothing matches.
    """
    with open(txt_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # re.S lets '.' span newlines so questions and answers may be multi-line.
    # The full-width colon is written as an escape so it cannot be silently
    # collapsed into a second ASCII colon by an editor or encoding round trip.
    pattern = re.compile(
        r'问[:\uff1a](.*?)\s*答[:\uff1a](.*?)(?=问[:\uff1a]|$)',
        re.S,
    )

    return [
        {'问': match.group(1).strip(), '答': match.group(2).strip()}
        for match in pattern.finditer(text)
    ]
|
|
|
|
|
|
|
|
|
def save_as_json(data, json_path):
    """Write *data* to *json_path* as indented UTF-8 JSON.

    Args:
        data: Any JSON-serializable object.
        json_path: Destination file path; an existing file is overwritten.
    """
    with open(json_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
        # the output instead of emitting \uXXXX escapes.
        json.dump(data, f, ensure_ascii=False, indent=4)
|
|
|
|
|
|
|
|
|
def process_folder(input_folder_path, output_folder_path):
    """Convert every '.txt' file in a folder into a JSON file of Q&A pairs.

    Each entry of *input_folder_path* whose name ends with '.txt' is parsed
    with extract_qa_from_txt and the result is written by save_as_json to
    *output_folder_path* under the same base name with a '.json' extension.
    Only the top level of the input folder is scanned.

    Args:
        input_folder_path: Folder containing the source '.txt' files.
        output_folder_path: Folder receiving the '.json' files; created
            (including parents) when it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder_path, exist_ok=True)

    suffix = '.txt'
    for filename in os.listdir(input_folder_path):
        if not filename.endswith(suffix):
            continue

        txt_path = os.path.join(input_folder_path, filename)
        # Swap only the trailing extension: str.replace would also rewrite a
        # '.txt' that appears earlier in the name (e.g. 'a.txt.bak.txt').
        json_filename = filename[:-len(suffix)] + '.json'
        json_path = os.path.join(output_folder_path, json_filename)

        qa_pairs = extract_qa_from_txt(txt_path)
        save_as_json(qa_pairs, json_path)

        print(f"问答对已成功提取并保存到 {json_path}")
|
|
|
|
|
|
|
|
|
# Set your input and output folder paths here.
# NOTE: these statements run at module level, so the conversion starts as
# soon as this file is executed or imported.
input_folder_path = r'E:\Project\PaddleOCR\bilu_txt_dispose\OCR\OCR_txt'
output_folder_path = r'E:\Project\PaddleOCR\bilu_txt_dispose\OCR\txt_json'

process_folder(input_folder_path, output_folder_path)
|