"""Extract question/answer pairs from text files and save them as JSON.

Each ``.txt`` file in an input folder is scanned for "问:... 答:..." pairs and
the pairs found are written to a same-named ``.json`` file in an output folder.
"""
import json
import os
import re

# Matches one question/answer pair.
# - The marker may be followed by an ASCII colon or a full-width colon
#   (U+FF1A, written as an escape so the source stays encoding-proof).
# - re.S lets "." cross newlines, so multi-line questions/answers are captured.
# - The trailing lookahead ends an answer at the next question marker or at the
#   end of the text without consuming that marker.
_QA_PATTERN = re.compile(
    r'问[:\uff1a](.*?)\s*答[:\uff1a](.*?)(?=问[:\uff1a]|$)',
    re.S,
)


def extract_qa_from_txt(txt_path):
    """Return the question/answer pairs found in a UTF-8 text file.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        A list of dicts, one per pair in order of appearance, each of the form
        ``{'问': question, '答': answer}`` with surrounding whitespace
        stripped. The list is empty when the file contains no pair.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(txt_path, 'r', encoding='utf-8') as file:
        text = file.read()

    return [
        {'问': match.group(1).strip(), '答': match.group(2).strip()}
        for match in _QA_PATTERN.finditer(text)
    ]


def save_as_json(data, json_path):
    """Write *data* to *json_path* as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) so the
    Chinese text stays human-readable in the output file.

    Args:
        data: Any JSON-serialisable object.
        json_path: Destination file path; an existing file is overwritten.
    """
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def process_folder(input_folder_path, output_folder_path):
    """Convert every ``.txt`` file in a folder into a JSON file of Q&A pairs.

    The output folder is created if it does not exist. Only regular files
    directly inside *input_folder_path* whose names end with ``.txt`` are
    processed; sub-folders are not descended into.

    Args:
        input_folder_path: Folder containing the source ``.txt`` files.
        output_folder_path: Folder that receives the ``.json`` files.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder_path, exist_ok=True)

    for filename in os.listdir(input_folder_path):
        if not filename.endswith('.txt'):
            continue

        txt_path = os.path.join(input_folder_path, filename)
        # A directory whose name happens to end in ".txt" cannot be opened.
        if not os.path.isfile(txt_path):
            continue

        # Swap only the final extension; str.replace would also rewrite any
        # earlier ".txt" occurring inside the file name.
        json_filename = os.path.splitext(filename)[0] + '.json'
        json_path = os.path.join(output_folder_path, json_filename)

        qa_pairs = extract_qa_from_txt(txt_path)
        save_as_json(qa_pairs, json_path)
        print(f"问答对已成功提取并保存到 {json_path}")


# Input and output folder paths; set these for your environment.
input_folder_path = r'E:\Project\PaddleOCR\bilu_txt_dispose\OCR\OCR_txt'
output_folder_path = r'E:\Project\PaddleOCR\bilu_txt_dispose\OCR\txt_json'

# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse the functions) does not touch the file system.
if __name__ == '__main__':
    process_folder(input_folder_path, output_folder_path)