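# NOTE: the lines below are residue from the repository web viewer this file
# was copied from; they are not part of the script.
# You cannot select more than 25 topics. Topics must start with a letter or
# number, can include dashes ('-') and can be up to 35 characters long.
# 72 lines | 2.5 KiB | Python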

import os
import subprocess
import re
# Root directory of the input images.
base_image_dir = "E:/Project/PaddleOCR/bilu_txt_dispose/Crop_img"

# Directory that receives the OCR result text files.
output_dir = "E:/Project/PaddleOCR/bilu_txt_dispose/OCR/OCR_txt"

# Make sure the output directory exists; an already existing one is fine.
os.makedirs(output_dir, exist_ok=True)
# Matches ANSI escape sequences: either a two-character ESC sequence, or a
# complete CSI sequence (ESC '[' + parameter bytes + intermediate bytes + one
# final byte). Anchoring on the CSI grammar instead of "everything up to the
# next 'm'" keeps non-colour sequences such as ESC[2K from swallowing the
# ordinary text that follows them.
ansi_escape = re.compile(r'\x1b(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')


def remove_ansi_escape_sequences(text):
    """Return *text* with ANSI escape sequences and newlines removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every match of ``ansi_escape`` and every
        ``'\\n'`` character has been deleted. Newlines are dropped without a
        replacement, so multi-line input is joined into a single line.
    """
    # Strip escape sequences first, then collapse the text onto one line.
    return ansi_escape.sub('', text).replace('\n', '')
# Environment for the OCR child processes. The parent decodes the child's
# output as UTF-8 (see subprocess.run below), so the child interpreter is told
# to encode its standard streams as UTF-8 too; without this a child running
# under a non-UTF-8 locale can emit bytes the parent cannot decode.
_child_env = {**os.environ, 'PYTHONIOENCODING': 'utf-8'}

# Process every sub-folder of base_image_dir; each one produces one text file
# named after the folder. Listings are sorted because os.listdir() returns
# entries in arbitrary order, which would make the output order unstable.
for foldername in sorted(os.listdir(base_image_dir)):
    folder_path = os.path.join(base_image_dir, foldername)
    if not os.path.isdir(folder_path):
        continue

    # Full paths of all regular files directly inside the sub-folder.
    image_files = [
        os.path.join(folder_path, f)
        for f in sorted(os.listdir(folder_path))
        if os.path.isfile(os.path.join(folder_path, f))
    ]

    # One result file per sub-folder, one line per image that produced output.
    result_file_path = os.path.join(output_dir, f'{foldername}.txt')
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        for image_file in image_files:
            image_name = os.path.basename(image_file)

            # Command line for the inference script (one image per run).
            command = [
                'python', 'tools/infer/predict_system_1.py',
                '--use_gpu=False',
                '--cls_model_dir=./models/cls',
                '--rec_model_dir=./models/rec',
                '--det_model_dir=./models/det',
                f'--image_dir={image_file}',
            ]

            print(f"Processing {image_name} in {foldername}...")
            # Only the subprocess call is guarded: it is the step that decodes
            # the child's output and can therefore raise UnicodeDecodeError.
            try:
                result = subprocess.run(
                    command,
                    capture_output=True,
                    text=True,
                    encoding='utf-8',
                    env=_child_env,
                )
            except UnicodeDecodeError as e:
                print(
                    f"UnicodeDecodeError occurred while processing {image_name} in {foldername}: {e}")
                continue  # Skip this file and continue with the next one

            # Clean the captured stdout and store it as a single line.
            result_output = remove_ansi_escape_sequences(result.stdout.strip())
            if result_output:
                result_file.write(f"{result_output}\n")
            # Anything the child wrote to stderr is echoed to the console.
            if result.stderr:
                print(f"Error processing {image_name} in {foldername}: {result.stderr}")

    print(f"Results for folder {foldername} saved to {result_file_path}")

print("All folders processed.")