You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
29 lines
1.0 KiB
Python
29 lines
1.0 KiB
Python
import os
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from tqdm import tqdm
|
|
import traceback
|
|
|
|
from surya.input.processing import slice_bboxes_from_image
|
|
from surya.recognition import RecognitionPredictor
|
|
|
|
def textract_ocr(extractor, img):
    """Run Textract text detection on a single image.

    Args:
        extractor: Object exposing ``detect_document_text(file_source=...)``
            whose result has a ``lines`` iterable of objects with a ``text``
            attribute (presumably a ``textractor.Textractor`` client).
        img: The image handed to ``detect_document_text`` as ``file_source``.

    Returns:
        A list with the ``text`` of every detected line, or ``[None]`` when
        the call failed; the traceback is printed in that case.
    """
    try:
        document = extractor.detect_document_text(file_source=img)
        return [line.text for line in document.lines]
    except Exception:
        # Best-effort: a failure on one image must not abort the whole batch.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate so the run can still be cancelled.
        traceback.print_exc()
        return [None]
|
|
|
|
def textract_ocr_parallel(imgs, cpus=None):
    """Run Textract OCR over many images using a thread pool.

    Args:
        imgs: Sized sequence of images; each one is passed to
            ``textract_ocr`` together with a shared Textractor client.
        cpus: Upper bound on the number of worker threads. When falsy,
            ``os.cpu_count()`` is used instead.

    Returns:
        A list with one entry per image, in input order, each being the
        value returned by ``textract_ocr`` for that image. An empty ``imgs``
        yields an empty list.
    """
    num_images = len(imgs)
    if num_images == 0:
        # ThreadPoolExecutor rejects max_workers=0, so short-circuit before
        # touching the optional dependency or creating a client.
        return []

    from textractor import Textractor  # Optional dependency

    extractor = Textractor(profile_name='default')
    # Worker count is capped by the image count, the recognition batch size
    # and the CPU budget.
    parallel_cores = min(num_images, RecognitionPredictor().get_batch_size())
    if not cpus:
        # os.cpu_count() returns None when the count cannot be determined.
        cpus = os.cpu_count() or 1
    # Never hand the executor a non-positive worker count.
    parallel_cores = max(1, min(parallel_cores, cpus))

    with ThreadPoolExecutor(max_workers=parallel_cores) as executor:
        # executor.map yields results in the same order as ``imgs``.
        results = executor.map(textract_ocr, [extractor] * num_images, imgs)
        progress = tqdm(results, total=num_images, desc="Running textract OCR")
        textract_text = list(progress)
    return textract_text