diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index cc48a461..22333cf0 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -32,8 +32,10 @@ class TesseractOcrOptions(OcrOptions):
     kind: Literal["tesseract"] = "tesseract"
     lang: List[str] = ["fr", "de", "es", "en"]
 
+
 class TesserOcrOptions(OcrOptions):
     kind: Literal["tesserocr"] = "tesserocr"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
 
 
 class PipelineOptions(BaseModel):
diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py
index e4e8a73a..3b6fa04c 100644
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@@ -1,7 +1,6 @@
-import logging
 import io
+import logging
 import os
-
 from subprocess import PIPE, Popen
 from typing import Iterable, Tuple
 
@@ -13,6 +12,7 @@ from docling.models.base_ocr_model import BaseOcrModel
 
 _log = logging.getLogger(__name__)
 
+
 class TesseractOcrModel(BaseOcrModel):
     def __init__(self, enabled: bool, options: TesseractOcrOptions):
 
@@ -23,7 +23,7 @@ class TesseractOcrModel(BaseOcrModel):
 
         self._name = None
         self._version = None
-        
+
         if self.enabled:
             try:
                 self._get_name_and_version()
@@ -76,12 +76,12 @@ class TesseractOcrModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE)
         output, _ = proc.communicate()
 
-        #_log.info(output)
-        
+        # _log.info(output)
+
         # Decode the byte string to a regular string
-        decoded_data = output.decode('utf-8')
+        decoded_data = output.decode("utf-8")
         # _log.info(decoded_data)
-        
+
         # Read the TSV file generated by Tesseract
         df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
 
@@ -112,7 +112,7 @@ class TesseractOcrModel(BaseOcrModel):
                 fname = "temporary-file.png"
                 high_res_image.save(fname)
 
-                df=None
+                df = None
                 if os.path.exists(fname):
                     df = self._run_tesseract(fname)
                     os.remove(fname)
@@ -121,7 +121,7 @@ class TesseractOcrModel(BaseOcrModel):
                     continue
 
                 # _log.info(df)
-                
+
                 # Print relevant columns (bounding box and text)
                 for ix, row in df.iterrows():
                     text = row["text"]
@@ -138,7 +138,7 @@ class TesseractOcrModel(BaseOcrModel):
                     cell = OcrCell(
                         id=ix,
                         text=text,
-                        confidence=conf/100.,
+                        confidence=conf / 100.0,
                         bbox=BoundingBox.from_tuple(
                             coord=(
                                 (l / self.scale) + ocr_rect.l,
diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py
index 01f1e71f..97078184 100644
--- a/docling/models/tesserocr_model.py
+++ b/docling/models/tesserocr_model.py
@@ -2,6 +2,8 @@ import logging
 from typing import Iterable
 
 import numpy
+import tesserocr
+from tesserocr import OEM, PSM, RIL, PyTessBaseAPI
 
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractOcrOptions
@@ -16,11 +18,21 @@ class TesserOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
 
         if self.enabled:
-            import tesserocr
+            # Initialize the tesseractAPI
+            lang = "+".join(self.options.lang)
+            _log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
+            self.reader = PyTessBaseAPI(
+                lang=lang, psm=PSM.AUTO, init=True, oem=OEM.DEFAULT
+            )
 
-            self.reader = easyocr.Reader(lang_list=self.options.lang)
+    def __del__(self):
+        # Finalize the tesseractAPI; getattr guards a partially built instance
+        if getattr(self, "reader", None) is not None:
+            _log.debug("Finalize TesserOCR")
+            self.reader.End()
 
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
 
@@ -36,29 +48,38 @@ class TesserOcrModel(BaseOcrModel):
                 high_res_image = page._backend.get_page_image(
                     scale=self.scale, cropbox=ocr_rect
                 )
-                im = numpy.array(high_res_image)
-                result = self.reader.readtext(im)
 
-                del high_res_image
-                del im
+                # Retrieve text snippets with their bounding boxes
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(RIL.TEXTLINE, True)
 
-                cells = [
-                    OcrCell(
-                        id=ix,
-                        text=line[1],
-                        confidence=line[2],
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (line[0][0][0] / self.scale) + ocr_rect.l,
-                                (line[0][0][1] / self.scale) + ocr_rect.t,
-                                (line[0][2][0] / self.scale) + ocr_rect.l,
-                                (line[0][2][1] / self.scale) + ocr_rect.t,
+                cells = []
+                for ix, (_, box, _, _) in enumerate(boxes):
+                    # Set the area of interest (x, y is the box's top-left corner)
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+
+                    # Extract text; MeanTextConf() is 0-100, cells store 0-1
+                    text = self.reader.GetUTF8Text().strip()
+                    confidence = self.reader.MeanTextConf() / 100.0
+                    left = (box["x"] / self.scale) + ocr_rect.l
+                    top = (box["y"] / self.scale) + ocr_rect.t
+                    right = ((box["x"] + box["w"]) / self.scale) + ocr_rect.l
+                    bottom = ((box["y"] + box["h"]) / self.scale) + ocr_rect.t
+
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                # l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
                             ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
+                        )
                     )
-                    for ix, line in enumerate(result)
-                ]
+
+                # del high_res_image
                 all_ocr_cells.extend(cells)
 
             ## Remove OCR cells which overlap with programmatic cells.
diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py
index df64fb8a..3bf93edc 100644
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@@ -11,6 +11,7 @@ from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_model import TesseractOcrModel
+from docling.models.tesserocr_model import TesserOcrModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
 
 
@@ -33,12 +34,10 @@ class StandardModelPipeline(BaseModelPipeline):
                 options=pipeline_options.ocr_options,
             )
         elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
-            raise NotImplemented()
-            # TODO
-            # ocr_model = TesseractOcrModel(
-            #     enabled=pipeline_options.do_ocr,
-            #     options=pipeline_options.ocr_options,
-            # )
+            ocr_model = TesserOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
         else:
             raise RuntimeError(
                 f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 37b72963..db1214bc 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -1,80 +1,63 @@
 from pathlib import Path
 
+from pydantic import Field
+
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions, TesserOcrOptions
 from docling.document_converter import DocumentConverter
 
 from .verify_utils import verify_conversion_result
 
+# from tests.verify_utils import verify_conversion_result
+
+
 GENERATE = False
 
 
 # Debug
-def save_output(pdf_path: Path, doc_result: ConversionResult):
+def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
     r""" """
     import json
     import os
 
     parent = pdf_path.parent
 
-    dict_fn = os.path.join(parent, f"{pdf_path.stem}.json")
+    dict_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.json")
     with open(dict_fn, "w") as fd:
         json.dump(doc_result.render_as_dict(), fd)
 
-    pages_fn = os.path.join(parent, f"{pdf_path.stem}.pages.json")
+    pages_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.pages.json")
     pages = [p.model_dump() for p in doc_result.pages]
     with open(pages_fn, "w") as fd:
         json.dump(pages, fd)
 
-    doctags_fn = os.path.join(parent, f"{pdf_path.stem}.doctags.txt")
+    doctags_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.doctags.txt")
     with open(doctags_fn, "w") as fd:
         fd.write(doc_result.render_as_doctags())
 
-    md_fn = os.path.join(parent, f"{pdf_path.stem}.md")
+    md_fn = os.path.join(parent, f"{pdf_path.stem}.{engine}.md")
     with open(md_fn, "w") as fd:
         fd.write(doc_result.render_as_markdown())
 
 
 def get_pdf_paths():
-    # TODO: Debug
     # Define the directory you want to search
-    # directory = Path("./tests/data")
-    directory = Path("./tests/data/scanned")
+    directory = Path("./tests/data_scanned")
 
     # List all PDF files in the directory and its subdirectories
     pdf_files = sorted(directory.rglob("*.pdf"))
     return pdf_files
 
 
-def get_easyocr_converter():
-
-    ocr_options = EasyOcrOptions(
-
-    )
-
+def get_converter(engine: str):
+    """Build a converter with OCR enabled; "tesserocr" selects TesserOcrOptions."""
     pipeline_options = PipelineOptions()
-    # Debug
-    pipeline_options.do_ocr = True
-    pipeline_options.do_table_structure = True
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-
-
-    converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
-
-    return converter
-
-def get_tesseract_converter():
-
-    pipeline_options = PipelineOptions()
-    # Debug
     pipeline_options.do_ocr = True
     pipeline_options.do_table_structure = True
     pipeline_options.table_structure_options.do_cell_matching = True
+    if engine == "tesserocr":
+        pipeline_options.ocr_options = TesserOcrOptions()
 
     converter = DocumentConverter(
         pipeline_options=pipeline_options,
@@ -84,18 +67,29 @@ def get_tesseract_converter():
     return converter
 
 
-
 def test_e2e_conversions():
 
     pdf_paths = get_pdf_paths()
-    converter = get_converter()
 
-    for pdf_path in pdf_paths:
-        print(f"converting {pdf_path}")
+    for engine in ["easyocr", "tesserocr"]:
+        print(f"Converting with ocr_engine: {engine}")
+        converter = get_converter(engine)
+        for pdf_path in pdf_paths:
+            print(f"converting {pdf_path}")
 
-        doc_result: ConversionResult = converter.convert_single(pdf_path)
+            doc_result: ConversionResult = converter.convert_single(pdf_path)
 
-        # Debug
-        verify_conversion_result(
-            input_path=pdf_path, doc_result=doc_result, generate=GENERATE
-        )
+            # # Save conversions
+            # save_output(pdf_path, doc_result, engine)
+
+            # Debug
+            verify_conversion_result(
+                input_path=pdf_path,
+                doc_result=doc_result,
+                generate=GENERATE,
+                ocr_engine=engine,
+            )
+
+
+# if __name__ == "__main__":
+#     test_e2e_conversions()
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index f75ed614..36bd6f1e 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -127,7 +127,10 @@ def verify_dt(doc_pred_dt, doc_true_dt):
 
 
 def verify_conversion_result(
-    input_path: Path, doc_result: ConversionResult, generate=False
+    input_path: Path,
+    doc_result: ConversionResult,
+    generate=False,
+    ocr_engine=None,
 ):
     PageList = TypeAdapter(List[Page])
 
@@ -140,10 +143,16 @@ def verify_conversion_result(
     doc_pred_md = doc_result.render_as_markdown()
     doc_pred_dt = doc_result.render_as_doctags()
 
-    pages_path = input_path.with_suffix(".pages.json")
-    json_path = input_path.with_suffix(".json")
-    md_path = input_path.with_suffix(".md")
-    dt_path = input_path.with_suffix(".doctags.txt")
+    # Ground-truth files sit next to the input document. When an OCR engine
+    # is given, its name is inserted before the extension, e.g.
+    # "<stem>.<engine>.pages.json"; with ocr_engine=None the plain
+    # "<stem>.pages.json" naming is used.
+
+    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
+    pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
+    json_path = input_path.with_suffix(f"{engine_suffix}.json")
+    md_path = input_path.with_suffix(f"{engine_suffix}.md")
+    dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
 
     if generate:  # only used when re-generating truth
         with open(pages_path, "w") as fw: