add examples for switching OCR engine and CLI support

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-02 16:57:48 +02:00 · 2024-10-02 16:57:48 +02:00 · 0b76211eed
commit 0b76211eed
parent 8d1c1d6dd5
6 changed files with 99 additions and 37 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PipelineOptions,
    TesseractOcrOptions,
    TesserOcrOptions,
 )
 from docling.document_converter import DocumentConverter
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -53,6 +58,13 @@ class Backend(str, Enum):
    DOCLING = "docling"
 # Define an enum for the ocr engines
 class OcrEngine(str, Enum):
    """OCR engines that can be selected through the ``ocr_engine`` CLI option.

    The enum mixes in ``str``, so every member compares equal to its
    lowercase string value.
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    TESSEROCR = "tesserocr"
 def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
@ -152,6 +164,9 @@ def convert(
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
@ -191,8 +206,19 @@ def convert(
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")
    match ocr_engine:
        case OcrEngine.EASYOCR:
            ocr_options = EasyOcrOptions()
        case OcrEngine.TESSERACT:
            ocr_options = TesseractOcrOptions()
        case OcrEngine.TESSEROCR:
            ocr_options = TesserOcrOptions()
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")
    pipeline_options = PipelineOptions(
        do_ocr=ocr,
        ocr_options=ocr_options,
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -32,11 +32,15 @@ class TesseractOcrOptions(OcrOptions):
    kind: Literal["tesseract"] = "tesseract"
 class TesserOcrOptions(OcrOptions):
    """OCR options variant identified by the ``"tesserocr"`` kind tag."""

    # ``kind`` is the discriminator of the ``ocr_options`` union, so its
    # Literal must admit the default value and must not reuse the
    # "tesseract" tag already claimed by TesseractOcrOptions.
    kind: Literal["tesserocr"] = "tesserocr"
 class PipelineOptions(BaseModel):
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field(
+    ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field(
        EasyOcrOptions(), discriminator="kind"
    )
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@ -1,7 +1,7 @@
 import logging
-from typing import Iterable
+from subprocess import PIPE, Popen
 from typing import Iterable, Tuple
 import numpy
 import pandas as pd
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
@ -10,7 +10,8 @@ from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
-class TesseractModel(BaseOcrModel):
+
 class TesseractOcrModel(BaseOcrModel):
    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        super().__init__(enabled=enabled, options=options)
@ -23,15 +24,15 @@ class TesseractModel(BaseOcrModel):
                self._get_name_and_version()
            except Exception as exc:
-                _log.error(f"Tesseract is not supported, aborting ...")
+                _log.error(f"Tesseract is not available, aborting ...")
                self.enabled = False
    def _get_name_and_version(self) -> Tuple[str, str]:
-        if self._name!=None and self._version!=None:
+        if self._name != None and self._version != None:
            return self._name, self._version
-        cmd = ['tesseract', '--version']
+        cmd = ["tesseract", "--version"]
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()
@ -40,15 +41,19 @@ class TesseractModel(BaseOcrModel):
        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
-        version_line = (stdout.decode('utf8').strip() or stderr.decode('utf8').strip()).split('\n')[0].strip()
+        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )
        # If everything else fails...
        if not version_line:
-            version_line = 'tesseract XXX'
+            version_line = "tesseract XXX"
-        name, version = version_line.split(' ')
+        name, version = version_line.split(" ")
-        self._name    = name
+        self._name = name
        self._version = version
        return name, version
@ -58,26 +63,25 @@ class TesseractModel(BaseOcrModel):
        cmd = ["tesseract"]
        if languages:
-            cmd += ['-l', '+'.join(languages)]
+            cmd += ["-l", "+".join(languages)]
-        cmd += [ifilename, 'stdout', "tsv"]
+        cmd += [ifilename, "stdout", "tsv"]
-        logger.info("command: {}".format(" ".join(cmd)))
+        _log.info("command: {}".format(" ".join(cmd)))
        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()
        # Read the TSV file generated by Tesseract
-        df = pd.read_csv('output_file_name.tsv', sep='\t')
+        df = pd.read_csv("output_file_name.tsv", sep="\t")
        # Display the dataframe (optional)
        print(df.head())
        # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[df['text'].notnull() & (df['text'].str.strip() != '')]
+        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
        return df_filtered
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        if not self.enabled:
@ -111,13 +115,13 @@ class TesseractModel(BaseOcrModel):
                    text = row["text"]
                    conf = row["confidence"]
-                    l = float(row['left'])
+                    l = float(row["left"])
-                    t = float(row['top'])
+                    t = float(row["top"])
-                    w = float(row['width'])
+                    w = float(row["width"])
-                    h = float(row['height'])
+                    h = float(row["height"])
-                    b = t-h
+                    b = t - h
-                    r = l+w
+                    r = l + w
                    cell = OcrCell(
                        id=ix,
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@ -4,11 +4,13 @@ from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PipelineOptions,
    TesseractOcrOptions,
    TesserOcrOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_model import TesseractOcrModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
@ -26,6 +28,11 @@ class StandardModelPipeline(BaseModelPipeline):
                options=pipeline_options.ocr_options,
            )
        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
            ocr_model = TesseractOcrModel(
                enabled=pipeline_options.do_ocr,
                options=pipeline_options.ocr_options,
            )
        elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
            raise NotImplemented()
            # TODO
            # ocr_model = TesseractOcrModel(
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -8,6 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions
 from docling.document_converter import DocumentConverter
 _log = logging.getLogger(__name__)
@ -115,6 +116,27 @@ def main():
    #     pdf_backend=DoclingParseDocumentBackend,
    # )
    # Docling Parse with Tesseract OCR
    # ----------------------
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options = TesseractOcrOptions()
    # Docling Parse with TesserOCR
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=True
    # pipeline_options.do_table_structure=True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesserOcrOptions()
    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
    ###########################################################################
    # Define input files
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -12,8 +12,7 @@ GENERATE = False
 # Debug
 def save_output(pdf_path: Path, doc_result: ConversionResult):
-    r"""
+    r""" """
    """
    import json
    import os