From 0b76211eedafa354c3cbb67ee10aeedca0d72319 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <dol@zurich.ibm.com>
Date: Wed, 2 Oct 2024 16:57:48 +0200
Subject: [PATCH] add examples for switching OCR engine and CLI support

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/cli/main.py                         | 28 ++++++++-
 docling/datamodel/pipeline_options.py       |  6 +-
 docling/models/tesseract_model.py           | 70 +++++++++++----------
 docling/pipeline/standard_model_pipeline.py |  7 +++
 examples/custom_convert.py                  | 22 +++++++
 tests/test_e2e_ocr_conversion.py            |  3 +-
 6 files changed, 99 insertions(+), 37 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index b942d519..6459b414 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PipelineOptions,
+    TesseractOcrOptions,
+    TesserOcrOptions,
+)
 from docling.document_converter import DocumentConverter
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -53,6 +58,13 @@ class Backend(str, Enum):
     DOCLING = "docling"
 
 
+# OCR engines that can be selected through the CLI `ocr_engine` option
+class OcrEngine(str, Enum):
+    EASYOCR = "easyocr"
+    TESSERACT = "tesseract"
+    TESSEROCR = "tesserocr"
+
+
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -152,6 +164,9 @@ def convert(
     backend: Annotated[
         Backend, typer.Option(..., help="The PDF backend to use.")
     ] = Backend.DOCLING,
+    ocr_engine: Annotated[
+        OcrEngine, typer.Option(..., help="The OCR engine to use.")
+    ] = OcrEngine.EASYOCR,
     output: Annotated[
         Path, typer.Option(..., help="Output directory where results are saved.")
     ] = Path("."),
@@ -191,8 +206,19 @@ def convert(
         case _:
             raise RuntimeError(f"Unexpected backend type {backend}")
 
+    match ocr_engine:  # map the CLI choice to its OCR options model
+        case OcrEngine.EASYOCR:
+            ocr_options = EasyOcrOptions()
+        case OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions()
+        case OcrEngine.TESSEROCR:
+            ocr_options = TesserOcrOptions()
+        case _:
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
     pipeline_options = PipelineOptions(
         do_ocr=ocr,
+        ocr_options=ocr_options,
         do_table_structure=True,
     )
     pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 6742c412..bc30634d 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -32,11 +32,15 @@ class TesseractOcrOptions(OcrOptions):
     kind: Literal["tesseract"] = "tesseract"
 
 
+class TesserOcrOptions(OcrOptions):
+    kind: Literal["tesserocr"] = "tesserocr"  # tag for the ocr_options union
+
+
 class PipelineOptions(BaseModel):
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field(
+    ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field(
         EasyOcrOptions(), discriminator="kind"
     )
diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py
index 59ec9504..2f1fd4ee 100644
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@@ -1,7 +1,7 @@
 import logging
-from typing import Iterable
+from subprocess import PIPE, Popen
+from typing import Iterable, Tuple
 
-import numpy
 import pandas as pd
 
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
@@ -10,7 +10,8 @@ from docling.models.base_ocr_model import BaseOcrModel
 
 _log = logging.getLogger(__name__)
 
-class TesseractModel(BaseOcrModel):
+
+class TesseractOcrModel(BaseOcrModel):
 
     def __init__(self, enabled: bool, options: TesseractOcrOptions):
         super().__init__(enabled=enabled, options=options)
@@ -21,34 +22,38 @@ class TesseractModel(BaseOcrModel):
         if self.enabled:
             try:
                 self._get_name_and_version()
-                
+
             except Exception as exc:
-                _log.error(f"Tesseract is not supported, aborting ...")
+                _log.error(f"Tesseract is not available, aborting ...")
                 self.enabled = False
-                
+
     def _get_name_and_version(self) -> Tuple[str, str]:
 
-        if self._name!=None and self._version!=None:
+        if self._name is not None and self._version is not None:  # cached
             return self._name, self._version
 
-        cmd = ['tesseract', '--version']
+        cmd = ["tesseract", "--version"]
 
         proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
         stdout, stderr = proc.communicate()
 
         proc.wait()
 
-        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions 
+        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
         # to stderr, so check both.
-        version_line = (stdout.decode('utf8').strip() or stderr.decode('utf8').strip()).split('\n')[0].strip()
+        version_line = (
+            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
+            .split("\n")[0]
+            .strip()
+        )
 
         # If everything else fails...
         if not version_line:
-            version_line = 'tesseract XXX'
+            version_line = "tesseract XXX"
 
-        name, version = version_line.split(' ')
+        name, version = version_line.split(" ")
 
-        self._name    = name
+        self._name = name
         self._version = version
 
         return name, version
@@ -58,26 +63,25 @@ class TesseractModel(BaseOcrModel):
         cmd = ["tesseract"]
 
         if languages:
-            cmd += ['-l', '+'.join(languages)]
+            cmd += ["-l", "+".join(languages)]
 
-        cmd += [ifilename, 'stdout', "tsv"]
-        logger.info("command: {}".format(" ".join(cmd)))
+        cmd += [ifilename, "stdout", "tsv"]
+        _log.info("command: {}".format(" ".join(cmd)))
 
         proc = Popen(cmd, stdout=PIPE)
         output, _ = proc.communicate()
 
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv('output_file_name.tsv', sep='\t')
+        df = pd.read_csv("output_file_name.tsv", sep="\t")
 
         # Display the dataframe (optional)
         print(df.head())
-        
+
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[df['text'].notnull() & (df['text'].str.strip() != '')]
+        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
 
         return df_filtered
-        
-    
+
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
 
         if not self.enabled:
@@ -93,7 +97,7 @@ class TesseractModel(BaseOcrModel):
                     scale=self.scale, cropbox=ocr_rect
                 )
                 print(high_res_image)
-                
+
                 # FIXME: do we really need to save the image to a file
                 fname = "temporary-file.png"
                 high_res_image.save(fname)
@@ -103,22 +107,22 @@ class TesseractModel(BaseOcrModel):
                     os.remove(fname)
                 else:
                     _log.error(f"no image file: {fname}")
-                
+
                 # Print relevant columns (bounding box and text)
                 for index, row in df_filtered.iterrows():
                     print(row)
-                    
+
                     text = row["text"]
                     conf = row["confidence"]
-                    
-                    l = float(row['left'])
-                    t = float(row['top'])
-                    w = float(row['width'])
-                    h = float(row['height'])
 
-                    b = t-h
-                    r = l+w
-                    
+                    l = float(row["left"])
+                    t = float(row["top"])
+                    w = float(row["width"])
+                    h = float(row["height"])
+
+                    b = t - h
+                    r = l + w
+
                     cell = OcrCell(
                         id=ix,
                         text=text,
@@ -134,7 +138,7 @@ class TesseractModel(BaseOcrModel):
                         ),
                     )
                     all_ocr_cells.append(cell)
-                
+
             ## Remove OCR cells which overlap with programmatic cells.
             filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
 
diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py
index 4f3d0214..df64fb8a 100644
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@@ -4,11 +4,13 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     PipelineOptions,
     TesseractOcrOptions,
+    TesserOcrOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.tesseract_model import TesseractOcrModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
 
 
@@ -26,6 +28,11 @@ class StandardModelPipeline(BaseModelPipeline):
                 options=pipeline_options.ocr_options,
             )
         elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
+            ocr_model = TesseractOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
             raise NotImplemented()
             # TODO
             # ocr_model = TesseractOcrModel(
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
index 9c84d8dd..70b07dd2 100644
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@@ -8,6 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions
 from docling.document_converter import DocumentConverter
 
 _log = logging.getLogger(__name__)
@@ -115,6 +116,27 @@ def main():
     #     pdf_backend=DoclingParseDocumentBackend,
     # )
 
+    # Docling Parse with Tesseract OCR
+    # ----------------------
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+    pipeline_options.ocr_options = TesseractOcrOptions()
+
+    # Docling Parse with TesserOCR
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr=True
+    # pipeline_options.do_table_structure=True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+    # pipeline_options.ocr_options = TesserOcrOptions()
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
     ###########################################################################
 
     # Define input files
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 870e270b..aea55651 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -12,8 +12,7 @@ GENERATE = False
 
 # Debug
 def save_output(pdf_path: Path, doc_result: ConversionResult):
-    r"""
-    """
+    r""" """
     import json
     import os