Merge from main, update OCR model and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-17 00:58:25 +00:00 · 2024-10-09 16:04:19 +02:00
parent 0dfbd0b6fc 6924999f1f
commit b5a27386c1
20 changed files with 814 additions and 129 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -53,6 +58,13 @@ class Backend(str, Enum):
    DOCLING = "docling"


+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+
+
 def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
@@ -152,6 +164,9 @@ def convert(
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
+    ocr_engine: Annotated[
+        OcrEngine, typer.Option(..., help="The OCR engine to use.")
+    ] = OcrEngine.EASYOCR,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
@@ -191,8 +206,19 @@ def convert(
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")

+    match ocr_engine:
+        case OcrEngine.EASYOCR:
+            ocr_options = EasyOcrOptions()
+        case OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions()
+        case OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions()
+        case _:
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
+        ocr_options=ocr_options,
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,9 +1,9 @@
 import warnings
 from enum import Enum, auto
 from pathlib import Path
-from typing import Annotated, Optional, Union
+from typing import Annotated, List, Literal, Optional, Union

-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator


 class TableFormerMode(str, Enum):
@@ -21,6 +21,44 @@ class TableStructureOptions(BaseModel):
    mode: TableFormerMode = TableFormerMode.FAST


+class OcrOptions(BaseModel):
+    kind: str
+
+
+class EasyOcrOptions(OcrOptions):
+    kind: Literal["easyocr"] = "easyocr"
+    lang: List[str] = ["fr", "de", "es", "en"]
+    use_gpu: bool = True  # same default as easyocr.Reader
+    model_storage_directory: Optional[str] = None
+    download_enabled: bool = True  # same default as easyocr.Reader
+
+    model_config = ConfigDict(
+        extra="forbid",
+        protected_namespaces=(),
+    )
+
+
+class TesseractCliOcrOptions(OcrOptions):
+    kind: Literal["tesseract"] = "tesseract"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    tesseract_cmd: str = "tesseract"
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
+class TesseractOcrOptions(OcrOptions):
+    kind: Literal["tesserocr"] = "tesserocr"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
 class PipelineOptions(BaseModel): ...


@@ -30,6 +68,9 @@ class PdfPipelineOptions(PipelineOptions):
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+        Field(EasyOcrOptions(), discriminator="kind")
+    )

    keep_page_images: Annotated[
        bool,
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -10,15 +10,15 @@ from rtree import index
 from scipy.ndimage import find_objects, label

 from docling.datamodel.base_models import OcrCell, Page
-from docling.models.abstract_model import AbstractPageModel
+from docling.datamodel.pipeline_options import OcrOptions

 _log = logging.getLogger(__name__)


-class BaseOcrModel(AbstractPageModel):
-    def __init__(self, config):
-        self.config = config
-        self.enabled = config["enabled"]
+class BaseOcrModel:
+    def __init__(self, enabled: bool, options: OcrOptions):
+        self.enabled = enabled
+        self.options = options

    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
    def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -5,21 +5,34 @@ import numpy
 from docling_core.types.experimental import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel

 _log = logging.getLogger(__name__)


 class EasyOcrModel(BaseOcrModel):
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, enabled: bool, options: EasyOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: EasyOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        if self.enabled:
-            import easyocr
+            try:
+                import easyocr
+            except ImportError:
+                raise ImportError(
+                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                )

-            self.reader = easyocr.Reader(config["lang"])
+            self.reader = easyocr.Reader(
+                lang_list=self.options.lang,
+                gpu=self.options.use_gpu,
+                model_storage_directory=self.options.model_storage_directory,
+                download_enabled=self.options.download_enabled,
+            )

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:

@@ -32,6 +45,9 @@ class EasyOcrModel(BaseOcrModel):

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -0,0 +1,168 @@
+import io
+import logging
+import tempfile
+from subprocess import PIPE, Popen
+from typing import Iterable, Tuple
+
+import pandas as pd
+from docling_core.types.experimental import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrCliModel(BaseOcrModel):
+
+    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractCliOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        self._name = None  # cached by _get_name_and_version()
+        self._version = None  # cached by _get_name_and_version()
+
+        if self.enabled:
+            try:
+                # Probing the version also verifies that the binary can be executed.
+                self._get_name_and_version()
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Tesseract is not available, aborting: {exc} "
+                    "Install tesseract on your system and make sure the tesseract binary is discoverable. "
+                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                ) from exc
+
+    def _get_name_and_version(self) -> Tuple[str, str]:
+        """Return the cached (name, version) parsed from `<tesseract_cmd> --version`."""
+        if self._name is not None and self._version is not None:
+            return self._name, self._version
+
+        cmd = [self.options.tesseract_cmd, "--version"]
+
+        # communicate() reads both pipes and waits for exit; no extra wait() is needed.
+        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = proc.communicate()
+
+        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
+        # to stderr, so check both.
+        version_line = (
+            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
+            .split("\n")[0]
+            .strip()
+        )
+
+        # If everything else fails...
+        if not version_line:
+            version_line = "tesseract XXX"
+
+        # Split on the first space only, so extra tokens cannot break the unpacking.
+        name, _, version = version_line.partition(" ")
+
+        self._name = name
+        self._version = version
+
+        return name, version
+
+    def _run_tesseract(self, ifilename: str):
+        """Run Tesseract on an image file; return the TSV rows holding non-empty text."""
+        import csv
+
+        cmd = [self.options.tesseract_cmd]
+
+        if self.options.lang:
+            cmd.append("-l")
+            cmd.append("+".join(self.options.lang))
+        if self.options.path is not None:
+            cmd.append("--tessdata-dir")
+            cmd.append(self.options.path)
+
+        cmd += [ifilename, "stdout", "tsv"]
+        _log.info("command: %s", " ".join(cmd))
+
+        proc = Popen(cmd, stdout=PIPE)
+        output, _ = proc.communicate()
+
+        # Tesseract's TSV is unquoted, so a `"` in the recognized text must stay literal.
+        # The text column is read as str: an all-numeric column would otherwise be
+        # parsed as numbers, which breaks the `.str` accessor used below.
+        df = pd.read_csv(
+            io.StringIO(output.decode("utf-8")),
+            sep="\t",
+            quoting=csv.QUOTE_NONE,
+            dtype={"text": str},
+        )
+
+        # Filter rows that contain actual text (ignore header or empty rows)
+        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
+
+        return df_filtered
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+
+                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
+                    fname = image_file.name
+                    high_res_image.save(fname)
+
+                    df = self._run_tesseract(fname)
+
+                # _log.info(df)
+
+                # Print relevant columns (bounding box and text)
+                for ix, row in df.iterrows():
+                    text = row["text"]
+                    conf = row["conf"]
+
+                    l = float(row["left"])
+                    b = float(row["top"])
+                    w = float(row["width"])
+                    h = float(row["height"])
+
+                    t = b + h
+                    r = l + w
+
+                    cell = OcrCell(
+                        id=ix,
+                        text=text,
+                        confidence=conf / 100.0,
+                        bbox=BoundingBox.from_tuple(
+                            coord=(
+                                (l / self.scale) + ocr_rect.l,
+                                (b / self.scale) + ocr_rect.t,
+                                (r / self.scale) + ocr_rect.l,
+                                (t / self.scale) + ocr_rect.t,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    )
+                    all_ocr_cells.append(cell)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -0,0 +1,123 @@
+import logging
+from typing import Iterable
+
+import numpy
+from docling_core.types.experimental import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrModel(BaseOcrModel):
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+
+        if self.enabled:
+            setup_errmsg = (
+                "tesserocr is not correctly installed. "
+                "Please install it via `pip install tesserocr` to use this OCR engine. "
+                "Note that tesserocr might have to be manually compiled for working with "
+                "your Tesseract installation. The Docling documentation provides examples for it. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            try:
+                import tesserocr
+            except ImportError:
+                raise ImportError(setup_errmsg)
+
+            try:
+                tesseract_version = tesserocr.tesseract_version()
+                _log.debug("Initializing TesserOCR: %s", tesseract_version)
+            except Exception as exc:
+                raise ImportError(setup_errmsg) from exc
+
+            # Initialize the tesseractAPI
+            lang = "+".join(self.options.lang)
+            if self.options.path is not None:
+                self.reader = tesserocr.PyTessBaseAPI(
+                    path=self.options.path,
+                    lang=lang,
+                    psm=tesserocr.PSM.AUTO,
+                    init=True,
+                    oem=tesserocr.OEM.DEFAULT,
+                )
+            else:
+                self.reader = tesserocr.PyTessBaseAPI(
+                    lang=lang,
+                    psm=tesserocr.PSM.AUTO,
+                    init=True,
+                    oem=tesserocr.OEM.DEFAULT,
+                )
+            self.reader_RIL = tesserocr.RIL
+
+    def __del__(self):
+        if self.reader is not None:
+            # Finalize the tesseractAPI
+            self.reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Run tesserocr on each page's OCR rects and extend `page.cells` with the result."""
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+
+                # Retrieve text snippets with their bounding boxes
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
+
+                cells = []
+                for ix, (_, box, _, _) in enumerate(boxes):
+                    # Set the area of interest (image pixels, top-left origin)
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+
+                    # Extract text within the bounding box
+                    text = self.reader.GetUTF8Text().strip()
+                    confidence = self.reader.MeanTextConf()  # 0-100 scale
+                    # Undo the render scale, then shift by the crop's offset on the page
+                    left = box["x"] / self.scale + ocr_rect.l
+                    top = box["y"] / self.scale + ocr_rect.t
+                    right = (box["x"] + box["w"]) / self.scale + ocr_rect.l
+                    bottom = (box["y"] + box["h"]) / self.scale + ocr_rect.t
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence / 100.0,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+                # del high_res_image
+                all_ocr_cells.extend(cells)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@@ -6,13 +6,21 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.page_assemble_model import PageAssembleModel
 from docling.models.page_preprocessing_model import PagePreprocessingModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_model_pipeline import PaginatedModelPipeline

 _log = logging.getLogger(__name__)
@@ -31,16 +39,32 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
        self.artifacts_path = Path(artifacts_path)
        self.glm_model = GlmModel(config={})

+        ocr_model: BaseOcrModel
+        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
+            ocr_model = EasyOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
+            ocr_model = TesseractOcrCliModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
+            ocr_model = TesseractOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        else:
+            raise RuntimeError(
+                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
+            )
+
        self.model_pipe = [
            PagePreprocessingModel(
                config={"images_scale": pipeline_options.images_scale}
            ),
-            EasyOcrModel(
-                config={
-                    "lang": ["fr", "de", "es", "en"],
-                    "enabled": pipeline_options.do_ocr,
-                }
-            ),
+            ocr_model,
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path