chore(OCR): Rename class names to use Tesseract for the tesserocr and TesseractCLI for the tesseract process

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-08 14:44:23 +02:00 · 2024-10-08 14:44:23 +02:00 · 70a8a2cc82
commit 70a8a2cc82
parent 074acd703c
8 changed files with 261 additions and 261 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -17,8 +17,8 @@ from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PipelineOptions,
-    TesseractOcrOptions,
+    TesseractCLIOptions,
-    TesserOcrOptions,
+    TesseractOptions,
 )
 from docling.document_converter import DocumentConverter
@ -61,8 +61,8 @@ class Backend(str, Enum):
 # Define an enum for the ocr engines
 class OcrEngine(str, Enum):
    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    TESSEROCR = "tesserocr"
 def export_documents(
@ -209,10 +209,10 @@ def convert(
    match ocr_engine:
        case OcrEngine.EASYOCR:
            ocr_options = EasyOcrOptions()
        case OcrEngine.TESSERACT_CLI:
            ocr_options = TesseractCLIOptions()
        case OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions()
+            ocr_options = TesseractOptions()
        case OcrEngine.TESSEROCR:
            ocr_options = TesserOcrOptions()
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -36,7 +36,7 @@ class EasyOcrOptions(OcrOptions):
    )
-class TesseractOcrOptions(OcrOptions):
+class TesseractCLIOptions(OcrOptions):
    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"
@ -47,7 +47,7 @@ class TesseractOcrOptions(OcrOptions):
    )
-class TesserOcrOptions(OcrOptions):
+class TesseractOptions(OcrOptions):
    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None
@ -62,6 +62,6 @@ class PipelineOptions(BaseModel):
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field(
+    ocr_options: Union[EasyOcrOptions, TesseractCLIOptions, TesseractOptions] = Field(
        EasyOcrOptions(), discriminator="kind"
    )
--- a/docling/models/tesseract_cli_model.py
+++ b/docling/models/tesseract_cli_model.py
@ -0,0 +1,167 @@
 import io
 import logging
 import tempfile
 from subprocess import PIPE, Popen
 from typing import Iterable, Tuple
 import pandas as pd
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractCLIOptions
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
 class TesseractCLIModel(BaseOcrModel):
    """OCR model that runs the Tesseract command-line binary as a subprocess.

    For every OCR rectangle of a page, the cropped page image is written to a
    temporary PNG file, the binary configured in ``options.tesseract_cmd`` is
    run on it with TSV output, and each non-empty TSV row becomes an
    ``OcrCell`` appended to the page's cells.
    """

    def __init__(self, enabled: bool, options: TesseractCLIOptions):
        """Store the options and, when enabled, check the binary can be run.

        Args:
            enabled: Whether OCR is performed; when False ``__call__`` passes
                pages through untouched and no binary check is made.
            options: CLI options (command name, languages, ...).

        Raises:
            RuntimeError: If enabled and querying the Tesseract version fails
                for any reason (the original error is chained as the cause).
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCLIOptions
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        # Cached result of `<tesseract_cmd> --version`; filled lazily.
        self._name = None
        self._version = None
        if self.enabled:
            try:
                self._get_name_and_version()
            except Exception as exc:
                # Chain the cause so the underlying failure stays visible
                # in the traceback.
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return ``(name, version)`` as reported by ``<tesseract_cmd> --version``.

        The result is cached on the instance after the first successful call.
        If the command prints nothing, ``("tesseract", "XXX")`` is returned;
        if it prints a name without a version, the version is ``"XXX"``.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version
        cmd = [self.options.tesseract_cmd, "--version"]
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() already waits for the process to terminate.
        stdout, stderr = proc.communicate()
        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )
        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"
        # Take the first two whitespace-separated tokens instead of requiring
        # exactly two, so extra tokens or repeated spaces do not raise.
        tokens = version_line.split()
        name = tokens[0]
        version = tokens[1] if len(tokens) > 1 else "XXX"
        self._name = name
        self._version = version
        return name, version

    def _run_tesseract(self, ifilename: str):
        """Run Tesseract on an image file and return its TSV rows with text.

        Args:
            ifilename: Path of the image file passed to the binary.

        Returns:
            A pandas DataFrame parsed from the TSV written to stdout,
            restricted to rows whose ``text`` is non-null and not blank.
        """
        import csv  # local import: only needed for the quoting constant

        cmd = [self.options.tesseract_cmd]
        if self.options.lang is not None and len(self.options.lang) > 0:
            cmd.append("-l")
            cmd.append("+".join(self.options.lang))
        if self.options.path is not None:
            cmd.append("--tessdata-dir")
            cmd.append(self.options.path)
        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: %s", " ".join(cmd))
        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()
        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")
        # Parse the TSV emitted by Tesseract. Quoting is disabled so a literal
        # `"` inside recognized text is not treated as a field delimiter, and
        # `text` is forced to str so `.str.strip()` below also works when
        # every recognized token happens to look numeric.
        df = pd.read_csv(
            io.StringIO(decoded_data),
            sep="\t",
            quoting=csv.QUOTE_NONE,
            dtype={"text": str},
        )
        # Filter rows that contain actual text (ignore header or empty rows)
        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
        return df_filtered

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on each page and yield the pages with OCR cells added.

        When the model is disabled the pages are yielded unchanged.
        """
        if not self.enabled:
            yield from page_batch
            return
        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)
            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
                # Write the image into a private temporary directory rather
                # than into a still-open NamedTemporaryFile: reopening such a
                # file by name is not possible on Windows.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fname = f"{tmp_dir}/ocr_rect.png"
                    high_res_image.save(fname)
                    df = self._run_tesseract(fname)
                # Convert every TSV row (bounding box and text) into a cell
                for ix, row in df.iterrows():
                    text = row["text"]
                    conf = row["conf"]
                    left = float(row["left"])
                    top = float(row["top"])
                    width = float(row["width"])
                    height = float(row["height"])
                    bottom = top + height
                    right = left + width
                    # Undo the render scale and shift by the rectangle's
                    # top-left corner to get back to page coordinates.
                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            coord=(
                                (left / self.scale) + ocr_rect.l,
                                (top / self.scale) + ocr_rect.t,
                                (right / self.scale) + ocr_rect.l,
                                (bottom / self.scale) + ocr_rect.t,
                            ),
                            origin=CoordOrigin.TOPLEFT,
                        ),
                    )
                    all_ocr_cells.append(cell)
            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
            page.cells.extend(filtered_ocr_cells)
            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)
            yield page
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@ -1,105 +1,65 @@
 import io
 import logging
-import tempfile
+from typing import Iterable
 from subprocess import PIPE, Popen
 from typing import Iterable, Tuple
-import pandas as pd
+import numpy
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
-from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.datamodel.pipeline_options import TesseractCLIOptions
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
-class TesseractOcrModel(BaseOcrModel):
+class TesseractModel(BaseOcrModel):
-
+    def __init__(self, enabled: bool, options: TesseractCLIOptions):
    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        super().__init__(enabled=enabled, options=options)
-        self.options: TesseractOcrOptions
+        self.options: TesseractCLIOptions
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-
+        self.reader = None
        self._name = None
        self._version = None
        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with"
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )
            try:
-                self._get_name_and_version()
+                import tesserocr
            except ImportError:
                raise ImportError(setup_errmsg)
-            except Exception as exc:
+            try:
-                raise RuntimeError(
+                tesseract_version = tesserocr.tesseract_version()
-                    f"Tesseract is not available, aborting: {exc} "
+                _log.debug("Initializing TesserOCR: %s", tesseract_version)
-                    "Install tesseract on your system and the tesseract binary is discoverable. "
+            except:
-                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
+                raise ImportError(setup_errmsg)
-                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+
            # Initialize the tesseractAPI
            lang = "+".join(self.options.lang)
            if self.options.path is not None:
                self.reader = tesserocr.PyTessBaseAPI(
                    path=self.options.path,
                    lang=lang,
                    psm=tesserocr.PSM.AUTO,
                    init=True,
                    oem=tesserocr.OEM.DEFAULT,
                )
            else:
                self.reader = tesserocr.PyTessBaseAPI(
                    lang=lang,
                    psm=tesserocr.PSM.AUTO,
                    init=True,
                    oem=tesserocr.OEM.DEFAULT,
                )
            self.reader_RIL = tesserocr.RIL
-    def _get_name_and_version(self) -> Tuple[str, str]:
+    def __del__(self):
-
+        if self.reader is not None:
-        if self._name != None and self._version != None:
+            # Finalize the tesseractAPI
-            return self._name, self._version
+            self.reader.End()
        cmd = [self.options.tesseract_cmd, "--version"]
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()
        proc.wait()
        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )
        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"
        name, version = version_line.split(" ")
        self._name = name
        self._version = version
        return name, version
    def _run_tesseract(self, ifilename: str):
        cmd = [self.options.tesseract_cmd]
        if self.options.lang is not None and len(self.options.lang) > 0:
            cmd.append("-l")
            cmd.append("+".join(self.options.lang))
        if self.options.path is not None:
            cmd.append("--tessdata-dir")
            cmd.append(self.options.path)
        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))
        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()
        # _log.info(output)
        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")
        # _log.info(decoded_data)
        # Read the TSV file generated by Tesseract
        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
        # Display the dataframe (optional)
        # _log.info("df: ", df.head())
        # Filter rows that contain actual text (ignore header or empty rows)
        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
        return df_filtered
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
@ -119,42 +79,37 @@ class TesseractOcrModel(BaseOcrModel):
                    scale=self.scale, cropbox=ocr_rect
                )
-                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
+                # Retrieve text snippets with their bounding boxes
-                    fname = image_file.name
+                self.reader.SetImage(high_res_image)
-                    high_res_image.save(fname)
+                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
-                    df = self._run_tesseract(fname)
+                cells = []
                for ix, (im, box, _, _) in enumerate(boxes):
                    # Set the area of interest. Tesseract uses Bottom-Left for the origin
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
-                # _log.info(df)
+                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    confidence = self.reader.MeanTextConf()
                    left = box["x"] / self.scale
                    bottom = box["y"] / self.scale
                    right = (box["x"] + box["w"]) / self.scale
                    top = (box["y"] + box["h"]) / self.scale
-                # Print relevant columns (bounding box and text)
+                    cells.append(
-                for ix, row in df.iterrows():
+                        OcrCell(
-                    text = row["text"]
+                            id=ix,
-                    conf = row["conf"]
+                            text=text,
-
+                            confidence=confidence,
-                    l = float(row["left"])
+                            bbox=BoundingBox.from_tuple(
-                    b = float(row["top"])
+                                coord=(left, top, right, bottom),
-                    w = float(row["width"])
+                                origin=CoordOrigin.TOPLEFT,
                    h = float(row["height"])
                    t = b + h
                    r = l + w
                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            coord=(
                                (l / self.scale) + ocr_rect.l,
                                (b / self.scale) + ocr_rect.t,
                                (r / self.scale) + ocr_rect.l,
                                (t / self.scale) + ocr_rect.t,
                            ),
-                            origin=CoordOrigin.TOPLEFT,
+                        )
                        ),
                    )
-                    all_ocr_cells.append(cell)
+
                # del high_res_image
                all_ocr_cells.extend(cells)
            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
--- a/docling/models/tesserocr_model.py
+++ b/docling/models/tesserocr_model.py
@ -1,122 +0,0 @@
 import logging
 from typing import Iterable
 import numpy
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
 class TesserOcrModel(BaseOcrModel):
    """OCR model backed by the ``tesserocr`` Python bindings.

    When enabled, a ``tesserocr.PyTessBaseAPI`` reader is created once in
    ``__init__`` and reused for every page; it is finalized in ``__del__``.
    """

    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        """Store the options and, when enabled, initialize the Tesseract API.

        Args:
            enabled: Whether OCR is performed; when False no import of
                ``tesserocr`` is attempted and ``reader`` stays ``None``.
            options: OCR options; ``lang`` and ``path`` are read here.

        Raises:
            ImportError: If ``tesserocr`` cannot be imported, or if querying
                its Tesseract version fails.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractOcrOptions
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None
        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )
            try:
                import tesserocr
            except ImportError as exc:
                raise ImportError(setup_errmsg) from exc
            try:
                tesseract_version = tesserocr.tesseract_version()
                _log.debug("Initializing TesserOCR: %s", tesseract_version)
            except Exception as exc:
                # Same exception type as before, but no longer a bare
                # `except:` (which would also intercept KeyboardInterrupt /
                # SystemExit), and the cause is kept for debugging.
                raise ImportError(setup_errmsg) from exc
            # Initialize the tesseractAPI; `path` is only passed when set.
            api_kwargs = dict(
                lang="+".join(self.options.lang),
                psm=tesserocr.PSM.AUTO,
                init=True,
                oem=tesserocr.OEM.DEFAULT,
            )
            if self.options.path is not None:
                api_kwargs["path"] = self.options.path
            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        """Finalize the Tesseract API if a reader was created."""
        # getattr: `reader` does not exist yet if __init__ raised before
        # assigning it (e.g. inside super().__init__).
        reader = getattr(self, "reader", None)
        if reader is not None:
            # Finalize the tesseractAPI
            reader.End()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on each page and yield the pages with OCR cells added.

        When the model is disabled the pages are yielded unchanged.
        """
        if not self.enabled:
            yield from page_batch
            return
        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)
            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
                # Retrieve text snippets with their bounding boxes
                self.reader.SetImage(high_res_image)
                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
                cells = []
                for ix, (_, box, _, _) in enumerate(boxes):
                    # Restrict recognition to this text line.
                    # NOTE(review): the original comment said Tesseract uses a
                    # bottom-left origin here; confirm against tesserocr docs.
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    # NOTE(review): passed through unscaled, whereas the CLI
                    # model divides its confidence by 100 — confirm the
                    # expected range of OcrCell.confidence.
                    confidence = self.reader.MeanTextConf()
                    # Undo the render scale.
                    # NOTE(review): unlike the CLI model, no ocr_rect.l /
                    # ocr_rect.t offset is added, so these are relative to the
                    # cropped image — confirm this is intended.
                    left = box["x"] / self.scale
                    bottom = box["y"] / self.scale
                    right = (box["x"] + box["w"]) / self.scale
                    top = (box["y"] + box["h"]) / self.scale
                    cells.append(
                        OcrCell(
                            id=ix,
                            text=text,
                            confidence=confidence,
                            bbox=BoundingBox.from_tuple(
                                coord=(left, top, right, bottom),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        )
                    )
                # del high_res_image
                all_ocr_cells.extend(cells)
            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
            page.cells.extend(filtered_ocr_cells)
            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)
            yield page
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@ -3,15 +3,15 @@ from pathlib import Path
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PipelineOptions,
-    TesseractOcrOptions,
+    TesseractCLIOptions,
-    TesserOcrOptions,
+    TesseractOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
-from docling.models.tesseract_model import TesseractOcrModel
+from docling.models.tesseract_cli_model import TesseractCLIModel
-from docling.models.tesserocr_model import TesserOcrModel
+from docling.models.tesseract_model import TesseractModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
@ -28,13 +28,13 @@ class StandardModelPipeline(BaseModelPipeline):
                enabled=pipeline_options.do_ocr,
                options=pipeline_options.ocr_options,
            )
-        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
+        elif isinstance(pipeline_options.ocr_options, TesseractCLIOptions):
-            ocr_model = TesseractOcrModel(
+            ocr_model = TesseractCLIModel(
                enabled=pipeline_options.do_ocr,
                options=pipeline_options.ocr_options,
            )
-        elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
+        elif isinstance(pipeline_options.ocr_options, TesseractOptions):
-            ocr_model = TesserOcrModel(
+            ocr_model = TesseractModel(
                enabled=pipeline_options.do_ocr,
                options=pipeline_options.ocr_options,
            )
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -8,7 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions
+from docling.datamodel.pipeline_options import TesseractCLIOptions, TesseractOptions
 from docling.document_converter import DocumentConverter
 _log = logging.getLogger(__name__)
@ -126,7 +126,7 @@ def main():
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
-    pipeline_options.ocr_options = TesserOcrOptions()
+    pipeline_options.ocr_options = TesseractOptions()
    # Docling Parse with Tesseract CLI
    # ----------------------
@ -134,7 +134,7 @@ def main():
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
-    pipeline_options.ocr_options = TesseractOcrOptions()
+    pipeline_options.ocr_options = TesseractCLIOptions()
    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -7,8 +7,8 @@ from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrOptions,
    PipelineOptions,
-    TesseractOcrOptions,
+    TesseractCLIOptions,
-    TesserOcrOptions,
+    TesseractOptions,
 )
 from docling.document_converter import DocumentConverter
@ -74,8 +74,8 @@ def test_e2e_conversions():
    engines: List[OcrOptions] = [
        EasyOcrOptions(),
-        TesserOcrOptions(),
+        TesseractOptions(),
-        TesseractOcrOptions(),
+        TesseractCLIOptions(),
    ]
    for ocr_options in engines: