feat: Add Tesseract PSM options support (#2411)

* feat: Add Tesseract PSM options support

  Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>

* apply formatting

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add tesseract_cli in checks

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-10 09:44:30 -03:00
parent ee5501320e
commit f11f8c0a81
4 changed files with 38 additions and 5 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
    PipelineOptions,
    ProcessingPipeline,
    TableFormerMode,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
    VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
@@ -380,6 +382,13 @@ def convert(  # noqa: C901
            help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
        ),
    ] = None,
+    psm: Annotated[
+        Optional[int],
+        typer.Option(
+            ...,
+            help="Page Segmentation Mode for the OCR engine (0-13).",
+        ),
+    ] = None,
    pdf_backend: Annotated[
        PdfBackend, typer.Option(..., help="The PDF backend to use.")
    ] = PdfBackend.DLPARSE_V2,
@@ -596,6 +605,10 @@ def convert(  # noqa: C901
        ocr_lang_list = _split_list(ocr_lang)
        if ocr_lang_list is not None:
            ocr_options.lang = ocr_lang_list
+        if psm is not None and isinstance(
+            ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
+        ):
+            ocr_options.psm = psm

        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
        # pipeline_options: PaginatedPipelineOptions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -154,6 +154,9 @@ class TesseractCliOcrOptions(OcrOptions):
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"
    path: Optional[str] = None
+    psm: Optional[int] = (
+        None  # Page Segmentation Mode (0-13), defaults to tesseract's default
+    )

    model_config = ConfigDict(
        extra="forbid",
@@ -166,6 +169,9 @@ class TesseractOcrOptions(OcrOptions):
    kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None
+    psm: Optional[int] = (
+        None  # Page Segmentation Mode (0-13), defaults to tesseract's default
+    )

    model_config = ConfigDict(
        extra="forbid",
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -117,6 +117,10 @@ class TesseractOcrCliModel(BaseOcrModel):
            cmd.append("--tessdata-dir")
            cmd.append(self.options.path)

+        # Add PSM option if specified in the configuration
+        if self.options.psm is not None:
+            cmd.extend(["--psm", str(self.options.psm)])
+
        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))

--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -86,7 +86,6 @@ class TesseractOcrModel(BaseOcrModel):
                self.script_prefix = ""

            tesserocr_kwargs = {
-                "psm": tesserocr.PSM.AUTO,
                "init": True,
                "oem": tesserocr.OEM.DEFAULT,
            }
@@ -96,14 +95,23 @@ class TesseractOcrModel(BaseOcrModel):
            if self.options.path is not None:
                tesserocr_kwargs["path"] = self.options.path

+            # Set main OCR reader with configurable PSM
+            main_psm = (
+                tesserocr.PSM(self.options.psm)
+                if self.options.psm is not None
+                else tesserocr.PSM.AUTO
+            )
            if lang == "auto":
-                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
+                self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
            else:
                self.reader = tesserocr.PyTessBaseAPI(
-                    **{"lang": lang} | tesserocr_kwargs,
+                    lang=lang,
+                    psm=main_psm,
+                    **tesserocr_kwargs,
                )
+            # OSD reader must use PSM.OSD_ONLY for orientation detection
            self.osd_reader = tesserocr.PyTessBaseAPI(
-                **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
+                lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
            )
            self.reader_RIL = tesserocr.RIL

@@ -187,7 +195,9 @@ class TesseractOcrModel(BaseOcrModel):
                                        tesserocr.PyTessBaseAPI(
                                            path=self.reader.GetDatapath(),
                                            lang=lang,
-                                            psm=tesserocr.PSM.AUTO,
+                                            psm=tesserocr.PSM(self.options.psm)
+                                            if self.options.psm is not None
+                                            else tesserocr.PSM.AUTO,
                                            init=True,
                                            oem=tesserocr.OEM.DEFAULT,
                                        )