feat: Add Tesseract PSM options support (#2411)

* feat: Add Tesseract PSM options support

Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>

* apply formatting

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add tesseract_cli in checks

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Bruno Pio
2025-10-10 09:44:30 -03:00
committed by GitHub
parent ee5501320e
commit f11f8c0a81
4 changed files with 38 additions and 5 deletions

View File

@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
PipelineOptions,
ProcessingPipeline,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings
@@ -380,6 +382,13 @@ def convert( # noqa: C901
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
),
] = None,
psm: Annotated[
Optional[int],
typer.Option(
...,
help="Page Segmentation Mode for the OCR engine (0-13).",
),
] = None,
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
] = PdfBackend.DLPARSE_V2,
@@ -596,6 +605,10 @@ def convert( # noqa: C901
ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list
if psm is not None and isinstance(
ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
):
ocr_options.psm = psm
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
# pipeline_options: PaginatedPipelineOptions

View File

@@ -154,6 +154,9 @@ class TesseractCliOcrOptions(OcrOptions):
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
path: Optional[str] = None
psm: Optional[int] = (
None # Page Segmentation Mode (0-13); if None, no --psm flag is passed and the tesseract CLI's own default applies
)
model_config = ConfigDict(
extra="forbid",
@@ -166,6 +169,9 @@ class TesseractOcrOptions(OcrOptions):
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
psm: Optional[int] = (
None # Page Segmentation Mode (0-13); if None, tesserocr.PSM.AUTO is used
)
model_config = ConfigDict(
extra="forbid",

View File

@@ -117,6 +117,10 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd.append("--tessdata-dir")
cmd.append(self.options.path)
# Add PSM option if specified in the configuration
if self.options.psm is not None:
cmd.extend(["--psm", str(self.options.psm)])
cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd)))

View File

@@ -86,7 +86,6 @@ class TesseractOcrModel(BaseOcrModel):
self.script_prefix = ""
tesserocr_kwargs = {
"psm": tesserocr.PSM.AUTO,
"init": True,
"oem": tesserocr.OEM.DEFAULT,
}
@@ -96,14 +95,23 @@ class TesseractOcrModel(BaseOcrModel):
if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path
# Set main OCR reader with configurable PSM
main_psm = (
tesserocr.PSM(self.options.psm)
if self.options.psm is not None
else tesserocr.PSM.AUTO
)
if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
else:
self.reader = tesserocr.PyTessBaseAPI(
**{"lang": lang} | tesserocr_kwargs,
lang=lang,
psm=main_psm,
**tesserocr_kwargs,
)
# OSD reader must use PSM.OSD_ONLY for orientation detection
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
)
self.reader_RIL = tesserocr.RIL
@@ -187,7 +195,9 @@ class TesseractOcrModel(BaseOcrModel):
tesserocr.PyTessBaseAPI(
path=self.reader.GetDatapath(),
lang=lang,
psm=tesserocr.PSM.AUTO,
psm=tesserocr.PSM(self.options.psm)
if self.options.psm is not None
else tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
)