diff --git a/docling/cli/main.py b/docling/cli/main.py index 5782ef36..bcd0e24b 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import ( PipelineOptions, ProcessingPipeline, TableFormerMode, + TesseractCliOcrOptions, + TesseractOcrOptions, VlmPipelineOptions, ) from docling.datamodel.settings import settings @@ -380,6 +382,13 @@ def convert( # noqa: C901 help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.", ), ] = None, + psm: Annotated[ + Optional[int], + typer.Option( + ..., + help="Page Segmentation Mode for the OCR engine (0-13).", + ), + ] = None, pdf_backend: Annotated[ PdfBackend, typer.Option(..., help="The PDF backend to use.") ] = PdfBackend.DLPARSE_V2, @@ -596,6 +605,10 @@ def convert( # noqa: C901 ocr_lang_list = _split_list(ocr_lang) if ocr_lang_list is not None: ocr_options.lang = ocr_lang_list + if psm is not None and isinstance( + ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions) + ): + ocr_options.psm = psm accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) # pipeline_options: PaginatedPipelineOptions diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index ca8324e5..b76da87f 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -154,6 +154,9 @@ class TesseractCliOcrOptions(OcrOptions): lang: List[str] = ["fra", "deu", "spa", "eng"] tesseract_cmd: str = "tesseract" path: Optional[str] = None + psm: Optional[int] = ( + None # Page Segmentation Mode (0-13), defaults to tesseract's default + ) model_config = ConfigDict( extra="forbid", @@ -166,6 +169,9 @@ class TesseractOcrOptions(OcrOptions): kind: ClassVar[Literal["tesserocr"]] = "tesserocr" lang: List[str] = ["fra", "deu", "spa", "eng"] path: Optional[str] = None + psm: Optional[int] = ( + None # Page Segmentation Mode (0-13), defaults to tesseract's default + ) model_config = ConfigDict( extra="forbid", diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index c483fa87..d4eee549 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -117,6 +117,10 @@ class TesseractOcrCliModel(BaseOcrModel): cmd.append("--tessdata-dir") cmd.append(self.options.path) + # Add PSM option if specified in the configuration + if self.options.psm is not None: + cmd.extend(["--psm", str(self.options.psm)]) + cmd += [ifilename, "stdout", "tsv"] _log.info("command: {}".format(" ".join(cmd))) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index ed6306ba..be828c59 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -86,7 +86,6 @@ class TesseractOcrModel(BaseOcrModel): self.script_prefix = "" tesserocr_kwargs = { - "psm": tesserocr.PSM.AUTO, "init": True, "oem": tesserocr.OEM.DEFAULT, } @@ -96,14 +95,23 @@ class TesseractOcrModel(BaseOcrModel): if self.options.path is not None: tesserocr_kwargs["path"] = self.options.path + # Set main OCR reader with configurable PSM + main_psm = ( + tesserocr.PSM(self.options.psm) + if self.options.psm is not None + else tesserocr.PSM.AUTO + ) if lang == "auto": - self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs) + self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs) else: self.reader = tesserocr.PyTessBaseAPI( - **{"lang": lang} | tesserocr_kwargs, + lang=lang, + psm=main_psm, + **tesserocr_kwargs, ) + # OSD reader must use PSM.OSD_ONLY for orientation detection self.osd_reader = tesserocr.PyTessBaseAPI( - **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs + lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs ) self.reader_RIL = tesserocr.RIL @@ -187,7 +195,9 @@ class TesseractOcrModel(BaseOcrModel): tesserocr.PyTessBaseAPI( path=self.reader.GetDatapath(), lang=lang, - psm=tesserocr.PSM.AUTO, + psm=tesserocr.PSM(self.options.psm) + if self.options.psm is not None + else tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT, )