mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Add Tesseract PSM options support (#2411)
* feat: Add Tesseract PSM options support

  Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>

* apply formatting

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add tesseract_cli in checks

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
|
||||
PipelineOptions,
|
||||
ProcessingPipeline,
|
||||
TableFormerMode,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -380,6 +382,13 @@ def convert( # noqa: C901
|
||||
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
||||
),
|
||||
] = None,
|
||||
psm: Annotated[
|
||||
Optional[int],
|
||||
typer.Option(
|
||||
...,
|
||||
help="Page Segmentation Mode for the OCR engine (0-13).",
|
||||
),
|
||||
] = None,
|
||||
pdf_backend: Annotated[
|
||||
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = PdfBackend.DLPARSE_V2,
|
||||
@@ -596,6 +605,10 @@ def convert( # noqa: C901
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
if psm is not None and isinstance(
|
||||
ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
|
||||
):
|
||||
ocr_options.psm = psm
|
||||
|
||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||
# pipeline_options: PaginatedPipelineOptions
|
||||
|
||||
@@ -154,6 +154,9 @@ class TesseractCliOcrOptions(OcrOptions):
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
tesseract_cmd: str = "tesseract"
|
||||
path: Optional[str] = None
|
||||
psm: Optional[int] = (
|
||||
None # Page Segmentation Mode (0-13), defaults to tesseract's default
|
||||
)
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
@@ -166,6 +169,9 @@ class TesseractOcrOptions(OcrOptions):
|
||||
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
path: Optional[str] = None
|
||||
psm: Optional[int] = (
|
||||
None # Page Segmentation Mode (0-13), defaults to tesseract's default
|
||||
)
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
|
||||
@@ -117,6 +117,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
cmd.append("--tessdata-dir")
|
||||
cmd.append(self.options.path)
|
||||
|
||||
# Add PSM option if specified in the configuration
|
||||
if self.options.psm is not None:
|
||||
cmd.extend(["--psm", str(self.options.psm)])
|
||||
|
||||
cmd += [ifilename, "stdout", "tsv"]
|
||||
_log.info("command: {}".format(" ".join(cmd)))
|
||||
|
||||
|
||||
@@ -86,7 +86,6 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
self.script_prefix = ""
|
||||
|
||||
tesserocr_kwargs = {
|
||||
"psm": tesserocr.PSM.AUTO,
|
||||
"init": True,
|
||||
"oem": tesserocr.OEM.DEFAULT,
|
||||
}
|
||||
@@ -96,14 +95,23 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
if self.options.path is not None:
|
||||
tesserocr_kwargs["path"] = self.options.path
|
||||
|
||||
# Set main OCR reader with configurable PSM
|
||||
main_psm = (
|
||||
tesserocr.PSM(self.options.psm)
|
||||
if self.options.psm is not None
|
||||
else tesserocr.PSM.AUTO
|
||||
)
|
||||
if lang == "auto":
|
||||
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
||||
self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
|
||||
else:
|
||||
self.reader = tesserocr.PyTessBaseAPI(
|
||||
**{"lang": lang} | tesserocr_kwargs,
|
||||
lang=lang,
|
||||
psm=main_psm,
|
||||
**tesserocr_kwargs,
|
||||
)
|
||||
# OSD reader must use PSM.OSD_ONLY for orientation detection
|
||||
self.osd_reader = tesserocr.PyTessBaseAPI(
|
||||
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
||||
lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
|
||||
)
|
||||
self.reader_RIL = tesserocr.RIL
|
||||
|
||||
@@ -187,7 +195,9 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
tesserocr.PyTessBaseAPI(
|
||||
path=self.reader.GetDatapath(),
|
||||
lang=lang,
|
||||
psm=tesserocr.PSM.AUTO,
|
||||
psm=tesserocr.PSM(self.options.psm)
|
||||
if self.options.psm is not None
|
||||
else tesserocr.PSM.AUTO,
|
||||
init=True,
|
||||
oem=tesserocr.OEM.DEFAULT,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user