From bb8cd0f7fcd2d962bfb71c937829284bf7b250fd Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Tue, 8 Oct 2024 16:46:25 +0200 Subject: [PATCH] fix: Rename the tesseract OCR related classes and filenames Signed-off-by: Nikos Livathinos --- docling/cli/main.py | 8 ++++---- docling/datamodel/pipeline_options.py | 8 ++++---- ...t_cli_model.py => tesseract_ocr_cli_model.py} | 8 ++++---- ...tesseract_model.py => tesseract_ocr_model.py} | 8 ++++---- docling/pipeline/standard_model_pipeline.py | 16 ++++++++-------- examples/custom_convert.py | 9 ++++++--- tests/test_e2e_ocr_conversion.py | 8 ++++---- 7 files changed, 34 insertions(+), 31 deletions(-) rename docling/models/{tesseract_cli_model.py => tesseract_ocr_cli_model.py} (95%) rename docling/models/{tesseract_model.py => tesseract_ocr_model.py} (95%) diff --git a/docling/cli/main.py b/docling/cli/main.py index 41f9440e..e27026d9 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -17,8 +17,8 @@ from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.pipeline_options import ( EasyOcrOptions, PipelineOptions, - TesseractCLIOptions, - TesseractOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, ) from docling.document_converter import DocumentConverter @@ -210,9 +210,9 @@ def convert( case OcrEngine.EASYOCR: ocr_options = EasyOcrOptions() case OcrEngine.TESSERACT_CLI: - ocr_options = TesseractCLIOptions() + ocr_options = TesseractCliOcrOptions() case OcrEngine.TESSERACT: - ocr_options = TesseractOptions() + ocr_options = TesseractOcrOptions() case _: raise RuntimeError(f"Unexpected backend type {backend}") diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 41e56297..2ebff48d 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -36,7 +36,7 @@ class EasyOcrOptions(OcrOptions): ) -class TesseractCLIOptions(OcrOptions): +class TesseractCliOcrOptions(OcrOptions): kind: 
Literal["tesseract"] = "tesseract" lang: List[str] = ["fra", "deu", "spa", "eng"] tesseract_cmd: str = "tesseract" @@ -47,7 +47,7 @@ class TesseractCLIOptions(OcrOptions): ) -class TesseractOptions(OcrOptions): +class TesseractOcrOptions(OcrOptions): kind: Literal["tesserocr"] = "tesserocr" lang: List[str] = ["fra", "deu", "spa", "eng"] path: Optional[str] = None @@ -62,6 +62,6 @@ class PipelineOptions(BaseModel): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: Union[EasyOcrOptions, TesseractCLIOptions, TesseractOptions] = Field( - EasyOcrOptions(), discriminator="kind" + ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = ( + Field(EasyOcrOptions(), discriminator="kind") ) diff --git a/docling/models/tesseract_cli_model.py b/docling/models/tesseract_ocr_cli_model.py similarity index 95% rename from docling/models/tesseract_cli_model.py rename to docling/models/tesseract_ocr_cli_model.py index 0a23be97..c3c19991 100644 --- a/docling/models/tesseract_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -7,17 +7,17 @@ from typing import Iterable, Tuple import pandas as pd from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page -from docling.datamodel.pipeline_options import TesseractCLIOptions +from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) -class TesseractCLIModel(BaseOcrModel): +class TesseractOcrCliModel(BaseOcrModel): - def __init__(self, enabled: bool, options: TesseractCLIOptions): + def __init__(self, enabled: bool, options: TesseractCliOcrOptions): super().__init__(enabled=enabled, options=options) - self.options: TesseractCLIOptions + self.options: TesseractCliOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_ocr_model.py similarity index 95% rename from docling/models/tesseract_model.py rename to docling/models/tesseract_ocr_model.py index cafb39d6..1b4f6f7f 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -4,16 +4,16 @@ from typing import Iterable import numpy from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page -from docling.datamodel.pipeline_options import TesseractCLIOptions +from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) -class TesseractModel(BaseOcrModel): - def __init__(self, enabled: bool, options: TesseractCLIOptions): +class TesseractOcrModel(BaseOcrModel): + def __init__(self, enabled: bool, options: TesseractOcrOptions): super().__init__(enabled=enabled, options=options) - self.options: TesseractCLIOptions + self.options: TesseractOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
self.reader = None diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index c1ef179d..3cbd87d9 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -3,15 +3,15 @@ from pathlib import Path from docling.datamodel.pipeline_options import ( EasyOcrOptions, PipelineOptions, - TesseractCLIOptions, - TesseractOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel -from docling.models.tesseract_cli_model import TesseractCLIModel -from docling.models.tesseract_model import TesseractModel +from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel +from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_model_pipeline import BaseModelPipeline @@ -28,13 +28,13 @@ class StandardModelPipeline(BaseModelPipeline): enabled=pipeline_options.do_ocr, options=pipeline_options.ocr_options, ) - elif isinstance(pipeline_options.ocr_options, TesseractCLIOptions): - ocr_model = TesseractCLIModel( + elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions): + ocr_model = TesseractOcrCliModel( enabled=pipeline_options.do_ocr, options=pipeline_options.ocr_options, ) - elif isinstance(pipeline_options.ocr_options, TesseractOptions): - ocr_model = TesseractModel( + elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): + ocr_model = TesseractOcrModel( enabled=pipeline_options.do_ocr, options=pipeline_options.ocr_options, ) diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 17c78f02..59724120 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -8,7 +8,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from 
docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import TesseractCLIOptions, TesseractOptions +from docling.datamodel.pipeline_options import ( + TesseractCliOcrOptions, + TesseractOcrOptions, +) from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -126,7 +129,7 @@ def main(): pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.ocr_options = TesseractOptions() + pipeline_options.ocr_options = TesseractOcrOptions() # Docling Parse with Tesseract CLI # ---------------------- @@ -134,7 +137,7 @@ def main(): pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.ocr_options = TesseractCLIOptions() + pipeline_options.ocr_options = TesseractCliOcrOptions() doc_converter = DocumentConverter( pipeline_options=pipeline_options, diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index c875963f..96bc0871 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -7,8 +7,8 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, OcrOptions, PipelineOptions, - TesseractCLIOptions, - TesseractOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, ) from docling.document_converter import DocumentConverter @@ -74,8 +74,8 @@ def test_e2e_conversions(): engines: List[OcrOptions] = [ EasyOcrOptions(), - TesseractOptions(), - TesseractCLIOptions(), + TesseractOcrOptions(), + TesseractCliOcrOptions(), ] for ocr_options in engines: