diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index bc30634d..6e3867f4 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -30,7 +30,7 @@ class EasyOcrOptions(OcrOptions): class TesseractOcrOptions(OcrOptions): kind: Literal["tesseract"] = "tesseract" - + lang: List[str] = ["fr", "de", "es", "en"] class TesserOcrOptions(OcrOptions): kind: Literal["tesseract"] = "tesserocr" diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 2f1fd4ee..687e3b4f 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -10,7 +10,6 @@ from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) - class TesseractOcrModel(BaseOcrModel): def __init__(self, enabled: bool, options: TesseractOcrOptions): diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index df64fb8a..c39b83c3 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -8,6 +8,7 @@ from docling.datamodel.pipeline_options import ( ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.easyocr_model import EasyOcrModel +from docling.models.tesseract_model import TesseractOCRModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_model import TesseractOcrModel diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index aea55651..37b72963 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -47,7 +47,28 @@ def get_pdf_paths(): return pdf_files -def get_converter(): +def get_easyocr_converter(): + + ocr_options = EasyOcrOptions( + + ) + + pipeline_options = PipelineOptions() + # Debug + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + + + + converter = DocumentConverter( + pipeline_options=pipeline_options, + pdf_backend=DoclingParseDocumentBackend, + ) + + return converter + +def get_tesseract_converter(): pipeline_options = PipelineOptions() # Debug @@ -63,6 +84,7 @@ def get_converter(): return converter + def test_e2e_conversions(): pdf_paths = get_pdf_paths()