From f57e4b2afb57b3480c7fcd965e0744046d34cbcf Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Thu, 3 Oct 2024 18:59:29 +0200 Subject: [PATCH] add tesseract in CI, improve error messages and allow to specify the tesseract cmd Signed-off-by: Michele Dolfi --- .github/workflows/checks.yml | 2 ++ docling/datamodel/pipeline_options.py | 3 ++- docling/models/easyocr_model.py | 3 ++- docling/models/tesseract_model.py | 26 +++++++++++------------ docling/models/tesserocr_model.py | 22 +++++++++++++------- poetry.lock | 14 ++++++++++--- pyproject.toml | 5 ++++- tests/test_e2e_ocr_conversion.py | 30 ++++++++++++++++++--------- 8 files changed, 68 insertions(+), 37 deletions(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 8e92e76e..a38edf58 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -9,6 +9,8 @@ jobs: python-version: ['3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 + - name: Install tesseract + run: sudo apt-get install -y tesseract-ocr libleptonica-dev libtesseract-dev pkg-config - uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 22333cf0..c9b4a9df 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -30,7 +30,8 @@ class EasyOcrOptions(OcrOptions): class TesseractOcrOptions(OcrOptions): kind: Literal["tesseract"] = "tesseract" - lang: List[str] = ["fr", "de", "es", "en"] + lang: List[str] = ["fra", "deu", "spa", "eng"] + tesseract_cmd: str = "tesseract" class TesserOcrOptions(OcrOptions): diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 13455b6b..fef0958d 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -22,7 +22,8 @@ class EasyOcrModel(BaseOcrModel): import easyocr except ImportError: raise ImportError( - "EasyOCR is not installed. 
Please install it via `pip install easyocr` to use this OCR engine." + "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. " + "Alternatively, Docling has support for other OCR engines. See the documentation." ) self.reader = easyocr.Reader(lang_list=self.options.lang) diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 3b6fa04c..980e60bc 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -1,6 +1,6 @@ import io import logging -import os +import tempfile from subprocess import PIPE, Popen from typing import Iterable, Tuple @@ -29,15 +29,19 @@ class TesseractOcrModel(BaseOcrModel): self._get_name_and_version() except Exception as exc: - _log.error(f"Tesseract is not available, aborting: ", exc.what()) - self.enabled = False + raise RuntimeError( + f"Tesseract is not available, aborting: {exc} " + "Install tesseract on your system and make sure the tesseract binary is discoverable. " + "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. " + "Alternatively, Docling has support for other OCR engines. See the documentation." 
+ ) def _get_name_and_version(self) -> Tuple[str, str]: if self._name != None and self._version != None: return self._name, self._version - cmd = ["tesseract", "--version"] + cmd = [self.options.tesseract_cmd, "--version"] proc = Popen(cmd, stdout=PIPE, stderr=PIPE) stdout, stderr = proc.communicate() @@ -65,7 +69,7 @@ class TesseractOcrModel(BaseOcrModel): def _run_tesseract(self, ifilename, languages=None): - cmd = ["tesseract"] + cmd = [self.options.tesseract_cmd] if languages: cmd += ["-l", "+".join(languages)] @@ -108,17 +112,11 @@ class TesseractOcrModel(BaseOcrModel): scale=self.scale, cropbox=ocr_rect ) - # FIXME: do we really need to save the image to a file - fname = "temporary-file.png" - high_res_image.save(fname) + with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: + fname = image_file.name + high_res_image.save(fname) - df = None - if os.path.exists(fname): df = self._run_tesseract(fname) - os.remove(fname) - else: - _log.error(f"no image file: {fname}") - continue # _log.info(df) diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py index 9581589b..07707115 100644 --- a/docling/models/tesserocr_model.py +++ b/docling/models/tesserocr_model.py @@ -19,22 +19,30 @@ class TesserOcrModel(BaseOcrModel): self.reader = None if self.enabled: + setup_errmsg = ( + "tesserocr is not correctly installed. " + "Please install it via `pip install tesserocr` to use this OCR engine. " + "Note that tesserocr might have to be manually compiled for working with " + "your Tesseract installation. The Docling documentation provides examples for it. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) try: import tesserocr except ImportError: - msg = ( - "TesserOCR is not installed." - "Please install it via `pip install easyocr` to use this OCR engine." 
- ) - raise ImportError(msg) + raise ImportError(setup_errmsg) + + try: + tesseract_version = tesserocr.tesseract_version() + _log.debug("Initializing TesserOCR: %s", tesseract_version) + except: + raise ImportError(setup_errmsg) # Initialize the tesseractAPI lang = "+".join(self.options.lang) - _log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version()) self.reader = tesserocr.PyTessBaseAPI( lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT ) - self.reader_RIL = tesserocr.RIL.TEXTLINE + self.reader_RIL = tesserocr.RIL def __del__(self): if self.reader is not None: diff --git a/poetry.lock b/poetry.lock index e9b9f568..7c390068 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -6025,7 +6025,7 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"] name = "tesserocr" version = "2.7.1" description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython" -optional = false +optional = true python-versions = "*" files = [ {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"}, @@ -6641,6 +6641,11 @@ files = [ {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, ] [package.dependencies] @@ -7248,7 +7253,10 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] +[extras] +tesserocr = ["tesserocr"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "6518c5a526a0c2f9edbd2157bf9fca35ee9927e2ef5d26e9f31ec83f76191f51" +content-hash = "441fa5b2921ff9f9880e789c9e87cd89b617f3ff814f64cf750aa058d439511a" diff --git a/pyproject.toml b/pyproject.toml index c4b24d2f..1cb78945 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = "^1.7" -tesserocr = "^2.7.1" +tesserocr = { version = "^2.7.1", optional = true } docling-parse = "^1.2.0" certifi = ">=2024.7.4" rtree = "^1.3.0" @@ -82,6 +82,9 @@ langchain-huggingface = "^0.0.3" langchain-milvus = "^0.1.4" langchain-text-splitters = "^0.2.4" +[tool.poetry.extras] +tesserocr = ["tesserocr"] + [tool.poetry.scripts] docling = "docling.cli.main:app" diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index b455ce4d..8156c04c 100644 --- 
a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -1,10 +1,15 @@ from pathlib import Path - -from pydantic import Field +from typing import List from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PipelineOptions, TesseractOcrOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + OcrOptions, + PipelineOptions, + TesseractOcrOptions, + TesserOcrOptions, +) from docling.document_converter import DocumentConverter from .verify_utils import verify_conversion_result @@ -50,13 +55,12 @@ def get_pdf_paths(): return pdf_files -def get_converter(engine: str): +def get_converter(ocr_options: OcrOptions): pipeline_options = PipelineOptions() pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - if engine == "tesserocr": - pipeline_options.ocr_options = TesseractOcrOptions() + pipeline_options.ocr_options = ocr_options converter = DocumentConverter( pipeline_options=pipeline_options, @@ -70,9 +74,15 @@ def test_e2e_conversions(): pdf_paths = get_pdf_paths() - for engine in ["easyocr", "tesserocr", "tesseract"]: - print(f"Converting with ocr_engine: {engine}") - converter = get_converter(engine) + engines: List[OcrOptions] = [ + EasyOcrOptions(), + TesserOcrOptions(), + TesseractOcrOptions(), + ] + + for ocr_options in engines: + print(f"Converting with ocr_engine: {ocr_options.kind}") + converter = get_converter(ocr_options=ocr_options) for pdf_path in pdf_paths: print(f"converting {pdf_path}") @@ -86,7 +96,7 @@ def test_e2e_conversions(): input_path=pdf_path, doc_result=doc_result, generate=GENERATE, - ocr_engine=engine, + ocr_engine=ocr_options.kind, )