From f57e4b2afb57b3480c7fcd965e0744046d34cbcf Mon Sep 17 00:00:00 2001
From: Michele Dolfi <dol@zurich.ibm.com>
Date: Thu, 3 Oct 2024 18:59:29 +0200
Subject: [PATCH] add tesseract in CI, improve error messages and allow to
 specify the tesseract cmd

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 .github/workflows/checks.yml          |  2 ++
 docling/datamodel/pipeline_options.py |  3 ++-
 docling/models/easyocr_model.py       |  3 ++-
 docling/models/tesseract_model.py     | 26 +++++++++++------------
 docling/models/tesserocr_model.py     | 22 +++++++++++++-------
 poetry.lock                           | 14 ++++++++++---
 pyproject.toml                        |  5 ++++-
 tests/test_e2e_ocr_conversion.py      | 30 ++++++++++++++++++---------
 8 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 8e92e76e..a38edf58 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -9,6 +9,8 @@ jobs:
         python-version: ['3.10', '3.11', '3.12']
     steps:
       - uses: actions/checkout@v3
+      - name: Install tesseract
+        run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
       - uses: ./.github/actions/setup-poetry
         with:
           python-version: ${{ matrix.python-version }}
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 22333cf0..c9b4a9df 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -30,7 +30,8 @@ class EasyOcrOptions(OcrOptions):
 
 class TesseractOcrOptions(OcrOptions):
     kind: Literal["tesseract"] = "tesseract"
-    lang: List[str] = ["fr", "de", "es", "en"]
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    tesseract_cmd: str = "tesseract"
 
 
 class TesserOcrOptions(OcrOptions):
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 13455b6b..fef0958d 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -22,7 +22,8 @@ class EasyOcrModel(BaseOcrModel):
                 import easyocr
             except ImportError:
                 raise ImportError(
-                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine."
+                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                 )
 
             self.reader = easyocr.Reader(lang_list=self.options.lang)
diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py
index 3b6fa04c..980e60bc 100644
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@@ -1,6 +1,6 @@
 import io
 import logging
-import os
+import tempfile
 from subprocess import PIPE, Popen
 from typing import Iterable, Tuple
 
@@ -29,15 +29,19 @@ class TesseractOcrModel(BaseOcrModel):
                 self._get_name_and_version()
 
             except Exception as exc:
-                _log.error(f"Tesseract is not available, aborting: ", exc.what())
-                self.enabled = False
+                raise RuntimeError(
+                    f"Tesseract is not available, aborting: {exc} "
+                    "Install tesseract on your system and make sure the tesseract binary is discoverable. "
+                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                ) from exc
 
     def _get_name_and_version(self) -> Tuple[str, str]:
 
         if self._name != None and self._version != None:
             return self._name, self._version
 
-        cmd = ["tesseract", "--version"]
+        cmd = [self.options.tesseract_cmd, "--version"]
 
         proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
         stdout, stderr = proc.communicate()
@@ -65,7 +69,7 @@ class TesseractOcrModel(BaseOcrModel):
 
     def _run_tesseract(self, ifilename, languages=None):
 
-        cmd = ["tesseract"]
+        cmd = [self.options.tesseract_cmd]
 
         if languages:
             cmd += ["-l", "+".join(languages)]
@@ -108,17 +112,11 @@ class TesseractOcrModel(BaseOcrModel):
                     scale=self.scale, cropbox=ocr_rect
                 )
 
-                # FIXME: do we really need to save the image to a file
-                fname = "temporary-file.png"
-                high_res_image.save(fname)
+                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
+                    fname = image_file.name
+                    high_res_image.save(fname)
 
-                df = None
-                if os.path.exists(fname):
                     df = self._run_tesseract(fname)
-                    os.remove(fname)
-                else:
-                    _log.error(f"no image file: {fname}")
-                    continue
 
                 # _log.info(df)
 
diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py
index 9581589b..07707115 100644
--- a/docling/models/tesserocr_model.py
+++ b/docling/models/tesserocr_model.py
@@ -19,22 +19,30 @@ class TesserOcrModel(BaseOcrModel):
         self.reader = None
 
         if self.enabled:
+            setup_errmsg = (
+                "tesserocr is not correctly installed. "
+                "Please install it via `pip install tesserocr` to use this OCR engine. "
+                "Note that tesserocr might have to be manually compiled for working with "
+                "your Tesseract installation. The Docling documentation provides examples for it. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
             try:
                 import tesserocr
             except ImportError:
-                msg = (
-                    "TesserOCR is not installed."
-                    "Please install it via `pip install easyocr` to use this OCR engine."
-                )
-                raise ImportError(msg)
+                raise ImportError(setup_errmsg)
+
+            try:
+                tesseract_version = tesserocr.tesseract_version()
+                _log.debug("Initializing TesserOCR: %s", tesseract_version)
+            except Exception as exc:
+                raise ImportError(setup_errmsg) from exc
 
             # Initialize the tesseractAPI
             lang = "+".join(self.options.lang)
-            _log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
             self.reader = tesserocr.PyTessBaseAPI(
                 lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
             )
-            self.reader_RIL = tesserocr.RIL.TEXTLINE
+            self.reader_RIL = tesserocr.RIL
 
     def __del__(self):
         if self.reader is not None:
diff --git a/poetry.lock b/poetry.lock
index e9b9f568..7c390068 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -6025,7 +6025,7 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"]
 name = "tesserocr"
 version = "2.7.1"
 description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
-optional = false
+optional = true
 python-versions = "*"
 files = [
     {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
@@ -6641,6 +6641,11 @@ files = [
     {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
     {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
     {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
+    {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
+    {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
+    {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
+    {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
+    {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
 ]
 
 [package.dependencies]
@@ -7248,7 +7253,10 @@ enabler = ["pytest-enabler (>=2.2)"]
 test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
 type = ["pytest-mypy"]
 
+[extras]
+tesserocr = ["tesserocr"]
+
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "6518c5a526a0c2f9edbd2157bf9fca35ee9927e2ef5d26e9f31ec83f76191f51"
+content-hash = "441fa5b2921ff9f9880e789c9e87cd89b617f3ff814f64cf750aa058d439511a"
diff --git a/pyproject.toml b/pyproject.toml
index c4b24d2f..1cb78945 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = "^1.7"
-tesserocr = "^2.7.1"
+tesserocr = { version = "^2.7.1", optional = true }
 docling-parse = "^1.2.0"
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
@@ -82,6 +82,9 @@ langchain-huggingface = "^0.0.3"
 langchain-milvus = "^0.1.4"
 langchain-text-splitters = "^0.2.4"
 
+[tool.poetry.extras]
+tesserocr = ["tesserocr"]
+
 [tool.poetry.scripts]
 docling = "docling.cli.main:app"
 
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index b455ce4d..8156c04c 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -1,10 +1,15 @@
 from pathlib import Path
-
-from pydantic import Field
+from typing import List
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import PipelineOptions, TesseractOcrOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+    PipelineOptions,
+    TesseractOcrOptions,
+    TesserOcrOptions,
+)
 from docling.document_converter import DocumentConverter
 
 from .verify_utils import verify_conversion_result
@@ -50,13 +55,12 @@ def get_pdf_paths():
     return pdf_files
 
 
-def get_converter(engine: str):
+def get_converter(ocr_options: OcrOptions):
     pipeline_options = PipelineOptions()
     pipeline_options.do_ocr = True
     pipeline_options.do_table_structure = True
     pipeline_options.table_structure_options.do_cell_matching = True
-    if engine == "tesserocr":
-        pipeline_options.ocr_options = TesseractOcrOptions()
+    pipeline_options.ocr_options = ocr_options
 
     converter = DocumentConverter(
         pipeline_options=pipeline_options,
@@ -70,9 +74,15 @@ def test_e2e_conversions():
 
     pdf_paths = get_pdf_paths()
 
-    for engine in ["easyocr", "tesserocr", "tesseract"]:
-        print(f"Converting with ocr_engine: {engine}")
-        converter = get_converter(engine)
+    engines: List[OcrOptions] = [
+        EasyOcrOptions(),
+        TesserOcrOptions(),
+        TesseractOcrOptions(),
+    ]
+
+    for ocr_options in engines:
+        print(f"Converting with ocr_engine: {ocr_options.kind}")
+        converter = get_converter(ocr_options=ocr_options)
         for pdf_path in pdf_paths:
             print(f"converting {pdf_path}")
 
@@ -86,7 +96,7 @@ def test_e2e_conversions():
                 input_path=pdf_path,
                 doc_result=doc_result,
                 generate=GENERATE,
-                ocr_engine=engine,
+                ocr_engine=ocr_options.kind,
             )