add tesseract in CI, improve error messages and allow specifying the tesseract cmd

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-03 18:59:29 +02:00
parent e571ab50ee
commit f57e4b2afb
8 changed files with 68 additions and 37 deletions

View File

@ -9,6 +9,8 @@ jobs:
python-version: ['3.10', '3.11', '3.12'] python-version: ['3.10', '3.11', '3.12']
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- name: Install tesseract
  # Installs the tesseract CLI together with the leptonica/tesseract
  # development packages and pkg-config (presumably needed to build the
  # tesserocr Python bindings - confirm against the setup-poetry action).
  # `apt-get` requires an explicit subcommand; without `install` the
  # package names are rejected as an invalid operation and the step fails.
  run: sudo apt-get install -y tesseract-ocr libleptonica-dev libtesseract-dev pkg-config
- uses: ./.github/actions/setup-poetry - uses: ./.github/actions/setup-poetry
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}

View File

@ -30,7 +30,8 @@ class EasyOcrOptions(OcrOptions):
class TesseractOcrOptions(OcrOptions): class TesseractOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract" kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fr", "de", "es", "en"] lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
class TesserOcrOptions(OcrOptions): class TesserOcrOptions(OcrOptions):

View File

@ -23,6 +23,7 @@ class EasyOcrModel(BaseOcrModel):
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. " "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
"Alternatively, Docling has support for other OCR engines. See the documentation."
) )
self.reader = easyocr.Reader(lang_list=self.options.lang) self.reader = easyocr.Reader(lang_list=self.options.lang)

View File

@ -1,6 +1,6 @@
import io import io
import logging import logging
import os import tempfile
from subprocess import PIPE, Popen from subprocess import PIPE, Popen
from typing import Iterable, Tuple from typing import Iterable, Tuple
@ -29,15 +29,19 @@ class TesseractOcrModel(BaseOcrModel):
self._get_name_and_version() self._get_name_and_version()
except Exception as exc: except Exception as exc:
_log.error(f"Tesseract is not available, aborting: ", exc.what()) raise RuntimeError(
self.enabled = False f"Tesseract is not available, aborting: {exc} "
"Install tesseract on your system and make sure the tesseract binary is discoverable. "
"The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
"Alternatively, Docling has support for other OCR engines. See the documentation."
)
def _get_name_and_version(self) -> Tuple[str, str]: def _get_name_and_version(self) -> Tuple[str, str]:
if self._name != None and self._version != None: if self._name != None and self._version != None:
return self._name, self._version return self._name, self._version
cmd = ["tesseract", "--version"] cmd = [self.options.tesseract_cmd, "--version"]
proc = Popen(cmd, stdout=PIPE, stderr=PIPE) proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate() stdout, stderr = proc.communicate()
@ -65,7 +69,7 @@ class TesseractOcrModel(BaseOcrModel):
def _run_tesseract(self, ifilename, languages=None): def _run_tesseract(self, ifilename, languages=None):
cmd = ["tesseract"] cmd = [self.options.tesseract_cmd]
if languages: if languages:
cmd += ["-l", "+".join(languages)] cmd += ["-l", "+".join(languages)]
@ -108,17 +112,11 @@ class TesseractOcrModel(BaseOcrModel):
scale=self.scale, cropbox=ocr_rect scale=self.scale, cropbox=ocr_rect
) )
# FIXME: do we really need to save the image to a file with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
fname = "temporary-file.png" fname = image_file.name
high_res_image.save(fname) high_res_image.save(fname)
df = None
if os.path.exists(fname):
df = self._run_tesseract(fname) df = self._run_tesseract(fname)
os.remove(fname)
else:
_log.error(f"no image file: {fname}")
continue
# _log.info(df) # _log.info(df)

View File

@ -19,22 +19,30 @@ class TesserOcrModel(BaseOcrModel):
self.reader = None self.reader = None
if self.enabled: if self.enabled:
setup_errmsg = (
"tesserocr is not correctly installed. "
"Please install it via `pip install tesserocr` to use this OCR engine. "
"Note that tesserocr might have to be manually compiled for working with "
"your Tesseract installation. The Docling documentation provides examples for it. "
"Alternatively, Docling has support for other OCR engines. See the documentation."
)
try: try:
import tesserocr import tesserocr
except ImportError: except ImportError:
msg = ( raise ImportError(setup_errmsg)
"TesserOCR is not installed."
"Please install it via `pip install easyocr` to use this OCR engine." try:
) tesseract_version = tesserocr.tesseract_version()
raise ImportError(msg) _log.debug("Initializing TesserOCR: %s", tesseract_version)
except:
raise ImportError(setup_errmsg)
# Initialize the tesseractAPI # Initialize the tesseractAPI
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
_log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
self.reader = tesserocr.PyTessBaseAPI( self.reader = tesserocr.PyTessBaseAPI(
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
) )
self.reader_RIL = tesserocr.RIL.TEXTLINE self.reader_RIL = tesserocr.RIL
def __del__(self): def __del__(self):
if self.reader is not None: if self.reader is not None:

14
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@ -6025,7 +6025,7 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"]
name = "tesserocr" name = "tesserocr"
version = "2.7.1" version = "2.7.1"
description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython" description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
optional = false optional = true
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"}, {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
@ -6641,6 +6641,11 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
] ]
[package.dependencies] [package.dependencies]
@ -7248,7 +7253,10 @@ enabler = ["pytest-enabler (>=2.2)"]
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
type = ["pytest-mypy"] type = ["pytest-mypy"]
[extras]
tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "6518c5a526a0c2f9edbd2157bf9fca35ee9927e2ef5d26e9f31ec83f76191f51" content-hash = "441fa5b2921ff9f9880e789c9e87cd89b617f3ff814f64cf750aa058d439511a"

View File

@ -46,7 +46,7 @@ pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
requests = "^2.32.3" requests = "^2.32.3"
easyocr = "^1.7" easyocr = "^1.7"
tesserocr = "^2.7.1" tesserocr = { version = "^2.7.1", optional = true }
docling-parse = "^1.2.0" docling-parse = "^1.2.0"
certifi = ">=2024.7.4" certifi = ">=2024.7.4"
rtree = "^1.3.0" rtree = "^1.3.0"
@ -82,6 +82,9 @@ langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4" langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4" langchain-text-splitters = "^0.2.4"
[tool.poetry.extras]
tesserocr = ["tesserocr"]
[tool.poetry.scripts] [tool.poetry.scripts]
docling = "docling.cli.main:app" docling = "docling.cli.main:app"

View File

@ -1,10 +1,15 @@
from pathlib import Path from pathlib import Path
from typing import List
from pydantic import Field
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions, TesseractOcrOptions from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PipelineOptions,
TesseractOcrOptions,
TesserOcrOptions,
)
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
from .verify_utils import verify_conversion_result from .verify_utils import verify_conversion_result
@ -50,13 +55,12 @@ def get_pdf_paths():
return pdf_files return pdf_files
def get_converter(engine: str): def get_converter(ocr_options: OcrOptions):
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
if engine == "tesserocr": pipeline_options.ocr_options = ocr_options
pipeline_options.ocr_options = TesseractOcrOptions()
converter = DocumentConverter( converter = DocumentConverter(
pipeline_options=pipeline_options, pipeline_options=pipeline_options,
@ -70,9 +74,15 @@ def test_e2e_conversions():
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
for engine in ["easyocr", "tesserocr", "tesseract"]: engines: List[OcrOptions] = [
print(f"Converting with ocr_engine: {engine}") EasyOcrOptions(),
converter = get_converter(engine) TesserOcrOptions(),
TesseractOcrOptions(),
]
for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}")
converter = get_converter(ocr_options=ocr_options)
for pdf_path in pdf_paths: for pdf_path in pdf_paths:
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
@ -86,7 +96,7 @@ def test_e2e_conversions():
input_path=pdf_path, input_path=pdf_path,
doc_result=doc_result, doc_result=doc_result,
generate=GENERATE, generate=GENERATE,
ocr_engine=engine, ocr_engine=ocr_options.kind,
) )