mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Add Tesseract to CI, improve error messages, and allow specifying the Tesseract command
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
e571ab50ee
commit
f57e4b2afb
2
.github/workflows/checks.yml
vendored
2
.github/workflows/checks.yml
vendored
@ -9,6 +9,8 @@ jobs:
|
|||||||
python-version: ['3.10', '3.11', '3.12']
|
python-version: ['3.10', '3.11', '3.12']
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
- name: Install tesseract
|
||||||
|
run: sudo apt-get -y tesseract-ocr libleptonica-dev libtesseract-dev pkg-config
|
||||||
- uses: ./.github/actions/setup-poetry
|
- uses: ./.github/actions/setup-poetry
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
@ -30,7 +30,8 @@ class EasyOcrOptions(OcrOptions):
|
|||||||
|
|
||||||
class TesseractOcrOptions(OcrOptions):
|
class TesseractOcrOptions(OcrOptions):
|
||||||
kind: Literal["tesseract"] = "tesseract"
|
kind: Literal["tesseract"] = "tesseract"
|
||||||
lang: List[str] = ["fr", "de", "es", "en"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
|
tesseract_cmd: str = "tesseract"
|
||||||
|
|
||||||
|
|
||||||
class TesserOcrOptions(OcrOptions):
|
class TesserOcrOptions(OcrOptions):
|
||||||
|
@ -22,7 +22,8 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
import easyocr
|
import easyocr
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine."
|
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
||||||
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.reader = easyocr.Reader(lang_list=self.options.lang)
|
self.reader = easyocr.Reader(lang_list=self.options.lang)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import tempfile
|
||||||
from subprocess import PIPE, Popen
|
from subprocess import PIPE, Popen
|
||||||
from typing import Iterable, Tuple
|
from typing import Iterable, Tuple
|
||||||
|
|
||||||
@ -29,15 +29,19 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
self._get_name_and_version()
|
self._get_name_and_version()
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_log.error(f"Tesseract is not available, aborting: ", exc.what())
|
raise RuntimeError(
|
||||||
self.enabled = False
|
f"Tesseract is not available, aborting: {exc} "
|
||||||
|
"Install tesseract on your system and the tesseract binary is discoverable. "
|
||||||
|
"The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
|
||||||
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||||
|
)
|
||||||
|
|
||||||
def _get_name_and_version(self) -> Tuple[str, str]:
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
||||||
|
|
||||||
if self._name != None and self._version != None:
|
if self._name != None and self._version != None:
|
||||||
return self._name, self._version
|
return self._name, self._version
|
||||||
|
|
||||||
cmd = ["tesseract", "--version"]
|
cmd = [self.options.tesseract_cmd, "--version"]
|
||||||
|
|
||||||
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
||||||
stdout, stderr = proc.communicate()
|
stdout, stderr = proc.communicate()
|
||||||
@ -65,7 +69,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
def _run_tesseract(self, ifilename, languages=None):
|
def _run_tesseract(self, ifilename, languages=None):
|
||||||
|
|
||||||
cmd = ["tesseract"]
|
cmd = [self.options.tesseract_cmd]
|
||||||
|
|
||||||
if languages:
|
if languages:
|
||||||
cmd += ["-l", "+".join(languages)]
|
cmd += ["-l", "+".join(languages)]
|
||||||
@ -108,17 +112,11 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
|
||||||
# FIXME: do we really need to save the image to a file
|
with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
|
||||||
fname = "temporary-file.png"
|
fname = image_file.name
|
||||||
high_res_image.save(fname)
|
high_res_image.save(fname)
|
||||||
|
|
||||||
df = None
|
|
||||||
if os.path.exists(fname):
|
|
||||||
df = self._run_tesseract(fname)
|
df = self._run_tesseract(fname)
|
||||||
os.remove(fname)
|
|
||||||
else:
|
|
||||||
_log.error(f"no image file: {fname}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# _log.info(df)
|
# _log.info(df)
|
||||||
|
|
||||||
|
@ -19,22 +19,30 @@ class TesserOcrModel(BaseOcrModel):
|
|||||||
self.reader = None
|
self.reader = None
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
|
setup_errmsg = (
|
||||||
|
"tesserocr is not correctly installed. "
|
||||||
|
"Please install it via `pip install tesserocr` to use this OCR engine. "
|
||||||
|
"Note that tesserocr might have to be manually compiled for working with"
|
||||||
|
"your Tesseract installation. The Docling documentation provides examples for it. "
|
||||||
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
import tesserocr
|
import tesserocr
|
||||||
except ImportError:
|
except ImportError:
|
||||||
msg = (
|
raise ImportError(setup_errmsg)
|
||||||
"TesserOCR is not installed."
|
|
||||||
"Please install it via `pip install easyocr` to use this OCR engine."
|
try:
|
||||||
)
|
tesseract_version = tesserocr.tesseract_version()
|
||||||
raise ImportError(msg)
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
||||||
|
except:
|
||||||
|
raise ImportError(setup_errmsg)
|
||||||
|
|
||||||
# Initialize the tesseractAPI
|
# Initialize the tesseractAPI
|
||||||
lang = "+".join(self.options.lang)
|
lang = "+".join(self.options.lang)
|
||||||
_log.debug("Initializing TesserOCR: %s", tesserocr.tesseract_version())
|
|
||||||
self.reader = tesserocr.PyTessBaseAPI(
|
self.reader = tesserocr.PyTessBaseAPI(
|
||||||
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
|
lang=lang, psm=tesserocr.PSM.AUTO, init=True, oem=tesserocr.OEM.DEFAULT
|
||||||
)
|
)
|
||||||
self.reader_RIL = tesserocr.RIL.TEXTLINE
|
self.reader_RIL = tesserocr.RIL
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
if self.reader is not None:
|
if self.reader is not None:
|
||||||
|
14
poetry.lock
generated
14
poetry.lock
generated
@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aiohappyeyeballs"
|
name = "aiohappyeyeballs"
|
||||||
@ -6025,7 +6025,7 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"]
|
|||||||
name = "tesserocr"
|
name = "tesserocr"
|
||||||
version = "2.7.1"
|
version = "2.7.1"
|
||||||
description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
|
description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
|
||||||
optional = false
|
optional = true
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
{file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
|
{file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
|
||||||
@ -6641,6 +6641,11 @@ files = [
|
|||||||
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
|
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
|
||||||
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
|
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
|
||||||
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
|
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
|
||||||
|
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
|
||||||
|
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
|
||||||
|
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
|
||||||
|
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
|
||||||
|
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -7248,7 +7253,10 @@ enabler = ["pytest-enabler (>=2.2)"]
|
|||||||
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
|
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
|
||||||
type = ["pytest-mypy"]
|
type = ["pytest-mypy"]
|
||||||
|
|
||||||
|
[extras]
|
||||||
|
tesserocr = ["tesserocr"]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "6518c5a526a0c2f9edbd2157bf9fca35ee9927e2ef5d26e9f31ec83f76191f51"
|
content-hash = "441fa5b2921ff9f9880e789c9e87cd89b617f3ff814f64cf750aa058d439511a"
|
||||||
|
@ -46,7 +46,7 @@ pydantic-settings = "^2.3.0"
|
|||||||
huggingface_hub = ">=0.23,<1"
|
huggingface_hub = ">=0.23,<1"
|
||||||
requests = "^2.32.3"
|
requests = "^2.32.3"
|
||||||
easyocr = "^1.7"
|
easyocr = "^1.7"
|
||||||
tesserocr = "^2.7.1"
|
tesserocr = { version = "^2.7.1", optional = true }
|
||||||
docling-parse = "^1.2.0"
|
docling-parse = "^1.2.0"
|
||||||
certifi = ">=2024.7.4"
|
certifi = ">=2024.7.4"
|
||||||
rtree = "^1.3.0"
|
rtree = "^1.3.0"
|
||||||
@ -82,6 +82,9 @@ langchain-huggingface = "^0.0.3"
|
|||||||
langchain-milvus = "^0.1.4"
|
langchain-milvus = "^0.1.4"
|
||||||
langchain-text-splitters = "^0.2.4"
|
langchain-text-splitters = "^0.2.4"
|
||||||
|
|
||||||
|
[tool.poetry.extras]
|
||||||
|
tesserocr = ["tesserocr"]
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
docling = "docling.cli.main:app"
|
docling = "docling.cli.main:app"
|
||||||
|
|
||||||
|
@ -1,10 +1,15 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
from pydantic import Field
|
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions, TesseractOcrOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
OcrOptions,
|
||||||
|
PipelineOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
TesserOcrOptions,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
from .verify_utils import verify_conversion_result
|
from .verify_utils import verify_conversion_result
|
||||||
@ -50,13 +55,12 @@ def get_pdf_paths():
|
|||||||
return pdf_files
|
return pdf_files
|
||||||
|
|
||||||
|
|
||||||
def get_converter(engine: str):
|
def get_converter(ocr_options: OcrOptions):
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PipelineOptions()
|
||||||
pipeline_options.do_ocr = True
|
pipeline_options.do_ocr = True
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
if engine == "tesserocr":
|
pipeline_options.ocr_options = ocr_options
|
||||||
pipeline_options.ocr_options = TesseractOcrOptions()
|
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
@ -70,9 +74,15 @@ def test_e2e_conversions():
|
|||||||
|
|
||||||
pdf_paths = get_pdf_paths()
|
pdf_paths = get_pdf_paths()
|
||||||
|
|
||||||
for engine in ["easyocr", "tesserocr", "tesseract"]:
|
engines: List[OcrOptions] = [
|
||||||
print(f"Converting with ocr_engine: {engine}")
|
EasyOcrOptions(),
|
||||||
converter = get_converter(engine)
|
TesserOcrOptions(),
|
||||||
|
TesseractOcrOptions(),
|
||||||
|
]
|
||||||
|
|
||||||
|
for ocr_options in engines:
|
||||||
|
print(f"Converting with ocr_engine: {ocr_options.kind}")
|
||||||
|
converter = get_converter(ocr_options=ocr_options)
|
||||||
for pdf_path in pdf_paths:
|
for pdf_path in pdf_paths:
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
@ -86,7 +96,7 @@ def test_e2e_conversions():
|
|||||||
input_path=pdf_path,
|
input_path=pdf_path,
|
||||||
doc_result=doc_result,
|
doc_result=doc_result,
|
||||||
generate=GENERATE,
|
generate=GENERATE,
|
||||||
ocr_engine=engine,
|
ocr_engine=ocr_options.kind,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user