add examples for switching OCR engine and CLI support

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-02 16:57:48 +02:00
parent 8d1c1d6dd5
commit 0b76211eed
6 changed files with 99 additions and 37 deletions

View File

@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PipelineOptions,
TesseractOcrOptions,
TesserOcrOptions,
)
from docling.document_converter import DocumentConverter
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -53,6 +58,13 @@ class Backend(str, Enum):
DOCLING = "docling"
# Enumeration of the OCR engines that can be passed as the `ocr_engine`
# option of `convert`. Members are `str`-valued, so each one compares equal
# to its plain string name.
class OcrEngine(str, Enum):
EASYOCR = "easyocr"  # convert() maps this to EasyOcrOptions
TESSERACT = "tesseract"  # convert() maps this to TesseractOcrOptions
TESSEROCR = "tesserocr"  # convert() maps this to TesserOcrOptions
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
@ -152,6 +164,9 @@ def convert(
backend: Annotated[
Backend, typer.Option(..., help="The PDF backend to use.")
] = Backend.DOCLING,
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
@ -191,8 +206,19 @@ def convert(
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options = EasyOcrOptions()
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions()
case OcrEngine.TESSEROCR:
ocr_options = TesserOcrOptions()
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
pipeline_options = PipelineOptions(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching

View File

@ -32,11 +32,15 @@ class TesseractOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract"
class TesserOcrOptions(OcrOptions):
    """OCR options tagged with kind ``"tesserocr"``."""

    # `kind` is the discriminator of PipelineOptions.ocr_options, so its
    # Literal must match the default value and must differ from the
    # "tesseract" tag already used by TesseractOcrOptions.
    kind: Literal["tesserocr"] = "tesserocr"
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field(
ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field(
EasyOcrOptions(), discriminator="kind"
)

View File

@ -1,7 +1,7 @@
import logging
from typing import Iterable
from subprocess import PIPE, Popen
from typing import Iterable, Tuple
import numpy
import pandas as pd
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
@ -10,7 +10,8 @@ from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__)
class TesseractModel(BaseOcrModel):
class TesseractOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractOcrOptions):
super().__init__(enabled=enabled, options=options)
@ -23,15 +24,15 @@ class TesseractModel(BaseOcrModel):
self._get_name_and_version()
except Exception as exc:
_log.error(f"Tesseract is not supported, aborting ...")
_log.error(f"Tesseract is not available, aborting ...")
self.enabled = False
def _get_name_and_version(self) -> Tuple[str, str]:
if self._name!=None and self._version!=None:
if self._name != None and self._version != None:
return self._name, self._version
cmd = ['tesseract', '--version']
cmd = ["tesseract", "--version"]
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()
@ -40,13 +41,17 @@ class TesseractModel(BaseOcrModel):
# HACK: Windows versions of Tesseract output the version to stdout, Linux versions
# to stderr, so check both.
version_line = (stdout.decode('utf8').strip() or stderr.decode('utf8').strip()).split('\n')[0].strip()
version_line = (
(stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
.split("\n")[0]
.strip()
)
# If everything else fails...
if not version_line:
version_line = 'tesseract XXX'
version_line = "tesseract XXX"
name, version = version_line.split(' ')
name, version = version_line.split(" ")
self._name = name
self._version = version
@ -58,26 +63,25 @@ class TesseractModel(BaseOcrModel):
cmd = ["tesseract"]
if languages:
cmd += ['-l', '+'.join(languages)]
cmd += ["-l", "+".join(languages)]
cmd += [ifilename, 'stdout', "tsv"]
logger.info("command: {}".format(" ".join(cmd)))
cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE)
output, _ = proc.communicate()
# Read the TSV file generated by Tesseract
df = pd.read_csv('output_file_name.tsv', sep='\t')
df = pd.read_csv("output_file_name.tsv", sep="\t")
# Display the dataframe (optional)
print(df.head())
# Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[df['text'].notnull() & (df['text'].str.strip() != '')]
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
return df_filtered
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
if not self.enabled:
@ -111,13 +115,13 @@ class TesseractModel(BaseOcrModel):
text = row["text"]
conf = row["confidence"]
l = float(row['left'])
t = float(row['top'])
w = float(row['width'])
h = float(row['height'])
l = float(row["left"])
t = float(row["top"])
w = float(row["width"])
h = float(row["height"])
b = t-h
r = l+w
b = t - h
r = l + w
cell = OcrCell(
id=ix,

View File

@ -4,11 +4,13 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PipelineOptions,
TesseractOcrOptions,
TesserOcrOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_model import TesseractOcrModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
@ -26,6 +28,11 @@ class StandardModelPipeline(BaseModelPipeline):
options=pipeline_options.ocr_options,
)
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
ocr_model = TesseractOcrModel(
enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options,
)
elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
raise NotImplemented()
# TODO
# ocr_model = TesseractOcrModel(

View File

@ -8,6 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
@ -115,6 +116,27 @@ def main():
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract OCR
# ----------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions()
# Docling Parse with TesserOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesserOcrOptions()
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
###########################################################################
# Define input files

View File

@ -12,8 +12,7 @@ GENERATE = False
# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult):
r"""
"""
r""" """
import json
import os