add examples for switching OCR engine and CLI support

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-02 16:57:48 +02:00
parent 8d1c1d6dd5
commit 0b76211eed
6 changed files with 99 additions and 37 deletions

View File

@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PipelineOptions,
TesseractOcrOptions,
TesserOcrOptions,
)
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -53,6 +58,13 @@ class Backend(str, Enum):
DOCLING = "docling" DOCLING = "docling"
# Define an enum for the ocr engines
class OcrEngine(str, Enum):
    """OCR engines that can be chosen through the ``ocr_engine`` CLI option.

    Each member is also a ``str`` whose value is the engine name as it is
    typed on the command line.
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    TESSEROCR = "tesserocr"
def export_documents( def export_documents(
conv_results: Iterable[ConversionResult], conv_results: Iterable[ConversionResult],
output_dir: Path, output_dir: Path,
@ -152,6 +164,9 @@ def convert(
backend: Annotated[ backend: Annotated[
Backend, typer.Option(..., help="The PDF backend to use.") Backend, typer.Option(..., help="The PDF backend to use.")
] = Backend.DOCLING, ] = Backend.DOCLING,
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
output: Annotated[ output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.") Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."), ] = Path("."),
@ -191,8 +206,19 @@ def convert(
case _: case _:
raise RuntimeError(f"Unexpected backend type {backend}") raise RuntimeError(f"Unexpected backend type {backend}")
match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options = EasyOcrOptions()
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions()
case OcrEngine.TESSEROCR:
ocr_options = TesserOcrOptions()
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
pipeline_options = PipelineOptions( pipeline_options = PipelineOptions(
do_ocr=ocr, do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True, do_table_structure=True,
) )
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching pipeline_options.table_structure_options.do_cell_matching = do_cell_matching

View File

@ -32,11 +32,15 @@ class TesseractOcrOptions(OcrOptions):
kind: Literal["tesseract"] = "tesseract" kind: Literal["tesseract"] = "tesseract"
class TesserOcrOptions(OcrOptions):
    """OCR options selecting the ``tesserocr`` engine.

    ``kind`` is the tag used by the ``discriminator="kind"`` union in
    ``PipelineOptions.ocr_options`` to tell the OCR option models apart.
    """

    # The tag must be unique among the union members and must match its own
    # default: TesseractOcrOptions already owns the "tesseract" tag.
    kind: Literal["tesserocr"] = "tesserocr"
class PipelineOptions(BaseModel): class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field( ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field(
EasyOcrOptions(), discriminator="kind" EasyOcrOptions(), discriminator="kind"
) )

View File

@ -1,7 +1,7 @@
import logging import logging
from typing import Iterable from subprocess import PIPE, Popen
from typing import Iterable, Tuple
import numpy
import pandas as pd import pandas as pd
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
@ -10,7 +10,8 @@ from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class TesseractModel(BaseOcrModel):
class TesseractOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractOcrOptions): def __init__(self, enabled: bool, options: TesseractOcrOptions):
super().__init__(enabled=enabled, options=options) super().__init__(enabled=enabled, options=options)
@ -23,15 +24,15 @@ class TesseractModel(BaseOcrModel):
self._get_name_and_version() self._get_name_and_version()
except Exception as exc: except Exception as exc:
_log.error(f"Tesseract is not supported, aborting ...") _log.error(f"Tesseract is not available, aborting ...")
self.enabled = False self.enabled = False
def _get_name_and_version(self) -> Tuple[str, str]: def _get_name_and_version(self) -> Tuple[str, str]:
if self._name!=None and self._version!=None: if self._name != None and self._version != None:
return self._name, self._version return self._name, self._version
cmd = ['tesseract', '--version'] cmd = ["tesseract", "--version"]
proc = Popen(cmd, stdout=PIPE, stderr=PIPE) proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate() stdout, stderr = proc.communicate()
@ -40,15 +41,19 @@ class TesseractModel(BaseOcrModel):
# HACK: Windows versions of Tesseract output the version to stdout, Linux versions # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
# to stderr, so check both. # to stderr, so check both.
version_line = (stdout.decode('utf8').strip() or stderr.decode('utf8').strip()).split('\n')[0].strip() version_line = (
(stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
.split("\n")[0]
.strip()
)
# If everything else fails... # If everything else fails...
if not version_line: if not version_line:
version_line = 'tesseract XXX' version_line = "tesseract XXX"
name, version = version_line.split(' ') name, version = version_line.split(" ")
self._name = name self._name = name
self._version = version self._version = version
return name, version return name, version
@ -58,26 +63,25 @@ class TesseractModel(BaseOcrModel):
cmd = ["tesseract"] cmd = ["tesseract"]
if languages: if languages:
cmd += ['-l', '+'.join(languages)] cmd += ["-l", "+".join(languages)]
cmd += [ifilename, 'stdout', "tsv"] cmd += [ifilename, "stdout", "tsv"]
logger.info("command: {}".format(" ".join(cmd))) _log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE) proc = Popen(cmd, stdout=PIPE)
output, _ = proc.communicate() output, _ = proc.communicate()
# Read the TSV file generated by Tesseract # Read the TSV file generated by Tesseract
df = pd.read_csv('output_file_name.tsv', sep='\t') df = pd.read_csv("output_file_name.tsv", sep="\t")
# Display the dataframe (optional) # Display the dataframe (optional)
print(df.head()) print(df.head())
# Filter rows that contain actual text (ignore header or empty rows) # Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[df['text'].notnull() & (df['text'].str.strip() != '')] df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
return df_filtered return df_filtered
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
if not self.enabled: if not self.enabled:
@ -111,13 +115,13 @@ class TesseractModel(BaseOcrModel):
text = row["text"] text = row["text"]
conf = row["confidence"] conf = row["confidence"]
l = float(row['left']) l = float(row["left"])
t = float(row['top']) t = float(row["top"])
w = float(row['width']) w = float(row["width"])
h = float(row['height']) h = float(row["height"])
b = t-h b = t - h
r = l+w r = l + w
cell = OcrCell( cell = OcrCell(
id=ix, id=ix,

View File

@ -4,11 +4,13 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
PipelineOptions, PipelineOptions,
TesseractOcrOptions, TesseractOcrOptions,
TesserOcrOptions,
) )
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_model import TesseractOcrModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline from docling.pipeline.base_model_pipeline import BaseModelPipeline
@ -26,6 +28,11 @@ class StandardModelPipeline(BaseModelPipeline):
options=pipeline_options.ocr_options, options=pipeline_options.ocr_options,
) )
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
ocr_model = TesseractOcrModel(
enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options,
)
elif isinstance(pipeline_options.ocr_options, TesserOcrOptions):
raise NotImplemented() raise NotImplemented()
# TODO # TODO
# ocr_model = TesseractOcrModel( # ocr_model = TesseractOcrModel(

View File

@ -8,6 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -115,6 +116,27 @@ def main():
# pdf_backend=DoclingParseDocumentBackend, # pdf_backend=DoclingParseDocumentBackend,
# ) # )
# Docling Parse with Tesseract OCR
# ----------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions()
# Docling Parse with TesserOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesserOcrOptions()
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
########################################################################### ###########################################################################
# Define input files # Define input files

View File

@ -12,8 +12,7 @@ GENERATE = False
# Debug # Debug
def save_output(pdf_path: Path, doc_result: ConversionResult): def save_output(pdf_path: Path, doc_result: ConversionResult):
r""" r""" """
"""
import json import json
import os import os