From 0b76211eedafa354c3cbb67ee10aeedca0d72319 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Wed, 2 Oct 2024 16:57:48 +0200 Subject: [PATCH] add examples for switching OCR engine and CLI support Signed-off-by: Michele Dolfi --- docling/cli/main.py | 28 ++++++++- docling/datamodel/pipeline_options.py | 6 +- docling/models/tesseract_model.py | 70 +++++++++++---------- docling/pipeline/standard_model_pipeline.py | 7 +++ examples/custom_convert.py | 22 +++++++ tests/test_e2e_ocr_conversion.py | 3 +- 6 files changed, 99 insertions(+), 37 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index b942d519..6459b414 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PipelineOptions, + TesseractOcrOptions, + TesserOcrOptions, +) from docling.document_converter import DocumentConverter warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") @@ -53,6 +58,13 @@ class Backend(str, Enum): DOCLING = "docling" +# Define an enum for the ocr engines +class OcrEngine(str, Enum): + EASYOCR = "easyocr" + TESSERACT = "tesseract" + TESSEROCR = "tesserocr" + + def export_documents( conv_results: Iterable[ConversionResult], output_dir: Path, @@ -152,6 +164,9 @@ def convert( backend: Annotated[ Backend, typer.Option(..., help="The PDF backend to use.") ] = Backend.DOCLING, + ocr_engine: Annotated[ + OcrEngine, typer.Option(..., help="The OCR engine to use.") + ] = OcrEngine.EASYOCR, output: Annotated[ Path, typer.Option(..., help="Output directory where results are saved.") ] = Path("."), @@ -191,8
+206,19 @@ def convert( case _: raise RuntimeError(f"Unexpected backend type {backend}") + match ocr_engine: + case OcrEngine.EASYOCR: + ocr_options = EasyOcrOptions() + case OcrEngine.TESSERACT: + ocr_options = TesseractOcrOptions() + case OcrEngine.TESSEROCR: + ocr_options = TesserOcrOptions() + case _: + raise RuntimeError(f"Unexpected backend type {backend}") + pipeline_options = PipelineOptions( do_ocr=ocr, + ocr_options=ocr_options, do_table_structure=True, ) pipeline_options.table_structure_options.do_cell_matching = do_cell_matching diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 6742c412..bc30634d 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -32,11 +32,15 @@ class TesseractOcrOptions(OcrOptions): kind: Literal["tesseract"] = "tesseract" +class TesserOcrOptions(OcrOptions): + kind: Literal["tesseract"] = "tesserocr" + + class PipelineOptions(BaseModel): do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: Union[EasyOcrOptions, TesseractOcrOptions] = Field( + ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field( EasyOcrOptions(), discriminator="kind" ) diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 59ec9504..2f1fd4ee 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -1,7 +1,7 @@ import logging -from typing import Iterable +from subprocess import PIPE, Popen +from typing import Iterable, Tuple -import numpy import pandas as pd from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page @@ -10,7 +10,8 @@ from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) -class TesseractModel(BaseOcrModel): + +class 
TesseractOcrModel(BaseOcrModel): def __init__(self, enabled: bool, options: TesseractOcrOptions): super().__init__(enabled=enabled, options=options) @@ -21,34 +22,38 @@ class TesseractModel(BaseOcrModel): if self.enabled: try: self._get_name_and_version() - + except Exception as exc: - _log.error(f"Tesseract is not supported, aborting ...") + _log.error(f"Tesseract is not available, aborting ...") self.enabled = False - + def _get_name_and_version(self) -> Tuple[str, str]: - if self._name!=None and self._version!=None: + if self._name != None and self._version != None: return self._name, self._version - cmd = ['tesseract', '--version'] + cmd = ["tesseract", "--version"] proc = Popen(cmd, stdout=PIPE, stderr=PIPE) stdout, stderr = proc.communicate() proc.wait() - # HACK: Windows versions of Tesseract output the version to stdout, Linux versions + # HACK: Windows versions of Tesseract output the version to stdout, Linux versions # to stderr, so check both. - version_line = (stdout.decode('utf8').strip() or stderr.decode('utf8').strip()).split('\n')[0].strip() + version_line = ( + (stdout.decode("utf8").strip() or stderr.decode("utf8").strip()) + .split("\n")[0] + .strip() + ) # If everything else fails... 
if not version_line: - version_line = 'tesseract XXX' + version_line = "tesseract XXX" - name, version = version_line.split(' ') + name, version = version_line.split(" ") - self._name = name + self._name = name self._version = version return name, version @@ -58,26 +63,25 @@ class TesseractModel(BaseOcrModel): cmd = ["tesseract"] if languages: - cmd += ['-l', '+'.join(languages)] + cmd += ["-l", "+".join(languages)] - cmd += [ifilename, 'stdout', "tsv"] - logger.info("command: {}".format(" ".join(cmd))) + cmd += [ifilename, "stdout", "tsv"] + _log.info("command: {}".format(" ".join(cmd))) proc = Popen(cmd, stdout=PIPE) output, _ = proc.communicate() # Read the TSV file generated by Tesseract - df = pd.read_csv('output_file_name.tsv', sep='\t') + df = pd.read_csv("output_file_name.tsv", sep="\t") # Display the dataframe (optional) print(df.head()) - + # Filter rows that contain actual text (ignore header or empty rows) - df_filtered = df[df['text'].notnull() & (df['text'].str.strip() != '')] + df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] return df_filtered - - + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: if not self.enabled: @@ -93,7 +97,7 @@ class TesseractModel(BaseOcrModel): scale=self.scale, cropbox=ocr_rect ) print(high_res_image) - + # FIXME: do we really need to save the image to a file fname = "temporary-file.png" high_res_image.save(fname) @@ -103,22 +107,22 @@ class TesseractModel(BaseOcrModel): os.remove(fname) else: _log.error(f"no image file: {fname}") - + # Print relevant columns (bounding box and text) for index, row in df_filtered.iterrows(): print(row) - + text = row["text"] conf = row["confidence"] - - l = float(row['left']) - t = float(row['top']) - w = float(row['width']) - h = float(row['height']) - b = t-h - r = l+w - + l = float(row["left"]) + t = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + b = t - h + r = l + w + cell = OcrCell( id=ix, text=text, @@ -134,7 +138,7 
@@ class TesseractModel(BaseOcrModel): ), ) all_ocr_cells.append(cell) - + ## Remove OCR cells which overlap with programmatic cells. filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index 4f3d0214..df64fb8a 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -4,11 +4,13 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, PipelineOptions, TesseractOcrOptions, + TesserOcrOptions, ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel +from docling.models.tesseract_model import TesseractOcrModel from docling.pipeline.base_model_pipeline import BaseModelPipeline @@ -26,6 +28,11 @@ class StandardModelPipeline(BaseModelPipeline): options=pipeline_options.ocr_options, ) elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): + ocr_model = TesseractOcrModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + elif isinstance(pipeline_options.ocr_options, TesserOcrOptions): raise NotImplemented() # TODO # ocr_model = TesseractOcrModel( diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 9c84d8dd..70b07dd2 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -8,6 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions from docling.document_converter import DocumentConverter _log = 
logging.getLogger(__name__) @@ -115,6 +116,27 @@ def main(): # pdf_backend=DoclingParseDocumentBackend, # ) + # Docling Parse with Tesseract OCR + # ---------------------- + pipeline_options = PipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + pipeline_options.ocr_options = TesseractOcrOptions() + + # Docling Parse with TesserOCR + # ---------------------- + # pipeline_options = PipelineOptions() + # pipeline_options.do_ocr=True + # pipeline_options.do_table_structure=True + # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.ocr_options = TesserOcrOptions() + + doc_converter = DocumentConverter( + pipeline_options=pipeline_options, + pdf_backend=DoclingParseDocumentBackend, + ) + ########################################################################### # Define input files diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 870e270b..aea55651 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -12,8 +12,7 @@ GENERATE = False # Debug def save_output(pdf_path: Path, doc_result: ConversionResult): - r""" - """ + r""" """ import json import os