From 70a8a2cc82e0cf4b9d7bd77e686544d2c16d6262 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Tue, 8 Oct 2024 14:44:23 +0200 Subject: [PATCH] chore(OCR): Rename class names to use Tesseract for the tesserocr and TesseractCLI for the tesseract process Signed-off-by: Nikos Livathinos --- docling/cli/main.py | 12 +- docling/datamodel/pipeline_options.py | 6 +- docling/models/tesseract_cli_model.py | 167 ++++++++++++++++++ docling/models/tesseract_model.py | 185 ++++++++------------ docling/models/tesserocr_model.py | 122 ------------- docling/pipeline/standard_model_pipeline.py | 16 +- examples/custom_convert.py | 6 +- tests/test_e2e_ocr_conversion.py | 8 +- 8 files changed, 261 insertions(+), 261 deletions(-) create mode 100644 docling/models/tesseract_cli_model.py delete mode 100644 docling/models/tesserocr_model.py diff --git a/docling/cli/main.py b/docling/cli/main.py index 6459b414..41f9440e 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -17,8 +17,8 @@ from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.pipeline_options import ( EasyOcrOptions, PipelineOptions, - TesseractOcrOptions, - TesserOcrOptions, + TesseractCLIOptions, + TesseractOptions, ) from docling.document_converter import DocumentConverter @@ -61,8 +61,8 @@ class Backend(str, Enum): # Define an enum for the ocr engines class OcrEngine(str, Enum): EASYOCR = "easyocr" + TESSERACT_CLI = "tesseract_cli" TESSERACT = "tesseract" - TESSEROCR = "tesserocr" def export_documents( @@ -209,10 +209,10 @@ def convert( match ocr_engine: case OcrEngine.EASYOCR: ocr_options = EasyOcrOptions() + case OcrEngine.TESSERACT_CLI: + ocr_options = TesseractCLIOptions() case OcrEngine.TESSERACT: - ocr_options = TesseractOcrOptions() - case OcrEngine.TESSEROCR: - ocr_options = TesserOcrOptions() + ocr_options = TesseractOptions() case _: raise RuntimeError(f"Unexpected backend type {backend}") diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index c3c81c3e..41e56297 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -36,7 +36,7 @@ class EasyOcrOptions(OcrOptions): ) -class TesseractOcrOptions(OcrOptions): +class TesseractCLIOptions(OcrOptions): kind: Literal["tesseract"] = "tesseract" lang: List[str] = ["fra", "deu", "spa", "eng"] tesseract_cmd: str = "tesseract" @@ -47,7 +47,7 @@ class TesseractOcrOptions(OcrOptions): ) -class TesserOcrOptions(OcrOptions): +class TesseractOptions(OcrOptions): kind: Literal["tesserocr"] = "tesserocr" lang: List[str] = ["fra", "deu", "spa", "eng"] path: Optional[str] = None @@ -62,6 +62,6 @@ class PipelineOptions(BaseModel): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: Union[EasyOcrOptions, TesseractOcrOptions, TesserOcrOptions] = Field( + ocr_options: Union[EasyOcrOptions, TesseractCLIOptions, TesseractOptions] = Field( EasyOcrOptions(), discriminator="kind" ) diff --git a/docling/models/tesseract_cli_model.py b/docling/models/tesseract_cli_model.py new file mode 100644 index 00000000..0a23be97 --- /dev/null +++ b/docling/models/tesseract_cli_model.py @@ -0,0 +1,167 @@ +import io +import logging +import tempfile +from subprocess import PIPE, Popen +from typing import Iterable, Tuple + +import pandas as pd + +from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.pipeline_options import TesseractCLIOptions +from docling.models.base_ocr_model import BaseOcrModel + +_log = logging.getLogger(__name__) + + +class TesseractCLIModel(BaseOcrModel): + + def __init__(self, enabled: bool, options: TesseractCLIOptions): + super().__init__(enabled=enabled, options=options) + self.options: TesseractCLIOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. + + self._name = None + self._version = None + + if self.enabled: + try: + self._get_name_and_version() + + except Exception as exc: + raise RuntimeError( + f"Tesseract is not available, aborting: {exc} " + "Install tesseract on your system and the tesseract binary is discoverable. " + "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) + + def _get_name_and_version(self) -> Tuple[str, str]: + + if self._name != None and self._version != None: + return self._name, self._version + + cmd = [self.options.tesseract_cmd, "--version"] + + proc = Popen(cmd, stdout=PIPE, stderr=PIPE) + stdout, stderr = proc.communicate() + + proc.wait() + + # HACK: Windows versions of Tesseract output the version to stdout, Linux versions + # to stderr, so check both. + version_line = ( + (stdout.decode("utf8").strip() or stderr.decode("utf8").strip()) + .split("\n")[0] + .strip() + ) + + # If everything else fails... + if not version_line: + version_line = "tesseract XXX" + + name, version = version_line.split(" ") + + self._name = name + self._version = version + + return name, version + + def _run_tesseract(self, ifilename: str): + + cmd = [self.options.tesseract_cmd] + + if self.options.lang is not None and len(self.options.lang) > 0: + cmd.append("-l") + cmd.append("+".join(self.options.lang)) + if self.options.path is not None: + cmd.append("--tessdata-dir") + cmd.append(self.options.path) + + cmd += [ifilename, "stdout", "tsv"] + _log.info("command: {}".format(" ".join(cmd))) + + proc = Popen(cmd, stdout=PIPE) + output, _ = proc.communicate() + + # _log.info(output) + + # Decode the byte string to a regular string + decoded_data = output.decode("utf-8") + # _log.info(decoded_data) + + # Read the TSV file generated by Tesseract + df = pd.read_csv(io.StringIO(decoded_data), sep="\t") + + # Display the dataframe (optional) + # _log.info("df: ", df.head()) + + # Filter rows that contain actual text (ignore header or empty rows) + df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] + + return df_filtered + + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: + fname = image_file.name + high_res_image.save(fname) + + df = self._run_tesseract(fname) + + # _log.info(df) + + # Print relevant columns (bounding box and text) + for ix, row in df.iterrows(): + text = row["text"] + conf = row["conf"] + + l = float(row["left"]) + b = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + t = b + h + r = l + w + + cell = OcrCell( + id=ix, + text=text, + confidence=conf / 100.0, + bbox=BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + all_ocr_cells.append(cell) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + + page.cells.extend(filtered_ocr_cells) + + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) + + yield page diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py index 94da2779..cafb39d6 100644 --- a/docling/models/tesseract_model.py +++ b/docling/models/tesseract_model.py @@ -1,105 +1,65 @@ -import io import logging -import tempfile -from subprocess import PIPE, Popen -from typing import Iterable, Tuple +from typing import Iterable -import pandas as pd +import numpy from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page -from docling.datamodel.pipeline_options import TesseractOcrOptions +from docling.datamodel.pipeline_options import TesseractCLIOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) -class TesseractOcrModel(BaseOcrModel): - - def __init__(self, enabled: bool, options: TesseractOcrOptions): +class TesseractModel(BaseOcrModel): + def __init__(self, enabled: bool, options: TesseractCLIOptions): super().__init__(enabled=enabled, options=options) - self.options: TesseractOcrOptions + self.options: TesseractCLIOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. - - self._name = None - self._version = None + self.reader = None if self.enabled: + setup_errmsg = ( + "tesserocr is not correctly installed. " + "Please install it via `pip install tesserocr` to use this OCR engine. " + "Note that tesserocr might have to be manually compiled for working with" + "your Tesseract installation. The Docling documentation provides examples for it. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) try: - self._get_name_and_version() + import tesserocr + except ImportError: + raise ImportError(setup_errmsg) - except Exception as exc: - raise RuntimeError( - f"Tesseract is not available, aborting: {exc} " - "Install tesseract on your system and the tesseract binary is discoverable. " - "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. " - "Alternatively, Docling has support for other OCR engines. See the documentation." + try: + tesseract_version = tesserocr.tesseract_version() + _log.debug("Initializing TesserOCR: %s", tesseract_version) + except: + raise ImportError(setup_errmsg) + + # Initialize the tesseractAPI + lang = "+".join(self.options.lang) + if self.options.path is not None: + self.reader = tesserocr.PyTessBaseAPI( + path=self.options.path, + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, ) + else: + self.reader = tesserocr.PyTessBaseAPI( + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + self.reader_RIL = tesserocr.RIL - def _get_name_and_version(self) -> Tuple[str, str]: - - if self._name != None and self._version != None: - return self._name, self._version - - cmd = [self.options.tesseract_cmd, "--version"] - - proc = Popen(cmd, stdout=PIPE, stderr=PIPE) - stdout, stderr = proc.communicate() - - proc.wait() - - # HACK: Windows versions of Tesseract output the version to stdout, Linux versions - # to stderr, so check both. - version_line = ( - (stdout.decode("utf8").strip() or stderr.decode("utf8").strip()) - .split("\n")[0] - .strip() - ) - - # If everything else fails... - if not version_line: - version_line = "tesseract XXX" - - name, version = version_line.split(" ") - - self._name = name - self._version = version - - return name, version - - def _run_tesseract(self, ifilename: str): - - cmd = [self.options.tesseract_cmd] - - if self.options.lang is not None and len(self.options.lang) > 0: - cmd.append("-l") - cmd.append("+".join(self.options.lang)) - if self.options.path is not None: - cmd.append("--tessdata-dir") - cmd.append(self.options.path) - - cmd += [ifilename, "stdout", "tsv"] - _log.info("command: {}".format(" ".join(cmd))) - - proc = Popen(cmd, stdout=PIPE) - output, _ = proc.communicate() - - # _log.info(output) - - # Decode the byte string to a regular string - decoded_data = output.decode("utf-8") - # _log.info(decoded_data) - - # Read the TSV file generated by Tesseract - df = pd.read_csv(io.StringIO(decoded_data), sep="\t") - - # Display the dataframe (optional) - # _log.info("df: ", df.head()) - - # Filter rows that contain actual text (ignore header or empty rows) - df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] - - return df_filtered + def __del__(self): + if self.reader is not None: + # Finalize the tesseractAPI + self.reader.End() def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -119,42 +79,37 @@ class TesseractOcrModel(BaseOcrModel): scale=self.scale, cropbox=ocr_rect ) - with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: - fname = image_file.name - high_res_image.save(fname) + # Retrieve text snippets with their bounding boxes + self.reader.SetImage(high_res_image) + boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True) - df = self._run_tesseract(fname) + cells = [] + for ix, (im, box, _, _) in enumerate(boxes): + # Set the area of interest. Tesseract uses Bottom-Left for the origin + self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) - # _log.info(df) + # Extract text within the bounding box + text = self.reader.GetUTF8Text().strip() + confidence = self.reader.MeanTextConf() + left = box["x"] / self.scale + bottom = box["y"] / self.scale + right = (box["x"] + box["w"]) / self.scale + top = (box["y"] + box["h"]) / self.scale - # Print relevant columns (bounding box and text) - for ix, row in df.iterrows(): - text = row["text"] - conf = row["conf"] - - l = float(row["left"]) - b = float(row["top"]) - w = float(row["width"]) - h = float(row["height"]) - - t = b + h - r = l + w - - cell = OcrCell( - id=ix, - text=text, - confidence=conf / 100.0, - bbox=BoundingBox.from_tuple( - coord=( - (l / self.scale) + ocr_rect.l, - (b / self.scale) + ocr_rect.t, - (r / self.scale) + ocr_rect.l, - (t / self.scale) + ocr_rect.t, + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, ), - origin=CoordOrigin.TOPLEFT, - ), + ) ) - all_ocr_cells.append(cell) + + # del high_res_image + all_ocr_cells.extend(cells) ## Remove OCR cells which overlap with programmatic cells. filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py deleted file mode 100644 index f748abb3..00000000 --- a/docling/models/tesserocr_model.py +++ /dev/null @@ -1,122 +0,0 @@ -import logging -from typing import Iterable - -import numpy - -from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page -from docling.datamodel.pipeline_options import TesseractOcrOptions -from docling.models.base_ocr_model import BaseOcrModel - -_log = logging.getLogger(__name__) - - -class TesserOcrModel(BaseOcrModel): - def __init__(self, enabled: bool, options: TesseractOcrOptions): - super().__init__(enabled=enabled, options=options) - self.options: TesseractOcrOptions - - self.scale = 3 # multiplier for 72 dpi == 216 dpi. - self.reader = None - - if self.enabled: - setup_errmsg = ( - "tesserocr is not correctly installed. " - "Please install it via `pip install tesserocr` to use this OCR engine. " - "Note that tesserocr might have to be manually compiled for working with" - "your Tesseract installation. The Docling documentation provides examples for it. " - "Alternatively, Docling has support for other OCR engines. See the documentation." - ) - try: - import tesserocr - except ImportError: - raise ImportError(setup_errmsg) - - try: - tesseract_version = tesserocr.tesseract_version() - _log.debug("Initializing TesserOCR: %s", tesseract_version) - except: - raise ImportError(setup_errmsg) - - # Initialize the tesseractAPI - lang = "+".join(self.options.lang) - if self.options.path is not None: - self.reader = tesserocr.PyTessBaseAPI( - path=self.options.path, - lang=lang, - psm=tesserocr.PSM.AUTO, - init=True, - oem=tesserocr.OEM.DEFAULT, - ) - else: - self.reader = tesserocr.PyTessBaseAPI( - lang=lang, - psm=tesserocr.PSM.AUTO, - init=True, - oem=tesserocr.OEM.DEFAULT, - ) - self.reader_RIL = tesserocr.RIL - - def __del__(self): - if self.reader is not None: - # Finalize the tesseractAPI - self.reader.End() - - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: - - if not self.enabled: - yield from page_batch - return - - for page in page_batch: - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - - # Retrieve text snippets with their bounding boxes - self.reader.SetImage(high_res_image) - boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True) - - cells = [] - for ix, (im, box, _, _) in enumerate(boxes): - # Set the area of interest. Tesseract uses Bottom-Left for the origin - self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) - - # Extract text within the bounding box - text = self.reader.GetUTF8Text().strip() - confidence = self.reader.MeanTextConf() - left = box["x"] / self.scale - bottom = box["y"] / self.scale - right = (box["x"] + box["w"]) / self.scale - top = (box["y"] + box["h"]) / self.scale - - cells.append( - OcrCell( - id=ix, - text=text, - confidence=confidence, - bbox=BoundingBox.from_tuple( - coord=(left, top, right, bottom), - origin=CoordOrigin.TOPLEFT, - ), - ) - ) - - # del high_res_image - all_ocr_cells.extend(cells) - - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) - - page.cells.extend(filtered_ocr_cells) - - # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) - - yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index 54eb9790..c1ef179d 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -3,15 +3,15 @@ from pathlib import Path from docling.datamodel.pipeline_options import ( EasyOcrOptions, PipelineOptions, - TesseractOcrOptions, - TesserOcrOptions, + TesseractCLIOptions, + TesseractOptions, ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel -from docling.models.tesseract_model import TesseractOcrModel -from docling.models.tesserocr_model import TesserOcrModel +from docling.models.tesseract_cli_model import TesseractCLIModel +from docling.models.tesseract_model import TesseractModel from docling.pipeline.base_model_pipeline import BaseModelPipeline @@ -28,13 +28,13 @@ class StandardModelPipeline(BaseModelPipeline): enabled=pipeline_options.do_ocr, options=pipeline_options.ocr_options, ) - elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): - ocr_model = TesseractOcrModel( + elif isinstance(pipeline_options.ocr_options, TesseractCLIOptions): + ocr_model = TesseractCLIModel( enabled=pipeline_options.do_ocr, options=pipeline_options.ocr_options, ) - elif isinstance(pipeline_options.ocr_options, TesserOcrOptions): - ocr_model = TesserOcrModel( + elif isinstance(pipeline_options.ocr_options, TesseractOptions): + ocr_model = TesseractModel( enabled=pipeline_options.do_ocr, options=pipeline_options.ocr_options, ) diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 4971abb2..17c78f02 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -8,7 +8,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import TesseractOcrOptions, TesserOcrOptions +from docling.datamodel.pipeline_options import TesseractCLIOptions, TesseractOptions from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -126,7 +126,7 @@ def main(): pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.ocr_options = TesserOcrOptions() + pipeline_options.ocr_options = TesseractOptions() # Docling Parse with Tesseract CLI # ---------------------- @@ -134,7 +134,7 @@ def main(): pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.ocr_options = TesseractOcrOptions() + pipeline_options.ocr_options = TesseractCLIOptions() doc_converter = DocumentConverter( pipeline_options=pipeline_options, diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index c7a1147d..c875963f 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -7,8 +7,8 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, OcrOptions, PipelineOptions, - TesseractOcrOptions, - TesserOcrOptions, + TesseractCLIOptions, + TesseractOptions, ) from docling.document_converter import DocumentConverter @@ -74,8 +74,8 @@ def test_e2e_conversions(): engines: List[OcrOptions] = [ EasyOcrOptions(), - TesserOcrOptions(), - TesseractOcrOptions(), + TesseractOptions(), + TesseractCLIOptions(), ] for ocr_options in engines: