feat: tesseract and tesserocr models. WIP.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-12-09 05:08:14 +00:00 · 2024-10-02 13:30:27 +02:00
parent 455d6ff70f
commit c211808742
3 changed files with 224 additions and 0 deletions
--- a/docling/models/tesseract_model.py
+++ b/docling/models/tesseract_model.py
@@ -0,0 +1,72 @@
+import logging
+from typing import Iterable
+
+import numpy
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractModel(BaseOcrModel):
+    """OCR model that recognises text with Tesseract via ``tesserocr``.
+
+    For every page, the regions returned by ``get_ocr_rects`` are rendered at
+    ``self.scale`` times the base resolution, split into text lines by
+    Tesseract and turned into ``OcrCell`` objects. The cells are passed through
+    ``filter_ocr_cells`` together with the page's existing cells and whatever
+    is kept gets appended to ``page.cells``.
+    """
+
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        """Set up the model and, when enabled, the Tesseract API handle.
+
+        Args:
+            enabled: When False the model passes pages through untouched and
+                ``tesserocr`` is never imported.
+            options: OCR options; ``options.lang`` selects the language(s).
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+
+        if self.enabled:
+            # Imported lazily so the dependency is only needed when OCR is on.
+            import tesserocr
+
+            # Tesseract takes a single "+"-joined string (e.g. "eng+deu").
+            # NOTE(review): assumes options.lang holds Tesseract language
+            # codes, either as one string or as an iterable of strings.
+            lang = self.options.lang
+            if not isinstance(lang, str):
+                lang = "+".join(lang)
+
+            self.reader = tesserocr.PyTessBaseAPI(lang=lang)
+            self._line_level = tesserocr.RIL.TEXTLINE
+
+    def __del__(self):
+        """Release the native Tesseract handle, if one was created."""
+        # getattr guards against __init__ having failed before the assignment.
+        reader = getattr(self, "reader", None)
+        if reader is not None:
+            reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Run OCR on each page of the batch and yield the pages in order.
+
+        Args:
+            page_batch: Pages to process.
+
+        Yields:
+            Each input page; when the model is enabled, ``page.cells`` has been
+            extended with the OCR cells kept by ``filter_ocr_cells``.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # NOTE(review): SetImage needs a PIL image; assumes that is
+                # what the backend's get_page_image returns.
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                self.reader.SetImage(high_res_image)
+
+                # One (image, box, block_id, paragraph_id) tuple per text line;
+                # box holds x/y/w/h in pixels of the rendered crop.
+                lines = self.reader.GetComponentImages(self._line_level, True)
+
+                cells = []
+                for ix, (_, box, _, _) in enumerate(lines):
+                    # Restrict recognition to this line before reading it out.
+                    self.reader.SetRectangle(
+                        box["x"], box["y"], box["w"], box["h"]
+                    )
+                    text = self.reader.GetUTF8Text().strip()
+                    # MeanTextConf is 0-100; scale it to 0-1.
+                    # NOTE(review): confirm the range OcrCell.confidence expects.
+                    confidence = self.reader.MeanTextConf() / 100.0
+
+                    # Undo the render scale to get back to crop coordinates.
+                    left = box["x"] / self.scale
+                    top = box["y"] / self.scale
+                    right = (box["x"] + box["w"]) / self.scale
+                    bottom = (box["y"] + box["h"]) / self.scale
+
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                # Shift by the crop's own offset on the page.
+                                coord=(
+                                    left + ocr_rect.l,
+                                    top + ocr_rect.t,
+                                    right + ocr_rect.l,
+                                    bottom + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+                del high_res_image
+                all_ocr_cells.extend(cells)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
--- a/docling/models/tesserocr_model.py
+++ b/docling/models/tesserocr_model.py
@@ -0,0 +1,72 @@
+import logging
+from typing import Iterable
+
+import numpy
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesserOcrModel(BaseOcrModel):
+    """OCR model that recognises text with Tesseract via ``tesserocr``.
+
+    For every page, the regions returned by ``get_ocr_rects`` are rendered at
+    ``self.scale`` times the base resolution, split into text lines by
+    Tesseract and turned into ``OcrCell`` objects. The cells are passed through
+    ``filter_ocr_cells`` together with the page's existing cells and whatever
+    is kept gets appended to ``page.cells``.
+    """
+
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        """Set up the model and, when enabled, the Tesseract API handle.
+
+        Args:
+            enabled: When False the model passes pages through untouched and
+                ``tesserocr`` is never imported.
+            options: OCR options; ``options.lang`` selects the language(s).
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+
+        if self.enabled:
+            # Imported lazily so the dependency is only needed when OCR is on.
+            import tesserocr
+
+            # Tesseract takes a single "+"-joined string (e.g. "eng+deu").
+            # NOTE(review): assumes options.lang holds Tesseract language
+            # codes, either as one string or as an iterable of strings.
+            lang = self.options.lang
+            if not isinstance(lang, str):
+                lang = "+".join(lang)
+
+            self.reader = tesserocr.PyTessBaseAPI(lang=lang)
+            self._line_level = tesserocr.RIL.TEXTLINE
+
+    def __del__(self):
+        """Release the native Tesseract handle, if one was created."""
+        # getattr guards against __init__ having failed before the assignment.
+        reader = getattr(self, "reader", None)
+        if reader is not None:
+            reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Run OCR on each page of the batch and yield the pages in order.
+
+        Args:
+            page_batch: Pages to process.
+
+        Yields:
+            Each input page; when the model is enabled, ``page.cells`` has been
+            extended with the OCR cells kept by ``filter_ocr_cells``.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # NOTE(review): SetImage needs a PIL image; assumes that is
+                # what the backend's get_page_image returns.
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                self.reader.SetImage(high_res_image)
+
+                # One (image, box, block_id, paragraph_id) tuple per text line;
+                # box holds x/y/w/h in pixels of the rendered crop.
+                lines = self.reader.GetComponentImages(self._line_level, True)
+
+                cells = []
+                for ix, (_, box, _, _) in enumerate(lines):
+                    # Restrict recognition to this line before reading it out.
+                    self.reader.SetRectangle(
+                        box["x"], box["y"], box["w"], box["h"]
+                    )
+                    text = self.reader.GetUTF8Text().strip()
+                    # MeanTextConf is 0-100; scale it to 0-1.
+                    # NOTE(review): confirm the range OcrCell.confidence expects.
+                    confidence = self.reader.MeanTextConf() / 100.0
+
+                    # Undo the render scale to get back to crop coordinates.
+                    left = box["x"] / self.scale
+                    top = box["y"] / self.scale
+                    right = (box["x"] + box["w"]) / self.scale
+                    bottom = (box["y"] + box["h"]) / self.scale
+
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                # Shift by the crop's own offset on the page.
+                                coord=(
+                                    left + ocr_rect.l,
+                                    top + ocr_rect.t,
+                                    right + ocr_rect.l,
+                                    bottom + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+                del high_res_image
+                all_ocr_cells.extend(cells)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -0,0 +1,80 @@
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.document_converter import DocumentConverter
+
+from .verify_utils import verify_conversion_result
+
+GENERATE = False
+
+
+# Debug
+def save_output(pdf_path: Path, doc_result: ConversionResult):
+    """Write the rendered forms of a conversion result next to the source PDF.
+
+    Four files are created in ``pdf_path.parent``, each named after the PDF's
+    stem: ``<stem>.json`` (``render_as_dict``), ``<stem>.pages.json`` (one
+    ``model_dump`` per page), ``<stem>.doctags.txt`` (``render_as_doctags``)
+    and ``<stem>.md`` (``render_as_markdown``).
+
+    Args:
+        pdf_path: Path of the converted PDF; decides where outputs are written.
+        doc_result: Conversion result to serialise.
+    """
+    import json
+
+    out_dir = pdf_path.parent
+    stem = pdf_path.stem
+
+    # UTF-8 is set explicitly on every file so the output does not depend on
+    # the platform's default encoding when the rendered text is non-ASCII.
+    with open(out_dir / f"{stem}.json", "w", encoding="utf-8") as fd:
+        json.dump(doc_result.render_as_dict(), fd)
+
+    pages = [p.model_dump() for p in doc_result.pages]
+    with open(out_dir / f"{stem}.pages.json", "w", encoding="utf-8") as fd:
+        json.dump(pages, fd)
+
+    with open(out_dir / f"{stem}.doctags.txt", "w", encoding="utf-8") as fd:
+        fd.write(doc_result.render_as_doctags())
+
+    with open(out_dir / f"{stem}.md", "w", encoding="utf-8") as fd:
+        fd.write(doc_result.render_as_markdown())
+
+
+def get_pdf_paths():
+    """Collect the PDF files that the end-to-end OCR test converts.
+
+    Returns:
+        Sorted list of every ``*.pdf`` found recursively below
+        ``./tests/data/scanned``.
+    """
+    # TODO: Debug - the search is narrowed to the scanned samples; the wider
+    # root would be Path("./tests/data").
+    scanned_root = Path("./tests/data/scanned")
+    return sorted(scanned_root.rglob("*.pdf"))
+
+
+def get_converter():
+    """Build a ``DocumentConverter`` with OCR and table structure enabled.
+
+    Returns:
+        A converter that uses ``DoclingParseDocumentBackend`` as its PDF
+        backend, with ``do_ocr``, ``do_table_structure`` and table cell
+        matching all switched on in the pipeline options.
+    """
+    # Debug
+    opts = PipelineOptions()
+    opts.do_ocr = True
+    opts.do_table_structure = True
+    opts.table_structure_options.do_cell_matching = True
+
+    return DocumentConverter(
+        pdf_backend=DoclingParseDocumentBackend,
+        pipeline_options=opts,
+    )
+
+
+def test_e2e_conversions():
+    """Convert every PDF from ``get_pdf_paths`` and verify each result.
+
+    Each conversion result is handed to ``verify_conversion_result`` together
+    with its input path and the module-level ``GENERATE`` flag.
+    """
+    sample_pdfs = get_pdf_paths()
+    doc_converter = get_converter()
+
+    for pdf_path in sample_pdfs:
+        print(f"converting {pdf_path}")
+
+        doc_result: ConversionResult = doc_converter.convert_single(pdf_path)
+
+        # Debug
+        verify_conversion_result(
+            generate=GENERATE, input_path=pdf_path, doc_result=doc_result
+        )