diff --git a/docling/models/tesseract_model.py b/docling/models/tesseract_model.py
new file mode 100644
index 00000000..ceb81776
--- /dev/null
+++ b/docling/models/tesseract_model.py
@@ -0,0 +1,99 @@
+import logging
+from typing import Iterable
+
+import numpy
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractModel(BaseOcrModel):
+    """Run Tesseract OCR, via the ``tesserocr`` bindings, on page regions."""
+
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        """Create the Tesseract handle when OCR is enabled."""
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None  # stays None while OCR is disabled
+
+        if self.enabled:
+            # Imported lazily so tesserocr is only needed when OCR is on.
+            import tesserocr
+
+            # Tesseract takes one "+"-joined language string, not a list.
+            self.reader = tesserocr.PyTessBaseAPI(
+                lang="+".join(self.options.lang),
+                psm=tesserocr.PSM.AUTO,
+            )
+            self.text_line_level = tesserocr.RIL.TEXTLINE
+
+    def __del__(self):
+        # Release the native Tesseract handle, if one was ever created.
+        reader = getattr(self, "reader", None)
+        if reader is not None:
+            reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Yield every page, extended with the OCR cells found on it."""
+
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Assumes get_page_image returns a PIL image, the type
+                # SetImage accepts -- TODO confirm against the backend.
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(self.text_line_level, True)
+
+                cells = []
+                for ix, (_, box, _, _) in enumerate(boxes):
+                    # Restrict recognition to the current text line.
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+                    text = self.reader.GetUTF8Text().strip()
+                    # MeanTextConf() is on a 0-100 scale; keep a 0-1 fraction.
+                    confidence = self.reader.MeanTextConf() / 100.0
+
+                    # Boxes are in pixels of the scaled crop: undo the scaling
+                    # and shift by the crop's top-left corner.
+                    left = (box["x"] / self.scale) + ocr_rect.l
+                    top = (box["y"] / self.scale) + ocr_rect.t
+                    right = ((box["x"] + box["w"]) / self.scale) + ocr_rect.l
+                    bottom = ((box["y"] + box["h"]) / self.scale) + ocr_rect.t
+
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+                del high_res_image
+                all_ocr_cells.extend(cells)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
diff --git a/docling/models/tesserocr_model.py b/docling/models/tesserocr_model.py
new file mode 100644
index 00000000..01f1e71f
--- /dev/null
+++ b/docling/models/tesserocr_model.py
@@ -0,0 +1,99 @@
+import logging
+from typing import Iterable
+
+import numpy
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesserOcrModel(BaseOcrModel):
+    """Run Tesseract OCR, via the ``tesserocr`` bindings, on page regions."""
+
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        """Create the Tesseract handle when OCR is enabled."""
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None  # stays None while OCR is disabled
+
+        if self.enabled:
+            # Imported lazily so tesserocr is only needed when OCR is on.
+            import tesserocr
+
+            # Tesseract takes one "+"-joined language string, not a list.
+            self.reader = tesserocr.PyTessBaseAPI(
+                lang="+".join(self.options.lang),
+                psm=tesserocr.PSM.AUTO,
+            )
+            self.text_line_level = tesserocr.RIL.TEXTLINE
+
+    def __del__(self):
+        # Release the native Tesseract handle, if one was ever created.
+        reader = getattr(self, "reader", None)
+        if reader is not None:
+            reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Yield every page, extended with the OCR cells found on it."""
+
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Assumes get_page_image returns a PIL image, the type
+                # SetImage accepts -- TODO confirm against the backend.
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(self.text_line_level, True)
+
+                cells = []
+                for ix, (_, box, _, _) in enumerate(boxes):
+                    # Restrict recognition to the current text line.
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+                    text = self.reader.GetUTF8Text().strip()
+                    # MeanTextConf() is on a 0-100 scale; keep a 0-1 fraction.
+                    confidence = self.reader.MeanTextConf() / 100.0
+
+                    # Boxes are in pixels of the scaled crop: undo the scaling
+                    # and shift by the crop's top-left corner.
+                    left = (box["x"] / self.scale) + ocr_rect.l
+                    top = (box["y"] / self.scale) + ocr_rect.t
+                    right = ((box["x"] + box["w"]) / self.scale) + ocr_rect.l
+                    bottom = ((box["y"] + box["h"]) / self.scale) + ocr_rect.t
+
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+                del high_res_image
+                all_ocr_cells.extend(cells)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
new file mode 100644
index 00000000..870e270b
--- /dev/null
+++ b/tests/test_e2e_ocr_conversion.py
@@ -0,0 +1,80 @@
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.document_converter import DocumentConverter
+
+from .verify_utils import verify_conversion_result
+
+GENERATE = False
+
+
+# Debug
+def save_output(pdf_path: Path, doc_result: ConversionResult):
+    """Dump the JSON, pages, doctags and markdown renderings next to the PDF."""
+    import json
+    import os
+
+    parent = pdf_path.parent
+
+    dict_fn = os.path.join(parent, f"{pdf_path.stem}.json")
+    with open(dict_fn, "w", encoding="utf-8") as fd:
+        json.dump(doc_result.render_as_dict(), fd)
+
+    pages_fn = os.path.join(parent, f"{pdf_path.stem}.pages.json")
+    pages = [p.model_dump() for p in doc_result.pages]
+    with open(pages_fn, "w", encoding="utf-8") as fd:
+        json.dump(pages, fd)
+
+    doctags_fn = os.path.join(parent, f"{pdf_path.stem}.doctags.txt")
+    with open(doctags_fn, "w", encoding="utf-8") as fd:
+        fd.write(doc_result.render_as_doctags())
+
+    md_fn = os.path.join(parent, f"{pdf_path.stem}.md")
+    with open(md_fn, "w", encoding="utf-8") as fd:
+        fd.write(doc_result.render_as_markdown())
+
+
+def get_pdf_paths():
+    """Return the sorted PDF paths found under ``tests/data/scanned``."""
+    # TODO: Debug
+    # Define the directory you want to search
+    # directory = Path("./tests/data")
+    directory = Path("./tests/data/scanned")
+
+    # List all PDF files in the directory and its subdirectories
+    pdf_files = sorted(directory.rglob("*.pdf"))
+    return pdf_files
+
+
+def get_converter():
+    """Build a converter with OCR, table structure and cell matching on."""
+    pipeline_options = PipelineOptions()
+    # Debug
+    pipeline_options.do_ocr = True
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
+    return converter
+
+
+def test_e2e_conversions():
+    """Convert every scanned PDF and hand the result to verify_conversion_result."""
+    pdf_paths = get_pdf_paths()
+    converter = get_converter()
+
+    for pdf_path in pdf_paths:
+        print(f"converting {pdf_path}")
+
+        doc_result: ConversionResult = converter.convert_single(pdf_path)
+
+        # Debug
+        verify_conversion_result(
+            input_path=pdf_path, doc_result=doc_result, generate=GENERATE
+        )