From 4aaf128384d837ae0385343202c9d3080869ddec Mon Sep 17 00:00:00 2001
From: NuRi
Date: Fri, 8 Nov 2024 09:08:55 +0900
Subject: [PATCH] feat: add support for `ocrmac` OCR engine on macOS

- Integrates `ocrmac` as an OCR engine option for macOS users.
- Adds configuration options and dependencies for `ocrmac`.
- Updates documentation to reflect new engine support.

This change allows macOS users to utilize `ocrmac` for improved OCR
performance and compatibility.

Signed-off-by: Suhwan Seo
---
Review notes (this section is ignored by `git am`):

- ocr_mac_model.py: each OCR rect is rendered with `cropbox=ocr_rect`, so
  the boxes returned by ocrmac are relative to that crop. The cells are now
  shifted by the crop origin (`ocr_rect.l`, `ocr_rect.t`) so that cells from
  partial-page rects land in page coordinates.
- ocr_mac_model.py: the ImportError raised for a missing `ocrmac` is now
  chained to the original exception; unused imports (`Optional`, `Tuple`),
  the unused `page_idx` index and commented-out code were dropped.
- docs/installation.md: `ocrmac` is a pip package that pyproject.toml only
  declares for macOS, not a system dependency, and no further description
  is added below the table; the row now says so.
- The `index` blob ids of the two edited files predate these changes.
- Line breaks and indentation of this patch were restored from a
  whitespace-collapsed copy; verify context lines against the target tree.

 docling/cli/main.py                       |   4 +
 docling/datamodel/pipeline_options.py     |  12 ++-
 docling/models/ocr_mac_model.py           | 126 ++++++++++++++++++++++
 docling/pipeline/standard_pdf_pipeline.py |   7 ++
 docs/installation.md                      |   1 +
 pyproject.toml                            |   1 +
 6 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 docling/models/ocr_mac_model.py

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 35ae01df..e965e07a 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -28,6 +28,7 @@ from docling.datamodel.pipeline_options import (
     TableFormerMode,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
+    OcrMacOptions,
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 
@@ -73,6 +74,7 @@ class OcrEngine(str, Enum):
     EASYOCR = "easyocr"
     TESSERACT_CLI = "tesseract_cli"
     TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
 
 
 def export_documents(
@@ -224,6 +226,8 @@ def convert(
             ocr_options = TesseractCliOcrOptions()
         case OcrEngine.TESSERACT:
             ocr_options = TesseractOcrOptions()
+        case OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions()
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index d57f1671..9efcfc6f 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -60,6 +60,16 @@ class TesseractOcrOptions(OcrOptions):
         extra="forbid",
     )
 
+class OcrMacOptions(OcrOptions):
+    kind: Literal["ocrmac"] = "ocrmac"
+    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
+    recognition: str = "accurate"
+    framework: str = "vision"
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
 
 class PipelineOptions(BaseModel):
     create_legacy_output: bool = (
@@ -73,7 +83,7 @@ class PdfPipelineOptions(PipelineOptions):
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions] = (
         Field(EasyOcrOptions(), discriminator="kind")
     )
 
diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py
new file mode 100644
index 00000000..cbe8fbd9
--- /dev/null
+++ b/docling/models/ocr_mac_model.py
@@ -0,0 +1,126 @@
+import logging
+import tempfile
+from typing import Iterable
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import OcrMacOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class OcrMacModel(BaseOcrModel):
+    """OCR model that delegates text recognition to the `ocrmac` package."""
+
+    def __init__(self, enabled: bool, options: OcrMacOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: OcrMacOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            setup_errmsg = (
+                "ocrmac is not correctly installed. "
+                "Please install it via `pip install ocrmac` to use this OCR engine. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            try:
+                from ocrmac import ocrmac
+            except ImportError as exc:
+                # Chain the cause so the underlying import failure stays visible.
+                raise ImportError(setup_errmsg) from exc
+
+            self.reader_RIL = ocrmac.OCR
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Run OCR on every valid page of `page_batch` and yield the pages."""
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+
+                            boxes = self.reader_RIL(
+                                fname,
+                                recognition_level=self.options.recognition,
+                                framework=self.options.framework,
+                                language_preference=self.options.lang,
+                            ).recognize()
+
+                        im_width, im_height = high_res_image.size
+                        cells = []
+                        for ix, (text, confidence, box) in enumerate(boxes):
+                            x = float(box[0])
+                            y = float(box[1])
+                            w = float(box[2])
+                            h = float(box[3])
+
+                            # Boxes are treated as fractions of the cropped image
+                            # with a bottom-left origin; flip to top-left pixels.
+                            x1 = x * im_width
+                            y2 = (1 - y) * im_height
+
+                            x2 = x1 + w * im_width
+                            y1 = y2 - h * im_height
+
+                            # Back to page units, shifted by the crop origin so
+                            # cells from partial-page rects land in page space.
+                            left = x1 / self.scale + ocr_rect.l
+                            top = y1 / self.scale + ocr_rect.t
+                            right = x2 / self.scale + ocr_rect.l
+                            bottom = y2 / self.scale + ocr_rect.t
+
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+
+                        all_ocr_cells.extend(cells)
+
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
+
+                    page.cells.extend(filtered_ocr_cells)
+
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+                yield page
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 65803d4f..5e059fc8 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
+    OcrMacOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
@@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
@@ -118,6 +120,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
             )
+        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            return OcrMacModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
         return None
 
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
diff --git a/docs/installation.md b/docs/installation.md
index df18dece..7701543b 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
 | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
 | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
 | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+| OcrMac | macOS only. Installed with Docling on macOS, or via `pip install ocrmac`. | `OcrMacOptions` |
 
 The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example
 
diff --git a/pyproject.toml b/pyproject.toml
index c1196d01..7504f74d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,6 +89,7 @@ torchvision = [
   {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
   {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
 ]
+ocrmac = {markers = "sys_platform == 'darwin'", version = "^1.0.0"}
 
 [tool.poetry.extras]
 tesserocr = ["tesserocr"]