integrated paddleocr model for performing accurate ocr when using docling document converter

2025-07-30 22:14:37 +00:00 · 2024-11-20 13:19:14 +05:30 · 2024-11-20 13:19:14 +05:30 · 383ad1801f
commit 383ad1801f
parent 318d42c369
4 changed files with 140 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -3,7 +3,29 @@

 To know more about the original repository refer to the readme and documentation available at: </br>
 [Docling Github Repo](https://github.com/DS4SD/docling)
+[Docling Documentation](https://ds4sd.github.io/docling/)

+## PaddleOCR Usage - Demo:
+```python
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
+from docling.datamodel.pipeline_options import PaddleOcrOptions, PdfPipelineOptions, TableFormerMode, TableStructureOptions
+
+pipeline_options = PdfPipelineOptions(do_table_structure=True, generate_page_images=True, images_scale=2.0)
+pipeline_options.table_structure_options = TableStructureOptions(do_cell_matching=True)
+pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # set after replacing the options object so the more accurate TableFormer mode is kept
+pipeline_options.ocr_options = PaddleOcrOptions(lang="en")  # use PaddleOCR instead of the default EasyOCR engine
+
+doc_converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+        InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options)
+    }
+)
+result = doc_converter.convert("sample_file.pdf")
+print(result.document.export_to_markdown())
+
+```
 ## License

 The Docling codebase is under MIT license.
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -41,6 +41,18 @@ class EasyOcrOptions(OcrOptions):
        protected_namespaces=(),
    )

+class PaddleOcrOptions(OcrOptions):
+    """Options for the PaddleOCR engine, selected with ``kind="paddleocr"``."""
+
+    kind: Literal["paddleocr"] = "paddleocr"  # discriminator value used by the `ocr_options` union
+    lang: str = "en"  # forwarded to the PaddleOCR constructor as `lang`
+    use_gpu: bool = True  # same default as paddleocr.ocr
+    use_angle_cls: bool = True  # forwarded to the PaddleOCR constructor as `use_angle_cls`
+    show_log: bool = False  # forwarded to the PaddleOCR constructor as `show_log`
+    cls: bool = True  # forwarded as `cls` to every `reader.ocr(...)` call
+
+    # NOTE(review): extra="forbid" presumably makes pydantic reject unknown
+    # option names instead of silently ignoring them - confirm.
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+

 class TesseractCliOcrOptions(OcrOptions):
    kind: Literal["tesseract"] = "tesseract"
@ -75,7 +87,7 @@ class PdfPipelineOptions(PipelineOptions):
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, PaddleOcrOptions] = (
        Field(EasyOcrOptions(), discriminator="kind")
    )

--- a/docling/models/paddle_ocr_model.py
+++ b/docling/models/paddle_ocr_model.py
@ -0,0 +1,98 @@
+import logging
+from typing import Iterable
+
+import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PaddleOcrOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+import cv2
+
+_log = logging.getLogger(__name__)
+
+
+class PaddleOcrModel(BaseOcrModel):
+    """OCR model that recognises text in page regions using PaddleOCR."""
+
+    def __init__(self, enabled: bool, options: PaddleOcrOptions):
+        """Set up the model and, when enabled, create the PaddleOCR reader.
+
+        Args:
+            enabled: When False, no reader is created and ``__call__`` yields
+                the pages unchanged.
+            options: PaddleOCR settings; ``lang``, ``use_gpu``,
+                ``use_angle_cls`` and ``show_log`` are forwarded to the
+                ``PaddleOCR`` constructor.
+
+        Raises:
+            ImportError: If ``enabled`` is True and ``paddleocr`` cannot be
+                imported.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: PaddleOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            try:
+                # Import only what is used: pulling in unused helpers (such as
+                # draw_ocr) can fail on PaddleOCR releases that do not provide
+                # them and would be misreported as "not installed".
+                from paddleocr import PaddleOCR
+            except ImportError as err:
+                raise ImportError(
+                    "PaddleOCR is not installed. Please install it via `pip install paddlepaddle` and `pip install paddleocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                ) from err
+
+            self.reader = PaddleOCR(
+                lang=self.options.lang,
+                use_gpu=self.options.use_gpu,
+                use_angle_cls=self.options.use_angle_cls,
+                show_log=self.options.show_log,
+            )
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Run OCR on each page of the batch and yield the pages in order.
+
+        Pages are yielded unchanged when the model is disabled or when the
+        page backend is not valid. For every other page, each non-empty OCR
+        rectangle is rendered at ``self.scale``, passed to PaddleOCR, and the
+        recognised lines are merged into ``page.cells`` through
+        ``post_process_cells``.
+
+        Args:
+            conv_res: Conversion result used for timing and debug drawing.
+            page_batch: Pages to process.
+
+        Yields:
+            Every page of ``page_batch``.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+                continue
+
+            with TimeRecorder(conv_res, "ocr"):
+                ocr_rects = self.get_ocr_rects(page)
+
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
+                    )
+                    im = numpy.array(high_res_image)
+                    raw_result = self.reader.ocr(im, cls=self.options.cls)
+
+                    # Release the rendered crop before building the cells.
+                    del high_res_image
+                    del im
+
+                    # PaddleOCR reports an image without any detected text as
+                    # None (i.e. `[None]`); treat that as "no lines" instead
+                    # of failing when iterating over it.
+                    result = (raw_result[0] if raw_result else None) or []
+
+                    # Each line is (points, (text, confidence)). Points 0 and
+                    # 2 are presumably the top-left and bottom-right corners
+                    # of the detected box - confirm against the PaddleOCR
+                    # docs. Coordinates are scaled back to page units and
+                    # shifted by the crop origin.
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1][0],
+                            confidence=line[1][1],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)
+
+                # Post-process the cells
+                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+
+            # DEBUG code:
+            if settings.debug.visualize_ocr:
+                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+            yield page
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -13,10 +13,12 @@ from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
+    PaddleOcrOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
+from docling.models.paddle_ocr_model import PaddleOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
 from docling.models.page_preprocessing_model import (
@ -118,6 +120,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
            )
+        elif isinstance(self.pipeline_options.ocr_options, PaddleOcrOptions):
+            return PaddleOcrModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
        return None

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: