From 383ad1801fc92683013a2d74973d989df0bf91af Mon Sep 17 00:00:00 2001 From: swayam-singhal Date: Wed, 20 Nov 2024 13:19:14 +0530 Subject: [PATCH] integrated paddleocr model for performing accurate ocr when using docling document converter --- README.md | 22 +++++ docling/datamodel/pipeline_options.py | 14 +++- docling/models/paddle_ocr_model.py | 98 +++++++++++++++++++++++ docling/pipeline/standard_pdf_pipeline.py | 7 ++ 4 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 docling/models/paddle_ocr_model.py diff --git a/README.md b/README.md index bf311bdb..d3bafa11 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,29 @@ To know more about the original repository refer to the readme and documentation available at:
[Docling Github Repo](https://github.com/DS4SD/docling) +[Docling Documentation](https://ds4sd.github.io/docling/) +## PaddleOCR Usage - Demo: +```python +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption +from docling.datamodel.pipeline_options import PaddleOcrOptions, PdfPipelineOptions, TableFormerMode, TableStructureOptions + +pipeline_options = PdfPipelineOptions(do_table_structure=True, generate_page_images=True, images_scale=2.0) +pipeline_options.table_structure_options = TableStructureOptions(do_cell_matching=True) +pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model +pipeline_options.ocr_options = PaddleOcrOptions(lang="en") + +doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), + InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options) + } +) +result = doc_converter.convert("sample_file.pdf") +print(result.document.export_to_markdown()) + +``` ## License The Docling codebase is under MIT license. 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 1ea4d62a..99d6c9f2 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -41,6 +41,18 @@ class EasyOcrOptions(OcrOptions): protected_namespaces=(), ) +class PaddleOcrOptions(OcrOptions): + kind: Literal["paddleocr"] = "paddleocr" + lang: str = "en" + use_gpu: bool = True # same default as paddleocr.ocr + use_angle_cls: bool = True + show_log: bool = False + cls: bool = True + + model_config = ConfigDict( + extra="forbid", + ) + class TesseractCliOcrOptions(OcrOptions): kind: Literal["tesseract"] = "tesseract" @@ -75,7 +87,7 @@ class PdfPipelineOptions(PipelineOptions): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = ( + ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, PaddleOcrOptions] = ( Field(EasyOcrOptions(), discriminator="kind") ) diff --git a/docling/models/paddle_ocr_model.py b/docling/models/paddle_ocr_model.py new file mode 100644 index 00000000..ad40db44 --- /dev/null +++ b/docling/models/paddle_ocr_model.py @@ -0,0 +1,98 @@ +import logging +from typing import Iterable + +import numpy +from docling_core.types.doc import BoundingBox, CoordOrigin + +from docling.datamodel.base_models import Cell, OcrCell, Page +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import PaddleOcrOptions +from docling.datamodel.settings import settings +from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder +import cv2 + +_log = logging.getLogger(__name__) + + +class PaddleOcrModel(BaseOcrModel): + def __init__(self, enabled: bool, options: PaddleOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: 
PaddleOcrOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. + + if self.enabled: + try: + from paddleocr import PaddleOCR + except ImportError: + raise ImportError( + "PaddleOCR is not installed. Please install it via `pip install paddlepaddle` and `pip install paddleocr` to use this OCR engine. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) + + self.reader = PaddleOCR( + lang=self.options.lang, + use_gpu=self.options.use_gpu, + use_angle_cls=self.options.use_angle_cls, + show_log=self.options.show_log, + ) + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "ocr"): + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + im = numpy.array(high_res_image) + result = self.reader.ocr(im, cls=self.options.cls)[0] or [] # ocr() yields None for a crop with no detected text + + del high_res_image + del im + + cells = [ + OcrCell( + id=ix, + text=line[1][0], + confidence=line[1][1], + bbox=BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + for ix, line in enumerate(result) + ] + all_ocr_cells.extend(cells) + + # Post-process the cells + page.cells = self.post_process_cells(all_ocr_cells, page.cells) + + # DEBUG code: + if settings.debug.visualize_ocr: + self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) + + yield page diff --git a/docling/pipeline/standard_pdf_pipeline.py 
b/docling/pipeline/standard_pdf_pipeline.py index 65803d4f..8f12eaf2 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -13,10 +13,12 @@ from docling.datamodel.pipeline_options import ( PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, + PaddleOcrOptions, ) from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel +from docling.models.paddle_ocr_model import PaddleOcrModel from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_preprocessing_model import ( @@ -118,6 +120,11 @@ class StandardPdfPipeline(PaginatedPipeline): enabled=self.pipeline_options.do_ocr, options=self.pipeline_options.ocr_options, ) + elif isinstance(self.pipeline_options.ocr_options, PaddleOcrOptions): + return PaddleOcrModel( + enabled=self.pipeline_options.do_ocr, + options=self.pipeline_options.ocr_options, + ) return None def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: