mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 21:58:15 +00:00
feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)
- When the OCR is forced, any existing PDF cells are rejected. - Introduce the force-ocr cmd parameter in docling CLI. - Update unit tests. - Add the full_page_ocr.py example in mkdocs. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
@@ -153,6 +153,13 @@ def convert(
|
||||
..., help="If enabled, the bitmap content will be processed using OCR."
|
||||
),
|
||||
] = True,
|
||||
force_ocr: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
help="Replace any existing text with OCR generated text over the full content.",
|
||||
),
|
||||
] = False,
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
@@ -219,11 +226,11 @@ def convert(
|
||||
|
||||
match ocr_engine:
|
||||
case OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions()
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
case OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions()
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
case OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions()
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
kind: str
|
||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||
bitmap_area_threshold: float = (
|
||||
0.05 # percentage of the area for a bitmap to be processed with OCR
|
||||
)
|
||||
|
||||
@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
|
||||
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
||||
|
||||
# return full-page rectangle if full-page OCR is forced or the page is sufficiently covered with bitmaps
|
||||
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
|
||||
if self.options.force_full_page_ocr or coverage > max(
|
||||
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
||||
):
|
||||
return [
|
||||
BoundingBox(
|
||||
l=0,
|
||||
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
|
||||
return ocr_rects
|
||||
|
||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
# Create R-tree index for programmatic cells
|
||||
p = index.Property()
|
||||
p.dimension = 2
|
||||
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
|
||||
]
|
||||
return filtered_ocr_cells
|
||||
|
||||
def post_process_cells(self, ocr_cells, programmatic_cells):
    """Combine OCR cells and programmatic cells into the final list of cells.

    When ``self.options.force_full_page_ocr`` is set, the programmatic cells
    are ignored and a new list of ``Cell`` objects is built from the id, text
    and bbox of every OCR cell.

    Otherwise the OCR cells kept by ``self._filter_ocr_cells`` are appended
    to ``programmatic_cells`` in place, and that same list object is returned.
    """
    if not self.options.force_full_page_ocr:
        # Regular mode: drop OCR cells which overlap with programmatic cells,
        # then append the remaining ones to the caller's list.
        surviving_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
        programmatic_cells.extend(surviving_ocr_cells)
        return programmatic_cells

    # Forced full-page OCR: only the OCR output is used.
    forced_cells = []
    for ocr_cell in ocr_cells:
        forced_cells.append(
            Cell(id=ocr_cell.id, text=ocr_cell.text, bbox=ocr_cell.bbox)
        )
    return forced_cells
|
||||
|
||||
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
||||
image = copy.deepcopy(page.image)
|
||||
draw = ImageDraw.Draw(image, "RGBA")
|
||||
|
||||
@@ -5,7 +5,7 @@ import numpy
|
||||
import torch
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
|
||||
]
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
||||
Reference in New Issue
Block a user