feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)

- When the OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr cmd parameter in docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos
2024-11-12 09:46:14 +01:00
committed by GitHub
parent 81c8243a8b
commit c6b3763ecb
10 changed files with 100 additions and 62 deletions

View File

@@ -153,6 +153,13 @@ def convert(
..., help="If enabled, the bitmap content will be processed using OCR."
),
] = True,
force_ocr: Annotated[
bool,
typer.Option(
...,
help="Replace any existing text with OCR generated text over the full content.",
),
] = False,
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
@@ -219,11 +226,11 @@ def convert(
match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions()
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions()
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions()
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
case _:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

View File

@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
class OcrOptions(BaseModel):
kind: str
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
bitmap_area_threshold: float = (
0.05 # percentage of the area for a bitmap to processed with OCR
)

View File

@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
# return full-page rectangle if sufficiently covered with bitmaps
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
if self.options.force_full_page_ocr or coverage > max(
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
):
return [
BoundingBox(
l=0,
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
return ocr_rects
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
# Create R-tree index for programmatic cells
p = index.Property()
p.dimension = 2
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
]
return filtered_ocr_cells
def post_process_cells(self, ocr_cells, programmatic_cells):
r"""
Post-process the ocr and programmatic cells and return the final list of of cells
"""
if self.options.force_full_page_ocr:
# If a full page OCR is forced, use only the OCR cells
cells = [
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
for c_ocr in ocr_cells
]
return cells
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
programmatic_cells.extend(filtered_ocr_cells)
return programmatic_cells
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA")

View File

@@ -5,7 +5,7 @@ import numpy
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.settings import settings
@@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
]
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
)
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@@ -3,7 +3,7 @@ from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
# del high_res_image
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
# DEBUG code:
if settings.debug.visualize_ocr: