mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-18 09:31:02 +00:00
feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)
- When the OCR is forced, any existing PDF cells are rejected. - Introduce the force-ocr cmd parameter in docling CLI. - Update unit tests. - Add the full_page_ocr.py example in mkdocs. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
|
||||
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
||||
|
||||
# return full-page rectangle if sufficiently covered with bitmaps
|
||||
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
|
||||
if self.options.force_full_page_ocr or coverage > max(
|
||||
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
||||
):
|
||||
return [
|
||||
BoundingBox(
|
||||
l=0,
|
||||
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
|
||||
return ocr_rects
|
||||
|
||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
# Create R-tree index for programmatic cells
|
||||
p = index.Property()
|
||||
p.dimension = 2
|
||||
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
|
||||
]
|
||||
return filtered_ocr_cells
|
||||
|
||||
def post_process_cells(self, ocr_cells, programmatic_cells):
|
||||
r"""
|
||||
Post-process the ocr and programmatic cells and return the final list of of cells
|
||||
"""
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
cells = [
|
||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||
for c_ocr in ocr_cells
|
||||
]
|
||||
return cells
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
|
||||
programmatic_cells.extend(filtered_ocr_cells)
|
||||
return programmatic_cells
|
||||
|
||||
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
||||
image = copy.deepcopy(page.image)
|
||||
draw = ImageDraw.Draw(image, "RGBA")
|
||||
|
||||
@@ -5,7 +5,7 @@ import numpy
|
||||
import torch
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
|
||||
]
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
||||
Reference in New Issue
Block a user