mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter, which forces a full-page OCR
scan and uses the recognized OCR cells instead of any existing PDF cells. Update unit tests. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
97f214efdd
commit
dea1d91ebe
@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
kind: str
|
||||
force_full_page_ocr: bool = False # If enabled, a full-page OCR is always applied
|
||||
bitmap_area_threshold: float = (
|
||||
0.05 # percentage of the area for a bitmap to be processed with OCR
|
||||
)
|
||||
|
@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
|
||||
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
||||
|
||||
# return full-page rectangle if sufficiently covered with bitmaps
|
||||
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
|
||||
if self.options.force_full_page_ocr or coverage > max(
|
||||
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
||||
):
|
||||
return [
|
||||
BoundingBox(
|
||||
l=0,
|
||||
|
@ -5,7 +5,7 @@ import numpy
|
||||
import torch
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@ -88,12 +88,18 @@ class EasyOcrModel(BaseOcrModel):
|
||||
]
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
page.cells = [
|
||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||
for c_ocr in all_ocr_cells
|
||||
]
|
||||
else:
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@ -170,12 +170,18 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
page.cells = [
|
||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||
for c_ocr in all_ocr_cells
|
||||
]
|
||||
else:
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -3,7 +3,7 @@ from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@ -140,12 +140,18 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
page.cells = [
|
||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||
for c_ocr in all_ocr_cells
|
||||
]
|
||||
else:
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
||||
# Debug
|
||||
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
|
||||
r""" """
|
||||
import json
|
||||
import os
|
||||
|
||||
parent = pdf_path.parent
|
||||
eng = "" if engine is None else f".{engine}"
|
||||
|
||||
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
|
||||
with open(dict_fn, "w") as fd:
|
||||
json.dump(doc_result.legacy_document.export_to_dict(), fd)
|
||||
|
||||
pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
|
||||
pages = [p.model_dump() for p in doc_result.pages]
|
||||
with open(pages_fn, "w") as fd:
|
||||
json.dump(pages, fd)
|
||||
|
||||
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
|
||||
with open(doctags_fn, "w") as fd:
|
||||
fd.write(doc_result.legacy_document.export_to_doctags())
|
||||
|
||||
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
|
||||
with open(md_fn, "w") as fd:
|
||||
fd.write(doc_result.legacy_document.export_to_markdown())
|
||||
GENERATE_V1 = False
|
||||
GENERATE_V2 = False
|
||||
|
||||
|
||||
def get_pdf_paths():
|
||||
@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):
|
||||
|
||||
|
||||
def test_e2e_conversions():
|
||||
|
||||
pdf_paths = get_pdf_paths()
|
||||
|
||||
engines: List[OcrOptions] = [
|
||||
EasyOcrOptions(),
|
||||
TesseractOcrOptions(),
|
||||
TesseractCliOcrOptions(),
|
||||
EasyOcrOptions(force_full_page_ocr=True),
|
||||
TesseractOcrOptions(force_full_page_ocr=True),
|
||||
TesseractCliOcrOptions(force_full_page_ocr=True),
|
||||
]
|
||||
|
||||
for ocr_options in engines:
|
||||
@ -91,20 +67,16 @@ def test_e2e_conversions():
|
||||
|
||||
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||
|
||||
# Save conversions
|
||||
# save_output(pdf_path, doc_result, None)
|
||||
|
||||
# Debug
|
||||
verify_conversion_result_v1(
|
||||
input_path=pdf_path,
|
||||
doc_result=doc_result,
|
||||
generate=GENERATE,
|
||||
generate=GENERATE_V1,
|
||||
fuzzy=True,
|
||||
)
|
||||
|
||||
verify_conversion_result_v2(
|
||||
input_path=pdf_path,
|
||||
doc_result=doc_result,
|
||||
generate=GENERATE,
|
||||
generate=GENERATE_V2,
|
||||
fuzzy=True,
|
||||
)
|
||||
|
@ -256,15 +256,19 @@ def verify_conversion_result_v1(
|
||||
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
fw.write(doc_pred_md)
|
||||
|
||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(dt_path, "w") as fw:
|
||||
fw.write(doc_pred_dt)
|
||||
else: # default branch in test
|
||||
@ -328,15 +332,19 @@ def verify_conversion_result_v2(
|
||||
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
fw.write(doc_pred_md)
|
||||
|
||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(dt_path, "w") as fw:
|
||||
fw.write(doc_pred_dt)
|
||||
else: # default branch in test
|
||||
|
Loading…
Reference in New Issue
Block a user