feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)

- When the OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr cmd parameter in docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos
2024-11-12 09:46:14 +01:00
committed by GitHub
parent 81c8243a8b
commit c6b3763ecb
10 changed files with 100 additions and 62 deletions

View File

@@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE = False
# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
r""" """
import json
import os
parent = pdf_path.parent
eng = "" if engine is None else f".{engine}"
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
with open(dict_fn, "w") as fd:
json.dump(doc_result.legacy_document.export_to_dict(), fd)
pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
pages = [p.model_dump() for p in doc_result.pages]
with open(pages_fn, "w") as fd:
json.dump(pages, fd)
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
with open(doctags_fn, "w") as fd:
fd.write(doc_result.legacy_document.export_to_doctags())
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
with open(md_fn, "w") as fd:
fd.write(doc_result.legacy_document.export_to_markdown())
GENERATE_V1 = False
GENERATE_V2 = False
def get_pdf_paths():
@@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):
def test_e2e_conversions():
pdf_paths = get_pdf_paths()
engines: List[OcrOptions] = [
EasyOcrOptions(),
TesseractOcrOptions(),
TesseractCliOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True),
]
for ocr_options in engines:
@@ -91,20 +67,16 @@ def test_e2e_conversions():
doc_result: ConversionResult = converter.convert(pdf_path)
# Save conversions
# save_output(pdf_path, doc_result, None)
# Debug
verify_conversion_result_v1(
input_path=pdf_path,
doc_result=doc_result,
generate=GENERATE,
generate=GENERATE_V1,
fuzzy=True,
)
verify_conversion_result_v2(
input_path=pdf_path,
doc_result=doc_result,
generate=GENERATE,
generate=GENERATE_V2,
fuzzy=True,
)