feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter, which forces full-page OCR scanning (#290)

- When OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr command-line parameter in the docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example to mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos
2024-11-12 09:46:14 +01:00
committed by GitHub
parent 81c8243a8b
commit c6b3763ecb
10 changed files with 100 additions and 62 deletions

View File

@@ -256,15 +256,19 @@ def verify_conversion_result_v1(
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
@@ -328,15 +332,19 @@ def verify_conversion_result_v2(
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else: # default branch in test