mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter, which forces full-page OCR scanning (#290)
- When OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr command-line parameter in the docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
@@ -256,15 +256,19 @@ def verify_conversion_result_v1(
|
||||
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
fw.write(doc_pred_md)
|
||||
|
||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(dt_path, "w") as fw:
|
||||
fw.write(doc_pred_dt)
|
||||
else: # default branch in test
|
||||
@@ -328,15 +332,19 @@ def verify_conversion_result_v2(
|
||||
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
fw.write(doc_pred_md)
|
||||
|
||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(dt_path, "w") as fw:
|
||||
fw.write(doc_pred_dt)
|
||||
else: # default branch in test
|
||||
|
||||
Reference in New Issue
Block a user