fix: updated the render_as_doctags with the new arguments from docling-core (#93)

* updated the render_as_doctags with the new arguments from docling-core

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* ensuring that docling-core is >1.5.0 to accommodate the latest export-to-doctags parameters

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added the doctags tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the README

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix poetry lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Fix formatting problems

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fixed the doctag export in docling/utils/export.py

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* propagate xsize and ysize

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2024-09-23 20:12:18 +02:00
committed by GitHub
parent dce9934a0f
commit 4794ce460a
18 changed files with 3091 additions and 20 deletions

View File

@@ -122,6 +122,10 @@ def verify_md(doc_pred_md, doc_true_md):
return doc_pred_md == doc_true_md
def verify_dt(doc_pred_dt: str, doc_true_dt: str) -> bool:
    """Return whether the predicted DocTags export matches the reference.

    The comparison is plain equality: no whitespace or newline
    normalization is applied, so any difference makes this return False.

    Args:
        doc_pred_dt: DocTags text produced by the current conversion run.
        doc_true_dt: DocTags text loaded from the stored ground truth.

    Returns:
        True when both values compare equal, False otherwise.
    """
    return doc_pred_dt == doc_true_dt
def verify_conversion_result(
input_path: Path, doc_result: ConversionResult, generate=False
):
@@ -134,10 +138,12 @@ def verify_conversion_result(
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.output
doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags()
pages_path = input_path.with_suffix(".pages.json")
json_path = input_path.with_suffix(".json")
md_path = input_path.with_suffix(".md")
dt_path = input_path.with_suffix(".doctags.txt")
if generate: # only used when re-generating truth
with open(pages_path, "w") as fw:
@@ -148,6 +154,9 @@ def verify_conversion_result(
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path, "r") as fr:
doc_true_pages = PageList.validate_json(fr.read())
@@ -158,6 +167,9 @@ def verify_conversion_result(
with open(md_path, "r") as fr:
doc_true_md = fr.read()
with open(dt_path, "r") as fr:
doc_true_dt = fr.read()
assert verify_cells(
doc_pred_pages, doc_true_pages
), f"Mismatch in PDF cell prediction for {input_path}"
@@ -173,3 +185,7 @@ def verify_conversion_result(
assert verify_md(
doc_pred_md, doc_true_md
), f"Mismatch in Markdown prediction for {input_path}"
assert verify_dt(
doc_pred_dt, doc_true_dt
), f"Mismatch in DocTags prediction for {input_path}"