Merge branch 'dev/add-strict-tests' of github.com:DS4SD/docling into dev/add-strict-tests

This commit is contained in:
Michele Dolfi 2024-08-28 14:57:14 +02:00
commit 52b25bf030
14 changed files with 143 additions and 1231614 deletions

View File

@ -238,9 +238,9 @@ class EquationPrediction(BaseModel):
class PagePredictions(BaseModel): class PagePredictions(BaseModel):
layout: LayoutPrediction = None layout: LayoutPrediction = None
tablestructure: TableStructurePrediction = None tablestructure: Optional[TableStructurePrediction] = None
figures_classification: FigureClassificationPrediction = None figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: EquationPrediction = None equations_prediction: Optional[EquationPrediction] = None
PageElement = Union[TextElement, TableElement, FigureElement] PageElement = Union[TextElement, TableElement, FigureElement]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,3 @@
import glob
from pathlib import Path from pathlib import Path
import pytest import pytest
@ -8,7 +7,6 @@ from docling.backend.docling_parse_backend import (
DoclingParsePageBackend, DoclingParsePageBackend,
) )
from docling.datamodel.base_models import BoundingBox from docling.datamodel.base_models import BoundingBox
from docling.document_converter import DocumentConverter
@pytest.fixture @pytest.fixture
@ -16,6 +14,27 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf") return Path("./tests/data/2206.01062.pdf")
def test_text_cell_counts():
    """Check that re-loading a page always yields the same text cell count.

    Each page of the sample PDF is loaded ten times through the docling-parse
    backend, and the number of cells returned by ``get_text_cells()`` must be
    identical on every load of that page.
    """
    pdf_doc = Path("./tests/data/redp5695.pdf")
    doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz")

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # Load the page currently under test. The previous version always
            # loaded page 0, so the outer loop never exercised another page.
            page_backend: DoclingParsePageBackend = doc_backend.load_page(page_index)
            cell_count = len(list(page_backend.get_text_cells()))

            # The first load of a page establishes the reference count.
            if last_cell_count is None:
                last_cell_count = cell_count

            assert (
                cell_count == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

View File

@ -14,6 +14,27 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf") return Path("./tests/data/2206.01062.pdf")
def test_text_cell_counts():
    """Check that re-loading a page always yields the same text cell count.

    Each page of the sample PDF is loaded ten times through the pypdfium
    backend, and the number of cells returned by ``get_text_cells()`` must be
    identical on every load of that page.
    """
    pdf_doc = Path("./tests/data/redp5695.pdf")
    doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz")

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # Load the page currently under test. The previous version always
            # loaded page 0, so the outer loop never exercised another page.
            page_backend: PyPdfiumPageBackend = doc_backend.load_page(page_index)
            cell_count = len(list(page_backend.get_text_cells()))

            # The first load of a page establishes the reference count.
            if last_cell_count is None:
                last_cell_count = cell_count

            assert (
                cell_count == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

View File

@ -1,9 +1,17 @@
import glob import glob
import json import json
from pathlib import Path, PosixPath from pathlib import Path, PosixPath
from typing import List
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from pydantic import TypeAdapter
from pydantic.json import pydantic_encoder
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, Page, PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
GENERATE = False GENERATE = False
@ -34,108 +42,86 @@ def get_converter():
return converter return converter
def convert_paths(data): def verify_cells(doc_pred_pages, doc_true_pages):
if isinstance(data, dict):
return {k: convert_paths(v) for k, v in data.items()}
elif isinstance(data, list):
return [convert_paths(v) for v in data]
elif isinstance(data, PosixPath):
return str(data)
else:
return data
assert len(doc_pred_pages) == len(
def verify_cells(doc_pred_json, doc_true_json): doc_true_pages
assert len(doc_pred_json["pages"]) == len(
doc_true_json["pages"]
), "pred- and true-doc do not have the same number of pages" ), "pred- and true-doc do not have the same number of pages"
for pid, page_true_item in enumerate(doc_true_json["pages"]): for pid, page_true_item in enumerate(doc_true_pages):
num_true_cells = len(page_true_item["cells"]) num_true_cells = len(page_true_item.cells)
num_pred_cells = len(doc_pred_json["pages"][pid]["cells"]) num_pred_cells = len(doc_pred_pages[pid].cells)
assert ( assert (
num_true_cells == num_pred_cells num_true_cells == num_pred_cells
), f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}" ), f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"
for cid, cell_true_item in enumerate(page_true_item["cells"]): for cid, cell_true_item in enumerate(page_true_item.cells):
cell_pred_item = doc_pred_json["pages"][pid]["cells"][cid] cell_pred_item = doc_pred_pages[pid].cells[cid]
true_text = cell_true_item["text"] true_text = cell_true_item.text
pred_text = cell_pred_item["text"] pred_text = cell_pred_item.text
assert true_text == pred_text, f"{true_text}!={pred_text}" assert true_text == pred_text, f"{true_text}!={pred_text}"
for _ in ["t", "b", "l", "r"]: true_bbox = cell_true_item.bbox.as_tuple()
true_val = round(cell_true_item["bbox"][_]) pred_bbox = cell_pred_item.bbox.as_tuple()
pred_val = round(cell_pred_item["bbox"][_]) assert (
true_bbox == pred_bbox
), f"bbox is not the same: {true_bbox} != {pred_bbox}"
return True
def verify_maintext(doc_pred, doc_true):
    """Assert that the predicted main-text matches the ground truth.

    Args:
        doc_pred: Predicted document; its ``main_text`` sequence is checked.
        doc_true: Ground-truth document providing the expected ``main_text``.

    Returns:
        ``True`` when every check passes.

    Raises:
        AssertionError: If the two ``main_text`` sequences differ in length,
            or if a ground-truth ``BaseText`` item is paired with a prediction
            that is not a ``BaseText`` or that carries different text.
    """
    assert len(doc_true.main_text) == len(
        doc_pred.main_text
    ), "document has different length of main-text than expected."

    # Lengths are equal at this point, so zip pairs every item by position.
    for true_item, pred_item in zip(doc_true.main_text, doc_pred.main_text):
        # Only ground-truth text items are compared; other kinds are skipped.
        if not isinstance(true_item, BaseText):
            continue

        assert isinstance(
            pred_item, BaseText
        ), f"{pred_item} is not a BaseText element, but {true_item} is."
        assert (
            true_item.text == pred_item.text
        ), f"main-text mismatch: {true_item.text!r} != {pred_item.text!r}"

    return True
def verify_tables(doc_pred, doc_true):
assert len(doc_true.tables) == len(
doc_pred.tables
), "document has different count of tables than expected."
for l, true_item in enumerate(doc_true.tables):
pred_item = doc_pred.tables[l]
assert (
true_item.num_rows == pred_item.num_rows
), "table does not have the same #-rows"
assert (
true_item.num_cols == pred_item.num_cols
), "table does not have the same #-cols"
for i, row in enumerate(true_item.data):
for j, col in enumerate(true_item.data[i]):
assert ( assert (
pred_val == true_val true_item.data[i][j].text == pred_item.data[i][j].text
), f"bbox for {_} is not the same: {true_val} != {pred_val}" ), "table-cell does not have the same text"
return True return True
def verify_maintext(doc_pred_json, doc_true_json): def verify_output(doc_pred: DsDocument, doc_true: DsDocument):
for l, true_item in enumerate(doc_true_json["output"]["main_text"]): assert verify_maintext(doc_pred, doc_true), "verify_maintext(doc_pred, doc_true)"
if "text" in true_item: assert verify_tables(doc_pred, doc_true), "verify_tables(doc_pred, doc_true)"
pred_item = doc_pred_json["output"]["main_text"][l]
assert "text" in pred_item, f"`text` is in {pred_item}"
assert true_item["text"] == pred_item["text"]
def verify_tables(doc_pred_json, doc_true_json):
for l, true_item in enumerate(doc_true_json["output"]["tables"]):
if "data" in true_item:
pred_item = doc_pred_json["output"]["tables"][l]
assert "data" in pred_item, f"`data` is in {pred_item}"
assert len(true_item["data"]) == len(
pred_item["data"]
), "table does not have the same #-rows"
assert len(true_item["data"][0]) == len(
pred_item["data"][0]
), "table does not have the same #-cols"
for i, row in enumerate(true_item["data"]):
for j, col in enumerate(true_item["data"][i]):
if "text" in true_item["data"][i][j]:
assert (
"text" in pred_item["data"][i][j]
), "table-cell does not contain text"
assert (
true_item["data"][i][j]["text"]
== pred_item["data"][i][j]["text"]
), "table-cell does not have the same text"
return True
def verify_json(doc_pred_json, doc_true_json):
if doc_pred_json.keys() != doc_true_json.keys():
return False
if doc_pred_json["output"].keys() != doc_true_json["output"].keys():
return False
assert verify_maintext(
doc_pred_json, doc_true_json
), "verify_maintext(doc_pred_json, doc_true_json)"
assert verify_tables(
doc_pred_json, doc_true_json
), "verify_tables(doc_pred_json, doc_true_json)"
return True return True
@ -145,54 +131,54 @@ def verify_md(doc_pred_md, doc_true_md):
def test_e2e_conversions(): def test_e2e_conversions():
PageList = TypeAdapter(List[Page])
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
converter = get_converter() converter = get_converter()
for path in pdf_paths: for path in pdf_paths:
print(f"converting {path}") print(f"converting {path}")
doc_pred_json = None
doc_true_json = None
try: try:
doc_pred_json = converter.convert_single(path) doc_result: ConversionResult = converter.convert_single(path)
except: except:
continue continue
doc_pred_md = doc_pred_json.render_as_markdown() doc_pred_pages: PageList = doc_result.pages
doc_pred: DsDocument = doc_result.output
doc_pred_md = doc_result.render_as_markdown()
pages_path = path.with_suffix(".pages.json")
json_path = path.with_suffix(".json") json_path = path.with_suffix(".json")
md_path = path.with_suffix(".md") md_path = path.with_suffix(".md")
if GENERATE: if GENERATE: # only used when re-generating truth
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
with open(json_path, "w") as fw: with open(json_path, "w") as fw:
_ = doc_pred_json.model_dump() fw.write(json.dumps(doc_pred, default=pydantic_encoder))
_ = convert_paths(_)
fw.write(json.dumps(_, indent=2))
with open(md_path, "w") as fw: with open(md_path, "w") as fw:
fw.write(doc_pred_md) fw.write(doc_pred_md)
else: # default branch in test
else: with open(pages_path, "r") as fr:
doc_true_pages = PageList.validate_python(json.load(fr))
with open(json_path, "r") as fr: with open(json_path, "r") as fr:
doc_true_json = json.load(fr) doc_true = DsDocument.model_validate(json.load(fr))
with open(md_path, "r") as fr: with open(md_path, "r") as fr:
doc_true_md = "".join(fr.readlines()) doc_true_md = "".join(fr.readlines())
assert verify_cells( assert verify_cells(
doc_pred_json.model_dump(), doc_true_json doc_pred_pages, doc_true_pages
), f"verify_cells(doc_pred_json, doc_true_json) for {path}" ), f"Mismatch in PDF cell prediction for {path}"
# assert verify_json( assert verify_output(
# doc_pred_json.model_dump(), doc_true_json doc_pred, doc_true
# ), f"failed json prediction for {path}" ), f"Mismatch in JSON prediction for {path}"
assert verify_md( assert verify_md(
doc_pred_md, doc_true_md doc_pred_md, doc_true_md
), f"failed md prediction for {path}" ), f"Mismatch in Markdown prediction for {path}"