Merge branch 'dev/add-strict-tests' of github.com:DS4SD/docling into dev/add-strict-tests

This commit is contained in:
Michele Dolfi 2024-08-28 14:57:14 +02:00
commit 52b25bf030
14 changed files with 143 additions and 1231614 deletions

View File

@ -238,9 +238,9 @@ class EquationPrediction(BaseModel):
class PagePredictions(BaseModel):
    """Group the prediction results that can be attached to a page.

    Each field holds the result of one kind of prediction (layout, table
    structure, figure classification, equations). Every field defaults to
    ``None`` and is annotated ``Optional`` so the annotation agrees with the
    default and an explicit ``None`` is accepted.
    """

    # Each field is declared exactly once; an earlier duplicate declaration
    # with a non-Optional annotation would simply be overridden by a later one.
    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
PageElement = Union[TextElement, TableElement, FigureElement]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,3 @@
import glob
from pathlib import Path
import pytest
@ -8,7 +7,6 @@ from docling.backend.docling_parse_backend import (
DoclingParsePageBackend,
)
from docling.datamodel.base_models import BoundingBox
from docling.document_converter import DocumentConverter
@pytest.fixture
@ -16,6 +14,27 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf")
def test_text_cell_counts():
    """Check that reloading a page yields a stable number of text cells.

    Every page of the sample PDF is loaded ten times through
    ``DoclingParseDocumentBackend``; the text-cell count of each load must
    equal the count observed on the first load of that page.
    """
    pdf_doc = Path("./tests/data/redp5695.pdf")
    doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz")

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # Load the page currently being iterated (not a fixed page 0),
            # otherwise only the first page would ever be exercised.
            page_backend: DoclingParsePageBackend = doc_backend.load_page(page_index)
            cell_count = len(list(page_backend.get_text_cells()))

            # The first load of each page establishes the reference count.
            if last_cell_count is None:
                last_cell_count = cell_count

            assert (
                cell_count == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"
def test_get_text_from_rect(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

View File

@ -14,6 +14,27 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf")
def test_text_cell_counts():
    """Check that reloading a page yields a stable number of text cells.

    Every page of the sample PDF is loaded ten times through
    ``PyPdfiumDocumentBackend``; the text-cell count of each load must equal
    the count observed on the first load of that page.
    """
    pdf_doc = Path("./tests/data/redp5695.pdf")
    doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz")

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # Load the page currently being iterated (not a fixed page 0),
            # otherwise only the first page would ever be exercised.
            page_backend: PyPdfiumPageBackend = doc_backend.load_page(page_index)
            cell_count = len(list(page_backend.get_text_cells()))

            # The first load of each page establishes the reference count.
            if last_cell_count is None:
                last_cell_count = cell_count

            assert (
                cell_count == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"
def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

View File

@ -1,9 +1,17 @@
import glob
import json
from pathlib import Path, PosixPath
from typing import List
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from pydantic import TypeAdapter
from pydantic.json import pydantic_encoder
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, Page, PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
GENERATE = False
@ -34,108 +42,86 @@ def get_converter():
return converter
def convert_paths(data):
    """Recursively replace filesystem path objects with their string form.

    Dicts and lists are rebuilt with every nested value converted. Any
    ``pathlib.Path`` instance becomes ``str(path)``. All other values,
    including tuples and their contents, are returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_paths(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_paths(item) for item in data]
    # Check against Path rather than PosixPath so that every concrete path
    # flavour (including WindowsPath) is stringified, not only POSIX paths.
    if isinstance(data, Path):
        return str(data)
    return data
def verify_cells(doc_pred_pages, doc_true_pages):
def verify_cells(doc_pred_json, doc_true_json):
assert len(doc_pred_json["pages"]) == len(
doc_true_json["pages"]
assert len(doc_pred_pages) == len(
doc_true_pages
), "pred- and true-doc do not have the same number of pages"
for pid, page_true_item in enumerate(doc_true_json["pages"]):
for pid, page_true_item in enumerate(doc_true_pages):
num_true_cells = len(page_true_item["cells"])
num_pred_cells = len(doc_pred_json["pages"][pid]["cells"])
num_true_cells = len(page_true_item.cells)
num_pred_cells = len(doc_pred_pages[pid].cells)
assert (
num_true_cells == num_pred_cells
), f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"
for cid, cell_true_item in enumerate(page_true_item["cells"]):
for cid, cell_true_item in enumerate(page_true_item.cells):
cell_pred_item = doc_pred_json["pages"][pid]["cells"][cid]
cell_pred_item = doc_pred_pages[pid].cells[cid]
true_text = cell_true_item["text"]
pred_text = cell_pred_item["text"]
true_text = cell_true_item.text
pred_text = cell_pred_item.text
assert true_text == pred_text, f"{true_text}!={pred_text}"
for _ in ["t", "b", "l", "r"]:
true_val = round(cell_true_item["bbox"][_])
pred_val = round(cell_pred_item["bbox"][_])
true_bbox = cell_true_item.bbox.as_tuple()
pred_bbox = cell_pred_item.bbox.as_tuple()
assert (
true_bbox == pred_bbox
), f"bbox is not the same: {true_bbox} != {pred_bbox}"
return True
def verify_maintext(doc_pred, doc_true):
    """Assert that predicted and expected main-text entries agree.

    Both documents must expose ``main_text`` sequences of equal length. For
    each expected entry that is a ``BaseText``, the predicted entry at the
    same position must also be a ``BaseText`` carrying identical ``text``.
    Expected entries of any other type are not compared.

    Returns ``True`` when every check passes; a failing check raises
    ``AssertionError``.
    """
    expected_items = doc_true.main_text
    predicted_items = doc_pred.main_text
    assert len(expected_items) == len(
        predicted_items
    ), "document has different length of main-text than expected."

    # Lengths are equal at this point, so pairing by position is exact.
    for true_item, pred_item in zip(expected_items, predicted_items):
        if not isinstance(true_item, BaseText):
            continue
        assert isinstance(
            pred_item, BaseText
        ), f"{pred_item} is not a BaseText element, but {true_item} is."
        assert true_item.text == pred_item.text

    return True
def verify_tables(doc_pred, doc_true):
    """Assert that predicted and expected tables agree.

    Both documents must expose the same number of ``tables``. Tables are
    compared position by position: ``num_rows`` and ``num_cols`` must match,
    and every cell of the expected table's ``data`` grid must have the same
    ``text`` as the predicted cell at the same row and column.

    Returns ``True`` when every check passes; a failing check raises
    ``AssertionError``.
    """
    assert len(doc_true.tables) == len(
        doc_pred.tables
    ), "document has different count of tables than expected."

    # Table counts are equal at this point, so pairing by position is exact.
    for true_item, pred_item in zip(doc_true.tables, doc_pred.tables):
        assert (
            true_item.num_rows == pred_item.num_rows
        ), "table does not have the same #-rows"
        assert (
            true_item.num_cols == pred_item.num_cols
        ), "table does not have the same #-cols"

        # The expected grid drives the iteration; predicted cells are looked
        # up by the same (row, column) indices.
        for i, row in enumerate(true_item.data):
            for j, true_cell in enumerate(row):
                assert (
                    true_cell.text == pred_item.data[i][j].text
                ), "table-cell does not have the same text"

    return True
def verify_maintext(doc_pred_json, doc_true_json):
def verify_output(doc_pred: DsDocument, doc_true: DsDocument):
for l, true_item in enumerate(doc_true_json["output"]["main_text"]):
if "text" in true_item:
pred_item = doc_pred_json["output"]["main_text"][l]
assert "text" in pred_item, f"`text` is in {pred_item}"
assert true_item["text"] == pred_item["text"]
def verify_tables(doc_pred_json, doc_true_json):
for l, true_item in enumerate(doc_true_json["output"]["tables"]):
if "data" in true_item:
pred_item = doc_pred_json["output"]["tables"][l]
assert "data" in pred_item, f"`data` is in {pred_item}"
assert len(true_item["data"]) == len(
pred_item["data"]
), "table does not have the same #-rows"
assert len(true_item["data"][0]) == len(
pred_item["data"][0]
), "table does not have the same #-cols"
for i, row in enumerate(true_item["data"]):
for j, col in enumerate(true_item["data"][i]):
if "text" in true_item["data"][i][j]:
assert (
"text" in pred_item["data"][i][j]
), "table-cell does not contain text"
assert (
true_item["data"][i][j]["text"]
== pred_item["data"][i][j]["text"]
), "table-cell does not have the same text"
return True
def verify_json(doc_pred_json, doc_true_json):
if doc_pred_json.keys() != doc_true_json.keys():
return False
if doc_pred_json["output"].keys() != doc_true_json["output"].keys():
return False
assert verify_maintext(
doc_pred_json, doc_true_json
), "verify_maintext(doc_pred_json, doc_true_json)"
assert verify_tables(
doc_pred_json, doc_true_json
), "verify_tables(doc_pred_json, doc_true_json)"
assert verify_maintext(doc_pred, doc_true), "verify_maintext(doc_pred, doc_true)"
assert verify_tables(doc_pred, doc_true), "verify_tables(doc_pred, doc_true)"
return True
@ -145,54 +131,54 @@ def verify_md(doc_pred_md, doc_true_md):
def test_e2e_conversions():
PageList = TypeAdapter(List[Page])
pdf_paths = get_pdf_paths()
converter = get_converter()
for path in pdf_paths:
print(f"converting {path}")
doc_pred_json = None
doc_true_json = None
try:
doc_pred_json = converter.convert_single(path)
doc_result: ConversionResult = converter.convert_single(path)
except:
continue
doc_pred_md = doc_pred_json.render_as_markdown()
doc_pred_pages: PageList = doc_result.pages
doc_pred: DsDocument = doc_result.output
doc_pred_md = doc_result.render_as_markdown()
pages_path = path.with_suffix(".pages.json")
json_path = path.with_suffix(".json")
md_path = path.with_suffix(".md")
if GENERATE:
if GENERATE: # only used when re-generating truth
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
with open(json_path, "w") as fw:
_ = doc_pred_json.model_dump()
_ = convert_paths(_)
fw.write(json.dumps(_, indent=2))
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
with open(md_path, "w") as fw:
fw.write(doc_pred_md)
else:
else: # default branch in test
with open(pages_path, "r") as fr:
doc_true_pages = PageList.validate_python(json.load(fr))
with open(json_path, "r") as fr:
doc_true_json = json.load(fr)
doc_true = DsDocument.model_validate(json.load(fr))
with open(md_path, "r") as fr:
doc_true_md = "".join(fr.readlines())
assert verify_cells(
doc_pred_json.model_dump(), doc_true_json
), f"verify_cells(doc_pred_json, doc_true_json) for {path}"
doc_pred_pages, doc_true_pages
), f"Mismatch in PDF cell prediction for {path}"
# assert verify_json(
# doc_pred_json.model_dump(), doc_true_json
# ), f"failed json prediction for {path}"
assert verify_output(
doc_pred, doc_true
), f"Mismatch in JSON prediction for {path}"
assert verify_md(
doc_pred_md, doc_true_md
), f"failed md prediction for {path}"
), f"Mismatch in Markdown prediction for {path}"