feat: [Beta] Extraction with schema (#2138)

* Add DocumentConverter.extract and full extraction pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add DocumentConverter.extract template arg

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add NuExtract model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add Extraction pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add proper test, support pydantic class types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add qr bill example

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add base_extraction_pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update typing of ExtractionResult and inner fields

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Factor out extract to DocumentExtractor

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Address mypy issues

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add DocumentExtractor

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Resolve circular import issue

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Clean up imports, remove Optional for template arg

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Move new type definitions into datamodel

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update comments

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Respect page-range, disable test_extraction for CI

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-09-01 16:09:48 +02:00
committed by GitHub
parent a283ccff25
commit 9f4bc5b2f1
14 changed files with 1171 additions and 14 deletions

BIN
tests/data_scanned/qr_bill_example.jpg vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 285 KiB

108
tests/test_extraction.py Normal file
View File

@@ -0,0 +1,108 @@
"""
Test unit for document extraction functionality.
"""
import os
from pathlib import Path
import pytest
from pydantic import BaseModel, Field
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
from docling.document_extractor import DocumentExtractor
IS_CI = bool(os.getenv("CI"))
class ExampleTemplate(BaseModel):
bill_no: str = Field(
examples=["A123", "5414"]
) # provide some examples, but not the actual value of the test sample
total: float = Field(
default=10.0, examples=[20.0]
) # provide a default value and some examples
@pytest.fixture
def extractor() -> DocumentExtractor:
"""Create a document converter instance for testing."""
return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
@pytest.fixture
def test_file_path() -> Path:
"""Get the path to the test QR bill image."""
return Path(__file__).parent / "data_scanned" / "qr_bill_example.jpg"
# return Path("tests/data/pdf/code_and_formula.pdf")
@pytest.mark.skipif(
IS_CI, reason="Skipping test in CI because the dataset is too heavy."
)
def test_extraction_with_string_template(
extractor: DocumentExtractor, test_file_path: Path
) -> None:
"""Test extraction using string template."""
str_templ = '{"bill_no": "string", "total": "number"}'
result = extractor.extract(test_file_path, template=str_templ)
print(result.pages)
assert result.status is not None
assert len(result.pages) == 1
assert result.pages[0].extracted_data["bill_no"] == "3139"
assert result.pages[0].extracted_data["total"] == 3949.75
@pytest.mark.skipif(
IS_CI, reason="Skipping test in CI because the dataset is too heavy."
)
def test_extraction_with_dict_template(
extractor: DocumentExtractor, test_file_path: Path
) -> None:
"""Test extraction using dictionary template."""
dict_templ = {
"bill_no": "string",
"total": "number",
}
result = extractor.extract(test_file_path, template=dict_templ)
assert len(result.pages) == 1
assert result.pages[0].extracted_data["bill_no"] == "3139"
assert result.pages[0].extracted_data["total"] == 3949.75
@pytest.mark.skipif(
IS_CI, reason="Skipping test in CI because the dataset is too heavy."
)
def test_extraction_with_pydantic_instance_template(
extractor: DocumentExtractor, test_file_path: Path
) -> None:
"""Test extraction using pydantic instance template."""
pydantic_instance_templ = ExampleTemplate(bill_no="4321")
result = extractor.extract(test_file_path, template=pydantic_instance_templ)
assert len(result.pages) == 1
assert result.pages[0].extracted_data["bill_no"] == "3139"
assert result.pages[0].extracted_data["total"] == 3949.75
@pytest.mark.skipif(
IS_CI, reason="Skipping test in CI because the dataset is too heavy."
)
def test_extraction_with_pydantic_class_template(
extractor: DocumentExtractor, test_file_path: Path
) -> None:
"""Test extraction using pydantic class template."""
pydantic_class_templ = ExampleTemplate
result = extractor.extract(test_file_path, template=pydantic_class_templ)
assert len(result.pages) == 1
assert result.pages[0].extracted_data["bill_no"] == "3139"
assert result.pages[0].extracted_data["total"] == 3949.75