feat: [Beta] Extraction with schema (#2138)

* Add DocumentConverter.extract and full extraction pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add DocumentConverter.extract template arg Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add NuExtract model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add Extraction pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add proper test, support pydantic class types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add qr bill example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add base_extraction_pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update typing of ExtractionResult and inner fields Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Factor out extract to DocumentExtractor Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Address mypy issues Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add DocumentExtractor Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Resolve circular import issue Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports, remove Optional for template arg Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move new type definitions into datamodel Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update comments Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Respect page-range, disable test_extraction for CI Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-09-01 16:09:48 +02:00
parent a283ccff25
commit 9f4bc5b2f1
14 changed files with 1171 additions and 14 deletions
--- a/tests/data_scanned/qr_bill_example.jpg
+++ b/tests/data_scanned/qr_bill_example.jpg
--- a/tests/test_extraction.py
+++ b/tests/test_extraction.py
@@ -0,0 +1,108 @@
+"""
+Unit tests for document extraction functionality.
+"""
+
+import os
+from pathlib import Path
+
+import pytest
+from pydantic import BaseModel, Field
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+from docling.document_extractor import DocumentExtractor
+
+IS_CI = bool(os.getenv("CI"))
+
+
+class ExampleTemplate(BaseModel):
+    bill_no: str = Field(
+        examples=["A123", "5414"]
+    )  # sample values only; neither equals the bill number the tests expect ("3139")
+    total: float = Field(
+        default=10.0, examples=[20.0]
+    )  # has a default and one example; both differ from the total the tests expect (3949.75)
+
+
+@pytest.fixture
+def extractor() -> DocumentExtractor:
+    """Create a DocumentExtractor configured with the IMAGE and PDF allowed formats."""
+
+    return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
+
+
+@pytest.fixture
+def test_file_path() -> Path:
+    """Return the path to data_scanned/qr_bill_example.jpg, resolved relative to this file."""
+    return Path(__file__).parent / "data_scanned" / "qr_bill_example.jpg"
+    # NOTE(review): leftover alternative input, kept commented out: Path("tests/data/pdf/code_and_formula.pdf")
+
+
+@pytest.mark.skipif(
+    IS_CI, reason="Skipping test in CI because the dataset is too heavy."
+)
+def test_extraction_with_string_template(
+    extractor: DocumentExtractor, test_file_path: Path
+) -> None:
+    """Extract with a JSON string template and check the single page's bill number and total."""
+    str_templ = '{"bill_no": "string", "total": "number"}'
+
+    result = extractor.extract(test_file_path, template=str_templ)
+
+    print(result.pages)  # NOTE(review): debug output, not part of the checks
+
+    assert result.status is not None  # only checks that a status is set, not which one
+    assert len(result.pages) == 1
+    assert result.pages[0].extracted_data["bill_no"] == "3139"
+    assert result.pages[0].extracted_data["total"] == 3949.75
+
+
+@pytest.mark.skipif(
+    IS_CI, reason="Skipping test in CI because the dataset is too heavy."
+)
+def test_extraction_with_dict_template(
+    extractor: DocumentExtractor, test_file_path: Path
+) -> None:
+    """Extract with a plain dict template and check the single page's bill number and total."""
+    dict_templ = {
+        "bill_no": "string",
+        "total": "number",
+    }
+
+    result = extractor.extract(test_file_path, template=dict_templ)
+
+    assert len(result.pages) == 1
+    assert result.pages[0].extracted_data["bill_no"] == "3139"
+    assert result.pages[0].extracted_data["total"] == 3949.75
+
+
+@pytest.mark.skipif(
+    IS_CI, reason="Skipping test in CI because the dataset is too heavy."
+)
+def test_extraction_with_pydantic_instance_template(
+    extractor: DocumentExtractor, test_file_path: Path
+) -> None:
+    """Extract with an ExampleTemplate instance as template and check the single page's values."""
+    pydantic_instance_templ = ExampleTemplate(bill_no="4321")  # "4321" is a placeholder, not the expected bill number
+
+    result = extractor.extract(test_file_path, template=pydantic_instance_templ)
+
+    assert len(result.pages) == 1
+    assert result.pages[0].extracted_data["bill_no"] == "3139"
+    assert result.pages[0].extracted_data["total"] == 3949.75
+
+
+@pytest.mark.skipif(
+    IS_CI, reason="Skipping test in CI because the dataset is too heavy."
+)
+def test_extraction_with_pydantic_class_template(
+    extractor: DocumentExtractor, test_file_path: Path
+) -> None:
+    """Extract with the ExampleTemplate class itself as template and check the single page's values."""
+    pydantic_class_templ = ExampleTemplate
+
+    result = extractor.extract(test_file_path, template=pydantic_class_templ)
+
+    assert len(result.pages) == 1
+    assert result.pages[0].extracted_data["bill_no"] == "3139"
+    assert result.pages[0].extracted_data["total"] == 3949.75