mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
* Add DocumentConverter.extract and full extraction pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add DocumentConverter.extract template arg Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add NuExtract model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add Extraction pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add proper test, support pydantic class types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add qr bill example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add base_extraction_pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update typing of ExtractionResult and inner fields Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Factor out extract to DocumentExtractor Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Address mypy issues Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add DocumentExtractor Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Resolve circular import issue Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports, remove Optional for template arg Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move new type definitions into datamodel Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update comments Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Respect page-range, disable test_extraction for CI Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
40 lines
1.3 KiB
Python
40 lines
1.3 KiB
Python
"""Data models for document extraction functionality."""
|
|
|
|
from typing import Any, Dict, List, Optional, Type, Union
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from docling.datamodel.base_models import ConversionStatus, ErrorItem
|
|
from docling.datamodel.document import InputDocument
|
|
|
|
|
|
class ExtractedPageData(BaseModel):
    """Extraction output for one page of a document.

    Holds the page number together with whatever was extracted from that
    page: optional structured data, optional raw text, and a list of error
    messages (empty by default).
    """

    # Page number, counted from 1.
    page_no: int = Field(..., description="1-indexed page number")

    # Structured extraction result; stays None when not provided.
    extracted_data: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Extracted structured data from the page",
    )

    # Raw extracted text; stays None when not provided.
    raw_text: Optional[str] = Field(default=None, description="Raw extracted text")

    # Per-page error messages; each instance starts with a fresh empty list.
    errors: List[str] = Field(
        description="Any errors encountered during extraction for this page",
        default_factory=list,
    )
|
|
|
|
|
|
class ExtractionResult(BaseModel):
    """Result of document extraction.

    Bundles the input document with the extraction status, any
    document-level errors, and the per-page extraction results.
    """

    # The document that extraction was run on.
    input: InputDocument

    # Extraction status; starts out as PENDING.
    status: ConversionStatus = ConversionStatus.PENDING

    # Document-level errors. Declared with default_factory so the default is
    # expressed the same way as the other list fields in this module instead
    # of a bare mutable literal.
    errors: List[ErrorItem] = Field(default_factory=list)

    # Pages field - always a list for consistency
    pages: List[ExtractedPageData] = Field(
        default_factory=list, description="Extracted data from each page"
    )
|
|
|
|
|
|
# Type alias for template arguments: a string, a dict, a BaseModel instance, or a BaseModel subclass
|
|
ExtractionTemplateType = Union[
    str,
    Dict[str, Any],
    BaseModel,
    Type[BaseModel],
]
|