mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: [Beta] Extraction with schema (#2138)
* Add DocumentConverter.extract and full extraction pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add DocumentConverter.extract template arg Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add NuExtract model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add Extraction pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add proper test, support pydantic class types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add qr bill example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add base_extraction_pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update typing of ExtractionResult and inner fields Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Factor out extract to DocumentExtractor Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Address mypy issues Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add DocumentExtractor Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Resolve circular import issue Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports, remove Optional for template arg Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move new type definitions into datamodel Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update comments Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Respect page-range, disable test_extraction for CI Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
@@ -32,6 +32,18 @@ from pydantic import (
|
||||
if TYPE_CHECKING:
|
||||
from docling.backend.pdf_backend import PdfPageBackend
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
|
||||
|
||||
class BaseFormatOption(BaseModel):
|
||||
"""Base class for format options used by _DocumentConversionInput."""
|
||||
|
||||
pipeline_options: Optional[PipelineOptions] = None
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
PENDING = "pending"
|
||||
|
||||
Reference in New Issue
Block a user