mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
adding extraction script
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
149
docs/examples/extraction.py
vendored
Normal file
149
docs/examples/extraction.py
vendored
Normal file
@@ -0,0 +1,149 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from typing import Optional, Sequence
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_extractor import DocumentExtractor
|
||||
|
||||
DEFAULT_SOURCE = (
|
||||
"https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg"
|
||||
)
|
||||
|
||||
|
||||
def build_extractor() -> DocumentExtractor:
|
||||
return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
|
||||
|
||||
|
||||
def example_with_string_template(source: str = DEFAULT_SOURCE) -> None:
|
||||
extractor = build_extractor()
|
||||
result = extractor.extract(
|
||||
source=source,
|
||||
template='{"bill_no": "string", "total": "float"}',
|
||||
)
|
||||
print(result.pages)
|
||||
|
||||
|
||||
def example_with_dict_template(source: str = DEFAULT_SOURCE) -> None:
|
||||
extractor = build_extractor()
|
||||
result = extractor.extract(
|
||||
source=source,
|
||||
template={
|
||||
"bill_no": "string",
|
||||
"total": "float",
|
||||
},
|
||||
)
|
||||
print(result.pages)
|
||||
|
||||
|
||||
class Invoice(BaseModel):
|
||||
bill_no: str = Field(examples=["A123", "5414"]) # examples only
|
||||
total: float = Field(default=10, examples=[20])
|
||||
tax_id: Optional[str] = Field(default=None, examples=["1234567890"])
|
||||
|
||||
|
||||
def example_with_pydantic_template(source: str = DEFAULT_SOURCE) -> None:
|
||||
extractor = build_extractor()
|
||||
|
||||
# Using the model class directly
|
||||
result = extractor.extract(
|
||||
source=source,
|
||||
template=Invoice,
|
||||
)
|
||||
print(result.pages)
|
||||
|
||||
# Using a model instance with defaults/overrides
|
||||
result = extractor.extract(
|
||||
source=source,
|
||||
template=Invoice(
|
||||
bill_no="41",
|
||||
total=100,
|
||||
tax_id="42",
|
||||
),
|
||||
)
|
||||
print(result.pages)
|
||||
|
||||
|
||||
class Contact(BaseModel):
|
||||
name: Optional[str] = Field(default=None, examples=["Smith"])
|
||||
address: str = Field(default="123 Main St", examples=["456 Elm St"])
|
||||
postal_code: str = Field(default="12345", examples=["67890"])
|
||||
city: str = Field(default="Anytown", examples=["Othertown"])
|
||||
country: Optional[str] = Field(default=None, examples=["Canada"])
|
||||
|
||||
|
||||
class ExtendedInvoice(BaseModel):
|
||||
bill_no: str = Field(examples=["A123", "5414"]) # examples only
|
||||
total: float = Field(default=10, examples=[20])
|
||||
garden_work_hours: int = Field(default=1, examples=[2])
|
||||
sender: Contact = Field(default=Contact(), examples=[Contact()])
|
||||
receiver: Contact = Field(default=Contact(), examples=[Contact()])
|
||||
|
||||
|
||||
def example_with_advanced_pydantic_template(source: str = DEFAULT_SOURCE) -> None:
|
||||
extractor = build_extractor()
|
||||
result = extractor.extract(
|
||||
source=source,
|
||||
template=ExtendedInvoice,
|
||||
)
|
||||
print(result.pages)
|
||||
|
||||
if result.pages:
|
||||
invoice = ExtendedInvoice.model_validate(result.pages[0].extracted_data)
|
||||
print(invoice)
|
||||
print(
|
||||
f"Invoice #{invoice.bill_no} was sent by {invoice.sender.name} "
|
||||
f"to {invoice.receiver.name} at {invoice.sender.address}."
|
||||
)
|
||||
|
||||
|
||||
def run_all_examples(source: str = DEFAULT_SOURCE) -> None:
|
||||
print("\n-- Example: string template --")
|
||||
example_with_string_template(source)
|
||||
|
||||
print("\n-- Example: dict template --")
|
||||
example_with_dict_template(source)
|
||||
|
||||
print("\n-- Example: Pydantic model --")
|
||||
example_with_pydantic_template(source)
|
||||
|
||||
print("\n-- Example: Advanced Pydantic model --")
|
||||
example_with_advanced_pydantic_template(source)
|
||||
|
||||
|
||||
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Docling extraction examples")
|
||||
parser.add_argument(
|
||||
"--source",
|
||||
type=str,
|
||||
default=DEFAULT_SOURCE,
|
||||
help="Path or URL to the input document/image",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--example",
|
||||
choices=["string", "dict", "pydantic", "advanced", "all"],
|
||||
default="all",
|
||||
help="Which example to run",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: Optional[Sequence[str]] = None) -> None:
|
||||
args = parse_args(argv)
|
||||
match args.example:
|
||||
case "string":
|
||||
example_with_string_template(args.source)
|
||||
case "dict":
|
||||
example_with_dict_template(args.source)
|
||||
case "pydantic":
|
||||
example_with_pydantic_template(args.source)
|
||||
case "advanced":
|
||||
example_with_advanced_pydantic_template(args.source)
|
||||
case _:
|
||||
run_all_examples(args.source)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user