From 6455579a9097ff8055af129451fa8a5ab542faf1 Mon Sep 17 00:00:00 2001
From: Viktor Kuropiatnyk
Date: Thu, 18 Sep 2025 10:51:01 +0200
Subject: [PATCH] Stub for implementing uspto backend meta-data extraction

Signed-off-by: Viktor Kuropiatnyk
---
 docling/backend/abstract_backend.py  | 4 ++++
 docling/backend/xml/uspto_backend.py | 4 ++++
 docling/datamodel/document.py        | 1 +
 docling/pipeline/simple_pipeline.py  | 1 +
 4 files changed, 10 insertions(+)

diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
index 491330b3..d6f52c33 100644
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -38,6 +38,10 @@ class AbstractDocumentBackend(ABC):
     def supported_formats(cls) -> Set["InputFormat"]:
         pass
 
+    def extract_metadata(self) -> Dict[str, Any]:
+        """Return backend-specific metadata; empty unless a backend overrides it."""
+        return {}
+
 
 class PaginatedDocumentBackend(AbstractDocumentBackend):
     """DeclarativeDocumentBackend.
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index 268b80ad..25099a03 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -147,6 +147,10 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
                 f"Cannot convert doc (hash={self.document_hash}, "
                 f"name={self.file.name}) because the backend failed to init."
) + + @override + def extract_metadata(self) -> Dict[str, Any]: + return {} class PatentUspto(ABC): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 7955ff9d..b3cce5d5 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -207,6 +207,7 @@ class ConversionResult(BaseModel): confidence: ConfidenceReport = Field(default_factory=ConfidenceReport) document: DoclingDocument = _EMPTY_DOCLING_DOC + metadata: Dict[str, Any] = {} @property @deprecated("Use document instead.") diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index 0e3f1b6f..7b12dfd6 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -38,6 +38,7 @@ class SimplePipeline(ConvertPipeline): # a DoclingDocument straight. with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): conv_res.document = conv_res.input._backend.convert() + conv_res.metadata = conv_res.input._backend.extract_metadata() return conv_res def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: