mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
feat: add PATENT_USPTO as input format
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
parent
3e599c7bbe
commit
848355d74e
45
docling/backend/patent_uspto_backend.py
Normal file
45
docling/backend/patent_uspto_backend.py
Normal file
@ -0,0 +1,45 @@
|
||||
"""Backend to parse patents from the United States Patent Office (USPTO).
|
||||
|
||||
The parsers included in this module can handle patent grants published since 1976 and
|
||||
patent applications since 2001.
|
||||
The original files can be found in https://bulkdata.uspto.gov.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend registered for the ``PATENT_USPTO`` input format.

    As written, this class is a placeholder: ``is_valid`` always reports
    ``False`` and ``convert`` produces a document that carries only a name.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Hand the input document and its source over to the base backend.

        Args:
            in_doc: The input document descriptor.
            path_or_stream: The source, either an in-memory stream or a path.
        """
        super().__init__(in_doc, path_or_stream)

    def is_valid(self) -> bool:
        """Report whether the backend holds a usable document (always ``False``)."""
        return False

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report whether the backend supports pagination (always ``False``)."""
        return False

    def unload(self):
        """Do nothing; this backend performs no cleanup of its own."""

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats served by this backend."""
        return {InputFormat.PATENT_USPTO}

    def convert(self) -> DoclingDocument:
        """Create a ``DoclingDocument`` named after the input file.

        The name is the stem of ``self.file`` (presumably set by the base
        class -- confirm there), or ``"file"`` when that stem is empty.
        No content is added to the document.
        """
        doc_name = self.file.stem or "file"
        return DoclingDocument(name=doc_name)
|
@ -36,6 +36,7 @@ class InputFormat(str, Enum):
|
||||
ASCIIDOC = "asciidoc"
|
||||
MD = "md"
|
||||
XLSX = "xlsx"
|
||||
PATENT_USPTO = "uspto"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
InputFormat.XLSX: ["xlsx"],
|
||||
InputFormat.PATENT_USPTO: ["xml", "txt"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
@ -81,6 +83,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.XLSX: [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
],
|
||||
InputFormat.PATENT_USPTO: ["application/xml", "text/plain"],
|
||||
}
|
||||
|
||||
MimeTypeToFormat = {
|
||||
|
@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
|
||||
|
||||
class PdfFormatOption(FormatOption):
|
||||
class PatentUsptoFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
class PdfFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||
|
||||
@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.PATENT_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
),
|
||||
|
Loading…
Reference in New Issue
Block a user