mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
feat: add PATENT_USPTO as input format
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
parent
3e599c7bbe
commit
848355d74e
45
docling/backend/patent_uspto_backend.py
Normal file
45
docling/backend/patent_uspto_backend.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
"""Backend to parse patents from the United States Patent Office (USPTO).
|
||||||
|
|
||||||
|
The parsers included in this module can handle patent grants published since 1976 and
|
||||||
|
patent applications since 2001.
|
||||||
|
The original files can be found in https://bulkdata.uspto.gov.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from docling_core.types.doc import DoclingDocument
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend registered for the ``PATENT_USPTO`` input format.

    As written, this is a skeleton: ``is_valid`` always answers ``False`` and
    ``convert`` yields a document that carries only a name and no content.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Hand the input document and its source over to the base backend.

        Args:
            in_doc: The input document descriptor this backend is bound to.
            path_or_stream: Either a filesystem path or an in-memory byte
                stream holding the raw input.
        """
        super().__init__(in_doc, path_or_stream)

    def is_valid(self) -> bool:
        """Return ``False`` unconditionally (no validation is implemented)."""
        return False

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``: this backend does not declare pagination support."""
        return False

    def unload(self):
        """Do nothing and return ``None``."""
        return None

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the single-element set containing ``InputFormat.PATENT_USPTO``."""
        return {InputFormat.PATENT_USPTO}

    def convert(self) -> DoclingDocument:
        """Build and return an otherwise empty ``DoclingDocument``.

        The document name is the stem of ``self.file``; when that stem is
        empty, the literal ``"file"`` is used instead.
        """
        doc_name = self.file.stem or "file"
        return DoclingDocument(name=doc_name)
|
@ -36,6 +36,7 @@ class InputFormat(str, Enum):
|
|||||||
ASCIIDOC = "asciidoc"
|
ASCIIDOC = "asciidoc"
|
||||||
MD = "md"
|
MD = "md"
|
||||||
XLSX = "xlsx"
|
XLSX = "xlsx"
|
||||||
|
PATENT_USPTO = "uspto"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||||
InputFormat.XLSX: ["xlsx"],
|
InputFormat.XLSX: ["xlsx"],
|
||||||
|
InputFormat.PATENT_USPTO: ["xml", "txt"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||||
@ -81,6 +83,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.XLSX: [
|
InputFormat.XLSX: [
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
],
|
],
|
||||||
|
InputFormat.PATENT_USPTO: ["application/xml", "text/plain"],
|
||||||
}
|
}
|
||||||
|
|
||||||
MimeTypeToFormat = {
|
MimeTypeToFormat = {
|
||||||
|
@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|||||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
|
from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DoclingComponentType,
|
DoclingComponentType,
|
||||||
@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
|
|||||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class PdfFormatOption(FormatOption):
|
class PatentUsptoFormatOption(FormatOption):
|
||||||
|
pipeline_cls: Type = SimplePipeline
|
||||||
|
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
|
class ImageFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = StandardPdfPipeline
|
pipeline_cls: Type = StandardPdfPipeline
|
||||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class ImageFormatOption(FormatOption):
|
class PdfFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = StandardPdfPipeline
|
pipeline_cls: Type = StandardPdfPipeline
|
||||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||||
|
|
||||||
@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
InputFormat.HTML: FormatOption(
|
InputFormat.HTML: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||||
),
|
),
|
||||||
|
InputFormat.PATENT_USPTO: FormatOption(
|
||||||
|
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||||
|
),
|
||||||
InputFormat.IMAGE: FormatOption(
|
InputFormat.IMAGE: FormatOption(
|
||||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||||
),
|
),
|
||||||
|
Loading…
Reference in New Issue
Block a user