feat: add PATENT_USPTO as input format

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis 2024-12-02 12:48:09 +01:00 committed by Cesar Berrospi Ramis
parent 3e599c7bbe
commit 848355d74e
3 changed files with 59 additions and 2 deletions

View File

@ -0,0 +1,45 @@
"""Backend to parse patents from the United States Patent Office (USPTO).
The parsers included in this module can handle patent grants published since 1976 and
patent applications since 2001.
The original files can be found in https://bulkdata.uspto.gov.
"""
import logging
from io import BytesIO
from pathlib import Path
from typing import Union
from docling_core.types.doc import DoclingDocument
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend registered for ``InputFormat.PATENT_USPTO``.

    NOTE(review): every method below is a stub -- ``is_valid`` always answers
    ``False`` and ``convert`` returns a document that carries only a name --
    so this class looks like scaffolding for a later parser; confirm.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Create the backend by delegating to the parent constructor.

        Args:
            in_doc: Input document, forwarded unchanged to the parent class.
            path_or_stream: Source of the document, either a filesystem path
                or an in-memory byte stream; forwarded unchanged as well.
        """
        super().__init__(in_doc, path_or_stream)

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.PATENT_USPTO}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that this backend does not support pagination."""
        return False

    def is_valid(self) -> bool:
        """Return ``False`` unconditionally; no validation is performed."""
        return False

    def unload(self):
        """Do nothing and return ``None``; no cleanup is performed here."""

    def convert(self) -> DoclingDocument:
        """Build a ``DoclingDocument`` that only has its name set.

        The name is the stem of ``self.file``; when the stem is empty the
        literal ``"file"`` is used instead.
        """
        # presumably ``self.file`` is populated by the parent constructor --
        # verify against DeclarativeDocumentBackend.
        doc_name = self.file.stem or "file"
        return DoclingDocument(name=doc_name)

View File

@ -36,6 +36,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
PATENT_USPTO = "uspto"
class OutputFormat(str, Enum):
@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.PATENT_USPTO: ["xml", "txt"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
@ -81,6 +83,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.PATENT_USPTO: ["application/xml", "text/plain"],
}
MimeTypeToFormat = {

View File

@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
class PdfFormatOption(FormatOption):
# Format option whose defaults pair SimplePipeline (pipeline class) with
# PatentUsptoDocumentBackend (backend class).
# NOTE(review): the backend annotation below is narrower than the
# Type[AbstractDocumentBackend] used by the neighbouring format options --
# confirm the restriction is intended.
class PatentUsptoFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
class ImageFormatOption(FormatOption):
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.PATENT_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),