mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
refactor: change the name of the USPTO input format
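Rename the USPTO patent input format from `InputFormat.PATENT_USPTO` to `InputFormat.XML_USPTO` so that the name reflects the typical file format (XML), and rename the backend module from `patent_uspto_backend` to `xml_uspto_backend` accordingly. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>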
This commit is contained in:
parent
0a35f45092
commit
c957901239
@ -110,7 +110,7 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
@classmethod
|
@classmethod
|
||||||
@override
|
@override
|
||||||
def supported_formats(cls) -> set[InputFormat]:
|
def supported_formats(cls) -> set[InputFormat]:
|
||||||
return {InputFormat.PATENT_USPTO}
|
return {InputFormat.XML_USPTO}
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
@ -36,7 +36,7 @@ class InputFormat(str, Enum):
|
|||||||
ASCIIDOC = "asciidoc"
|
ASCIIDOC = "asciidoc"
|
||||||
MD = "md"
|
MD = "md"
|
||||||
XLSX = "xlsx"
|
XLSX = "xlsx"
|
||||||
PATENT_USPTO = "uspto"
|
XML_USPTO = "uspto"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -56,7 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||||
InputFormat.XLSX: ["xlsx"],
|
InputFormat.XLSX: ["xlsx"],
|
||||||
InputFormat.PATENT_USPTO: ["xml", "txt"],
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||||
@ -83,7 +83,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.XLSX: [
|
InputFormat.XLSX: [
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
],
|
],
|
||||||
InputFormat.PATENT_USPTO: ["application/xml", "text/plain"],
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||||
}
|
}
|
||||||
|
|
||||||
MimeTypeToFormat = {
|
MimeTypeToFormat = {
|
||||||
|
@ -15,7 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|||||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend
|
from docling.backend.xml_uspto_backend import PatentUsptoDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DoclingComponentType,
|
DoclingComponentType,
|
||||||
@ -118,7 +118,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
InputFormat.HTML: FormatOption(
|
InputFormat.HTML: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||||
),
|
),
|
||||||
InputFormat.PATENT_USPTO: FormatOption(
|
InputFormat.XML_USPTO: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||||
),
|
),
|
||||||
InputFormat.IMAGE: FormatOption(
|
InputFormat.IMAGE: FormatOption(
|
||||||
|
@ -12,7 +12,7 @@ import yaml
|
|||||||
from docling_core.types import DoclingDocument
|
from docling_core.types import DoclingDocument
|
||||||
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
||||||
|
|
||||||
from docling.backend.patent_uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
from docling.backend.xml_uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
@ -45,7 +45,7 @@ def patents() -> list[tuple[Path, DoclingDocument]]:
|
|||||||
for in_path in patent_paths:
|
for in_path in patent_paths:
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
path_or_stream=in_path,
|
path_or_stream=in_path,
|
||||||
format=InputFormat.PATENT_USPTO,
|
format=InputFormat.XML_USPTO,
|
||||||
backend=PatentUsptoDocumentBackend,
|
backend=PatentUsptoDocumentBackend,
|
||||||
)
|
)
|
||||||
backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)
|
backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)
|
||||||
|
Loading…
Reference in New Issue
Block a user