mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: create a backend to parse USPTO patents into DoclingDocument (#606)
* feat: add PATENT_USPTO as input format Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat: add USPTO backend parser Add a backend implementation to parse patent applications and grants from the United States Patent Office (USPTO). Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor: change the name of the USPTO input format Change the name of the patent USPTO input format to show the typical format (XML). Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor: address several input formats with same mime type Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor: group XML backend parsers in a subfolder Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore: add safe initialization of PatentUsptoDocumentBackend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
3e599c7bbe
commit
4e087504cc
@@ -1,4 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
"""A document format supported by document backend parsers."""
|
||||
|
||||
DOCX = "docx"
|
||||
PPTX = "pptx"
|
||||
HTML = "html"
|
||||
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
|
||||
ASCIIDOC = "asciidoc"
|
||||
MD = "md"
|
||||
XLSX = "xlsx"
|
||||
XML_USPTO = "xml_uspto"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
@@ -55,6 +58,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
InputFormat.XLSX: ["xlsx"],
|
||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
@@ -81,10 +85,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.XLSX: [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
],
|
||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||
}
|
||||
|
||||
MimeTypeToFormat = {
|
||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
||||
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
|
||||
for value in FormatToMimeType.values()
|
||||
for mime in value
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user