mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: create a backend to parse USPTO patents into DoclingDocument (#606)
* feat: add PATENT_USPTO as input format Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat: add USPTO backend parser Add a backend implementation to parse patent applications and grants from the United States Patent Office (USPTO). Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor: change the name of the USPTO input format Change the name of the patent USPTO input format to show the typical format (XML). Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor: address several input formats with same mime type Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * refactor: group XML backend parsers in a subfolder Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore: add safe initialization of PatentUsptoDocumentBackend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
3e599c7bbe
commit
4e087504cc
@@ -3,7 +3,7 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@@ -39,6 +39,73 @@ def test_in_doc_from_invalid_buf():
|
||||
assert doc.valid == False
|
||||
|
||||
|
||||
def test_guess_format(tmp_path):
    """Test docling.datamodel.document._DocumentConversionInput._guess_format.

    Each supported sample is checked twice: once wrapped in an in-memory
    ``DocumentStream`` and once as a ``Path`` on disk. Inputs that are not a
    supported flavor are expected to yield ``None`` through both entry points.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory,
            used for the files this test has to create itself.
    """
    dci = _DocumentConversionInput(path_or_stream_iterator=[])
    temp_dir = tmp_path / "test_guess_format"
    temp_dir.mkdir()

    # (file on disk, name given to the in-memory stream, expected format)
    valid_cases = [
        # Valid PDF
        ("./tests/data/2206.01062.pdf", "my_doc.pdf", InputFormat.PDF),
        # Valid MS Office
        ("./tests/data/docx/lorem_ipsum.docx", "lorem_ipsum.docx", InputFormat.DOCX),
        # Valid HTML
        ("./tests/data/html/wiki_duck.html", "wiki_duck.html", InputFormat.HTML),
        # Valid MD
        ("./tests/data/md/wiki.md", "wiki.md", InputFormat.MD),
        # Valid XML USPTO patent
        (
            "./tests/data/uspto/ipa20110039701.xml",
            "ipa20110039701.xml",
            InputFormat.XML_USPTO,
        ),
        (
            "./tests/data/uspto/pftaps057006474.txt",
            "pftaps057006474.txt",
            InputFormat.XML_USPTO,
        ),
    ]
    for file_name, stream_name, expected in valid_cases:
        doc_path = Path(file_name)
        # read_bytes() closes the file handle, unlike a bare open().read()
        buf = BytesIO(doc_path.read_bytes())
        stream = DocumentStream(name=stream_name, stream=buf)
        assert dci._guess_format(stream) == expected
        assert dci._guess_format(doc_path) == expected

    # Valid XML, non-supported flavor
    xml_content = (
        '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE docling_test SYSTEM '
        '"test.dtd"><docling>Docling parses documents</docling>'
    )
    doc_path = temp_dir / "docling_test.xml"
    doc_path.write_text(xml_content, encoding="utf-8")
    assert dci._guess_format(doc_path) is None
    buf = BytesIO(doc_path.read_bytes())
    stream = DocumentStream(name="docling_test.xml", stream=buf)
    assert dci._guess_format(stream) is None

    # Invalid USPTO patent (as plain text)
    stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
    assert dci._guess_format(stream) is None
    doc_path = temp_dir / "pftaps_wrong.txt"
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) is None
|
||||
|
||||
|
||||
def _make_input_doc(path):
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=path,
|
||||
|
||||
Reference in New Issue
Block a user