feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)

Signed-off-by: lucas-morin <lucas.morin222@gmail.com>
This commit is contained in:
Lucas Morin
2024-12-17 19:27:09 +01:00
committed by GitHub
parent e31f09f71f
commit fd034802b6
24 changed files with 31040 additions and 4 deletions

View File

@@ -86,6 +86,25 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML PubMed
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
# Valid XML, non-supported flavor
xml_content = (
'<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE docling_test SYSTEM '