mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)
Signed-off-by: lucas-morin <lucas.morin222@gmail.com>
This commit is contained in:
@@ -86,6 +86,25 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
|
||||
|
||||
# Valid XML PubMed
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.xml", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.xml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.txt", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
|
||||
# Valid XML, non-supported flavor
|
||||
xml_content = (
|
||||
'<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE docling_test SYSTEM '
|
||||
|
||||
Reference in New Issue
Block a user