mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(xml-jats): parse XML JATS documents (#967)
* chore(xml-jats): separate authors and affiliations

  In the XML PubMed (JATS) backend, convert authors and affiliations as they are typically rendered in PDFs.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(xml-jats): replace new line character by a space

  Instead of removing the newline character from text, replace it with a space character.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* feat(xml-jats): improve existing parser and extend features

  Partially support lists, respect reading order, parse more sections, support equations, better text formatting.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore(xml-jats): rename PubMed objects to JATS

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e1436a8b05
commit
428b656793
@@ -130,24 +130,24 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
|
||||
|
||||
# Valid XML PubMed
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
|
||||
# Valid XML JATS
|
||||
buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.xml", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.xml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
assert dci._guess_format(stream) == InputFormat.XML_JATS
|
||||
doc_path = Path("./tests/data/jats/elife-56337.xml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
|
||||
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
|
||||
buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
assert dci._guess_format(stream) == InputFormat.XML_JATS
|
||||
doc_path = Path("./tests/data/jats/elife-56337.nxml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
|
||||
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
|
||||
buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.txt", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
assert dci._guess_format(stream) == InputFormat.XML_JATS
|
||||
doc_path = Path("./tests/data/jats/elife-56337.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
|
||||
|
||||
# Valid XML, non-supported flavor
|
||||
xml_content = (
|
||||
|
||||
Reference in New Issue
Block a user