feat(xml-jats): parse XML JATS documents (#967)

* chore(xml-jats): separate authors and affiliations

In XML PubMed (JATS) backend, convert authors and affiliations as they
are typically rendered on PDFs.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(xml-jats): replace new line character by a space

Instead of removing new line character from text, replace it by a space character.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* feat(xml-jats): improve existing parser and extend features

Partially support lists, respect reading order, parse more sections, support equations, better text formatting.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore(xml-jats): rename PubMed objects to JATS

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-02-17 10:43:31 +01:00
committed by GitHub
parent e1436a8b05
commit 428b656793
35 changed files with 13688 additions and 30671 deletions

View File

@@ -0,0 +1,772 @@
import logging
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
DEFAULT_HEADER_REFERENCES: Final = "References"
DEFAULT_TEXT_ETAL: Final = "et al."
class Abstract(TypedDict):
label: str
content: str
class Author(TypedDict):
name: str
affiliation_names: list[str]
class Citation(TypedDict):
author_names: str
title: str
source: str
year: str
volume: str
page: str
pub_id: str
publisher_name: str
publisher_loc: str
class Table(TypedDict):
label: str
caption: str
content: str
class XMLComponents(TypedDict):
title: str
authors: list[Author]
abstract: list[Abstract]
class JatsDocumentBackend(DeclarativeDocumentBackend):
"""Backend to parse articles in XML format tagged according to JATS definition.
The Journal Article Tag Suite (JATS) is an definition standard for the
representation of journal articles in XML format. Several publishers and journal
archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
medRxiv, or Springer Nature.
Refer to https://jats.nlm.nih.gov for more details on JATS.
The code from this document backend has been developed by modifying parts of the
PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
Dataset XML Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
# Initialize the root of the document hiearchy
self.root: Optional[NodeItem] = None
self.valid = False
try:
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.seek(0)
self.tree: etree._ElementTree = etree.parse(self.path_or_stream)
doc_info: etree.DocInfo = self.tree.docinfo
if doc_info.system_url and any(
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
):
self.valid = True
return
for ent in doc_info.internalDTD.iterentities():
if ent.system_url and any(
[kwd in ent.system_url for kwd in JATS_DTD_URL]
):
self.valid = True
return
except Exception as exc:
raise RuntimeError(
f"Could not initialize JATS backend for file with hash {self.document_hash}."
) from exc
@override
def is_valid(self) -> bool:
return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.XML_JATS}
@override
def convert(self) -> DoclingDocument:
try:
# Create empty document
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="application/xml",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
# Get metadata XML components
xml_components: XMLComponents = self._parse_metadata()
# Add metadata to the document
self._add_metadata(doc, xml_components)
# walk over the XML body
body = self.tree.xpath("//body")
if self.root and len(body) > 0:
self._walk_linear(doc, self.root, body[0])
# walk over the XML back matter
back = self.tree.xpath("//back")
if self.root and len(back) > 0:
self._walk_linear(doc, self.root, back[0])
except Exception:
_log.error(traceback.format_exc())
return doc
@staticmethod
def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
skip_tags = ["term", "disp-formula", "inline-formula"]
text: str = (
node.text.replace("\n", " ")
if (node.tag not in skip_tags and node.text)
else ""
)
for child in list(node):
if child.tag not in skip_tags:
# TODO: apply styling according to child.tag when supported by docling-core
text += JatsDocumentBackend._get_text(child, sep)
if sep:
text = text.rstrip(sep) + sep
text += child.tail.replace("\n", " ") if child.tail else ""
return text
def _find_metadata(self) -> Optional[etree._Element]:
meta_names: list[str] = ["article-meta", "book-part-meta"]
meta: Optional[etree._Element] = None
for name in meta_names:
node = self.tree.xpath(f".//{name}")
if len(node) > 0:
meta = node[0]
break
return meta
def _parse_abstract(self) -> list[Abstract]:
# TODO: address cases with multiple sections
abs_list: list[Abstract] = []
for abs_node in self.tree.xpath(".//abstract"):
abstract: Abstract = dict(label="", content="")
texts = []
for abs_par in abs_node.xpath("p"):
texts.append(JatsDocumentBackend._get_text(abs_par).strip())
abstract["content"] = " ".join(texts)
label_node = abs_node.xpath("title|label")
if len(label_node) > 0:
abstract["label"] = label_node[0].text.strip()
abs_list.append(abstract)
return abs_list
def _parse_authors(self) -> list[Author]:
# Get mapping between affiliation ids and names
authors: list[Author] = []
meta: Optional[etree._Element] = self._find_metadata()
if meta is None:
return authors
affiliation_names = []
for affiliation_node in meta.xpath(".//aff[@id]"):
aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
aff = aff.replace("\n", " ")
label = affiliation_node.xpath("label")
if label:
# TODO: once superscript is supported, add label with formatting
aff = aff.removeprefix(f"{label[0].text}, ")
affiliation_names.append(aff)
affiliation_ids_names = {
id: name
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
}
# Get author names and affiliation names
for author_node in meta.xpath(
'.//contrib-group/contrib[@contrib-type="author"]'
):
author: Author = {
"name": "",
"affiliation_names": [],
}
# Affiliation names
affiliation_ids = [
a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
]
for id in affiliation_ids:
if id in affiliation_ids_names:
author["affiliation_names"].append(affiliation_ids_names[id])
# Name
author["name"] = (
author_node.xpath("name/given-names")[0].text
+ " "
+ author_node.xpath("name/surname")[0].text
)
authors.append(author)
return authors
def _parse_title(self) -> str:
meta_names: list[str] = [
"article-meta",
"collection-meta",
"book-meta",
"book-part-meta",
]
title_names: list[str] = ["article-title", "subtitle", "title", "label"]
titles: list[str] = [
" ".join(
elem.text.replace("\n", " ").strip()
for elem in list(title_node)
if elem.tag in title_names
).strip()
for title_node in self.tree.xpath(
"|".join([f".//{item}/title-group" for item in meta_names])
)
]
text = " - ".join(titles)
return text
def _parse_metadata(self) -> XMLComponents:
"""Parsing JATS document metadata."""
xml_components: XMLComponents = {
"title": self._parse_title(),
"authors": self._parse_authors(),
"abstract": self._parse_abstract(),
}
return xml_components
def _add_abstract(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
for abstract in xml_components["abstract"]:
text: str = abstract["content"]
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
if not text:
continue
parent = doc.add_heading(parent=self.root, text=title)
doc.add_text(
parent=parent,
text=text,
label=DocItemLabel.TEXT,
)
return
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
# TODO: once docling supports text formatting, add affiliation reference to
# author names through superscripts
authors: list = [item["name"] for item in xml_components["authors"]]
authors_str = ", ".join(authors)
affiliations: list = [
item
for author in xml_components["authors"]
for item in author["affiliation_names"]
]
affiliations_str = "; ".join(list(dict.fromkeys(affiliations)))
if authors_str:
doc.add_text(
parent=self.root,
text=authors_str,
label=DocItemLabel.PARAGRAPH,
)
if affiliations_str:
doc.add_text(
parent=self.root,
text=affiliations_str,
label=DocItemLabel.PARAGRAPH,
)
return
def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
if isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST:
doc.add_list_item(text=text, enumerated=False, parent=parent)
else:
doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)
return
def _parse_element_citation(self, node: etree._Element) -> str:
citation: Citation = {
"author_names": "",
"title": "",
"source": "",
"year": "",
"volume": "",
"page": "",
"pub_id": "",
"publisher_name": "",
"publisher_loc": "",
}
_log.debug("Citation parsing started")
# Author names
names = []
for name_node in node.xpath(".//name"):
name_str = (
name_node.xpath("surname")[0].text.replace("\n", " ").strip()
+ " "
+ name_node.xpath("given-names")[0].text.replace("\n", " ").strip()
)
names.append(name_str)
etal_node = node.xpath(".//etal")
if len(etal_node) > 0:
etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
names.append(etal_text)
citation["author_names"] = ", ".join(names)
titles: list[str] = [
"article-title",
"chapter-title",
"data-title",
"issue-title",
"part-title",
"trans-title",
]
title_node: Optional[etree._Element] = None
for name in titles:
name_node = node.xpath(name)
if len(name_node) > 0:
title_node = name_node[0]
break
citation["title"] = (
JatsDocumentBackend._get_text(title_node)
if title_node is not None
else node.text.replace("\n", " ").strip()
)
# Journal, year, publisher name, publisher location, volume, elocation
fields: list[str] = [
"source",
"year",
"publisher-name",
"publisher-loc",
"volume",
]
for item in fields:
item_node = node.xpath(item)
if len(item_node) > 0:
citation[item.replace("-", "_")] = ( # type: ignore[literal-required]
item_node[0].text.replace("\n", " ").strip()
)
# Publication identifier
if len(node.xpath("pub-id")) > 0:
pub_id: list[str] = []
for id_node in node.xpath("pub-id"):
id_type = id_node.get("assigning-authority") or id_node.get(
"pub-id-type"
)
id_text = id_node.text
if id_type and id_text:
pub_id.append(
id_type.replace("\n", " ").strip().upper()
+ ": "
+ id_text.replace("\n", " ").strip()
)
if pub_id:
citation["pub_id"] = ", ".join(pub_id)
# Pages
if len(node.xpath("elocation-id")) > 0:
citation["page"] = (
node.xpath("elocation-id")[0].text.replace("\n", " ").strip()
)
elif len(node.xpath("fpage")) > 0:
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation["page"] += (
"" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
)
# Flatten the citation to string
text = ""
if citation["author_names"]:
text += citation["author_names"].rstrip(".") + ". "
if citation["title"]:
text += citation["title"] + ". "
if citation["source"]:
text += citation["source"] + ". "
if citation["publisher_name"]:
if citation["publisher_loc"]:
text += f"{citation['publisher_loc']}: "
text += citation["publisher_name"] + ". "
if citation["volume"]:
text = text.rstrip(". ")
text += f" {citation['volume']}. "
if citation["page"]:
text = text.rstrip(". ")
if citation["volume"]:
text += ":"
text += citation["page"] + ". "
if citation["year"]:
text = text.rstrip(". ")
text += f" ({citation['year']})."
if citation["pub_id"]:
text = text.rstrip(".") + ". "
text += citation["pub_id"]
_log.debug("Citation flattened")
return text
def _add_equation(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
math_text = node.text
math_parts = math_text.split("$$")
if len(math_parts) == 3:
math_formula = math_parts[1]
doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)
return
def _add_figure_captions(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
label_node = node.xpath("label")
label: Optional[str] = (
JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
)
caption_node = node.xpath("caption")
caption: Optional[str]
if len(caption_node) > 0:
caption = ""
for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"):
continue
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip()
else:
caption = None
# TODO: format label vs caption once styling is supported
fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
fig_caption: Optional[TextItem] = (
doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
if fig_text
else None
)
doc.add_picture(parent=parent, caption=fig_caption)
return
# TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
# def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
# new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
# for child in node.iterchildren(tag="fn"):
# text = JatsDocumentBackend._get_text(child)
# doc.add_list_item(text=text, parent=new_parent)
def _add_metadata(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
self._add_title(doc, xml_components)
self._add_authors(doc, xml_components)
self._add_abstract(doc, xml_components)
return
def _add_table(
self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")
nested_tables = table_tag.find("table")
if nested_tables:
_log.warning(f"Skipping nested table in {str(self.file)}")
return
# Count the number of rows (number of <tr> elements)
num_rows = len(table_tag.find_all("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
for row in table_tag.find_all("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
col_count += colspan
num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(table_tag.find_all("tr")):
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
col_header = False
# Extract and print the text content of each cell
col_idx = 0
for _, html_cell in enumerate(cells):
# extract inline formulas
for formula in html_cell.find_all("inline-formula"):
math_parts = formula.text.split("$$")
if len(math_parts) == 3:
math_formula = f"$${math_parts[1]}$$"
formula.replaceWith(math_formula)
text = html_cell.text
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
while grid[row_idx][col_idx] is not None:
col_idx += 1
for r in range(row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
# TODO: format label vs caption once styling is supported
label = table_xml_component["label"]
caption = table_xml_component["caption"]
table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
table_caption: Optional[TextItem] = (
doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
if table_text
else None
)
doc.add_table(data=data, parent=parent, caption=table_caption)
return
def _add_tables(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
table: Table = {"label": "", "caption": "", "content": ""}
# Content
if len(node.xpath("table")) > 0:
table_content_node = node.xpath("table")[0]
elif len(node.xpath("alternatives/table")) > 0:
table_content_node = node.xpath("alternatives/table")[0]
else:
table_content_node = None
if table_content_node is not None:
table["content"] = etree.tostring(table_content_node).decode("utf-8")
# Caption
caption_node = node.xpath("caption")
caption: Optional[str]
if caption_node:
caption = ""
for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"):
continue
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip()
else:
caption = None
if caption is not None:
table["caption"] = caption
# Label
if len(node.xpath("label")) > 0:
table["label"] = node.xpath("label")[0].text
try:
self._add_table(doc, parent, table)
except Exception as e:
_log.warning(f"Skipping unsupported table in {str(self.file)}")
pass
return
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
self.root = doc.add_text(
parent=None,
text=xml_components["title"],
label=DocItemLabel.TITLE,
)
return
def _walk_linear(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
skip_tags = ["term"]
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
new_parent: NodeItem = parent
node_text: str = (
node.text.replace("\n", " ")
if (node.tag not in skip_tags and node.text)
else ""
)
for child in list(node):
stop_walk: bool = False
# flush text into TextItem for some tags in paragraph nodes
if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
doc.add_text(
label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
)
node_text = ""
# add elements and decide whether to stop walking
if child.tag in ("sec", "ack"):
header = child.xpath("title|label")
text: Optional[str] = None
if len(header) > 0:
text = JatsDocumentBackend._get_text(header[0])
elif child.tag == "ack":
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
if text:
new_parent = doc.add_heading(text=text, parent=parent)
elif child.tag == "list":
new_parent = doc.add_group(
label=GroupLabel.LIST, name="list", parent=parent
)
elif child.tag == "list-item":
# TODO: address any type of content (another list, formula,...)
# TODO: address list type and item label
text = JatsDocumentBackend._get_text(child).strip()
new_parent = doc.add_list_item(text=text, parent=parent)
stop_walk = True
elif child.tag == "fig":
self._add_figure_captions(doc, parent, child)
stop_walk = True
elif child.tag == "table-wrap":
self._add_tables(doc, parent, child)
stop_walk = True
elif child.tag == "suplementary-material":
stop_walk = True
elif child.tag == "fn-group":
# header = child.xpath(".//title") or child.xpath(".//label")
# if header:
# text = JatsDocumentBackend._get_text(header[0])
# fn_parent = doc.add_heading(text=text, parent=new_parent)
# self._add_footnote_group(doc, fn_parent, child)
stop_walk = True
elif child.tag == "ref-list" and node.tag != "ref-list":
header = child.xpath("title|label")
text = (
JatsDocumentBackend._get_text(header[0])
if len(header) > 0
else DEFAULT_HEADER_REFERENCES
)
new_parent = doc.add_heading(text=text, parent=parent)
new_parent = doc.add_group(
parent=new_parent, label=GroupLabel.LIST, name="list"
)
elif child.tag == "element-citation":
text = self._parse_element_citation(child)
self._add_citation(doc, parent, text)
stop_walk = True
elif child.tag == "mixed-citation":
text = JatsDocumentBackend._get_text(child).strip()
self._add_citation(doc, parent, text)
stop_walk = True
elif child.tag == "tex-math":
self._add_equation(doc, parent, child)
stop_walk = True
elif child.tag == "inline-formula":
# TODO: address inline formulas when supported by docling-core
stop_walk = True
# step into child
if not stop_walk:
new_text = self._walk_linear(doc, new_parent, child)
if not (node.getparent().tag == "p" and node.tag in flush_tags):
node_text += new_text
# pick up the tail text
node_text += child.tail.replace("\n", " ") if child.tail else ""
# create paragraph
if node.tag == "p" and node_text.strip():
doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
return ""
else:
# backpropagate the text
return node_text

View File

@@ -1,592 +0,0 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Any, Set, Union
import lxml
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class Paragraph(TypedDict):
text: str
headers: list[str]
class Author(TypedDict):
name: str
affiliation_names: list[str]
class Table(TypedDict):
label: str
caption: str
content: str
class FigureCaption(TypedDict):
label: str
caption: str
class Reference(TypedDict):
author_names: str
title: str
journal: str
year: str
class XMLComponents(TypedDict):
title: str
authors: list[Author]
abstract: str
paragraphs: list[Paragraph]
tables: list[Table]
figure_captions: list[FigureCaption]
references: list[Reference]
class PubMedDocumentBackend(DeclarativeDocumentBackend):
"""
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
# Initialize parents for the document hierarchy
self.parents: dict = {}
self.valid = False
try:
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.seek(0)
self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
self.valid = True
except Exception as exc:
raise RuntimeError(
f"Could not initialize PubMed backend for file with hash {self.document_hash}."
) from exc
@override
def is_valid(self) -> bool:
return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
@override
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.XML_PUBMED}
@override
def convert(self) -> DoclingDocument:
# Create empty document
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="application/xml",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Trying to convert PubMed XML document...")
# Get parsed XML components
xml_components: XMLComponents = self._parse()
# Add XML components to the document
doc = self._populate_document(doc, xml_components)
return doc
def _parse_title(self) -> str:
title: str = " ".join(
[
t.replace("\n", "")
for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
]
)
return title
def _parse_authors(self) -> list[Author]:
# Get mapping between affiliation ids and names
affiliation_names = []
for affiliation_node in self.tree.xpath(".//aff[@id]"):
affiliation_names.append(
": ".join([t for t in affiliation_node.itertext() if t != "\n"])
)
affiliation_ids_names = {
id: name
for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
}
# Get author names and affiliation names
authors: list[Author] = []
for author_node in self.tree.xpath(
'.//contrib-group/contrib[@contrib-type="author"]'
):
author: Author = {
"name": "",
"affiliation_names": [],
}
# Affiliation names
affiliation_ids = [
a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
]
for id in affiliation_ids:
if id in affiliation_ids_names:
author["affiliation_names"].append(affiliation_ids_names[id])
# Name
author["name"] = (
author_node.xpath("name/surname")[0].text
+ " "
+ author_node.xpath("name/given-names")[0].text
)
authors.append(author)
return authors
def _parse_abstract(self) -> str:
texts = []
for abstract_node in self.tree.xpath(".//abstract"):
for text in abstract_node.itertext():
texts.append(text.replace("\n", ""))
abstract: str = "".join(texts)
return abstract
def _parse_main_text(self) -> list[Paragraph]:
paragraphs: list[Paragraph] = []
for paragraph_node in self.tree.xpath("//body//p"):
# Skip captions
if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
continue
paragraph: Paragraph = {"text": "", "headers": []}
# Text
paragraph["text"] = "".join(
[t.replace("\n", "") for t in paragraph_node.itertext()]
)
# Header
path = "../title"
while len(paragraph_node.xpath(path)) > 0:
paragraph["headers"].append(
"".join(
[
t.replace("\n", "")
for t in paragraph_node.xpath(path)[0].itertext()
]
)
)
path = "../" + path
paragraphs.append(paragraph)
return paragraphs
def _parse_tables(self) -> list[Table]:
tables: list[Table] = []
for table_node in self.tree.xpath(".//body//table-wrap"):
table: Table = {"label": "", "caption": "", "content": ""}
# Content
if len(table_node.xpath("table")) > 0:
table_content_node = table_node.xpath("table")[0]
elif len(table_node.xpath("alternatives/table")) > 0:
table_content_node = table_node.xpath("alternatives/table")[0]
else:
table_content_node = None
if table_content_node != None:
table["content"] = etree.tostring(table_content_node).decode("utf-8")
# Caption
if len(table_node.xpath("caption/p")) > 0:
caption_node = table_node.xpath("caption/p")[0]
elif len(table_node.xpath("caption/title")) > 0:
caption_node = table_node.xpath("caption/title")[0]
else:
caption_node = None
if caption_node != None:
table["caption"] = "".join(
[t.replace("\n", "") for t in caption_node.itertext()]
)
# Label
if len(table_node.xpath("label")) > 0:
table["label"] = table_node.xpath("label")[0].text
tables.append(table)
return tables
def _parse_figure_captions(self) -> list[FigureCaption]:
figure_captions: list[FigureCaption] = []
if not (self.tree.xpath(".//fig")):
return figure_captions
for figure_node in self.tree.xpath(".//fig"):
figure_caption: FigureCaption = {
"caption": "",
"label": "",
}
# Label
if figure_node.xpath("label"):
figure_caption["label"] = "".join(
[
t.replace("\n", "")
for t in figure_node.xpath("label")[0].itertext()
]
)
# Caption
if figure_node.xpath("caption"):
caption = ""
for caption_node in figure_node.xpath("caption")[0].getchildren():
caption += (
"".join([t.replace("\n", "") for t in caption_node.itertext()])
+ "\n"
)
figure_caption["caption"] = caption
figure_captions.append(figure_caption)
return figure_captions
def _parse_references(self) -> list[Reference]:
references: list[Reference] = []
for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
reference: Reference = {
"author_names": "",
"title": "",
"journal": "",
"year": "",
}
reference_node: Any = None
for tag in ["mixed-citation", "element-citation", "citation"]:
if len(reference_node_abs.xpath(tag)) > 0:
reference_node = reference_node_abs.xpath(tag)[0]
break
if reference_node is None:
continue
if all(
not (ref_type in ["citation-type", "publication-type"])
for ref_type in reference_node.attrib.keys()
):
continue
# Author names
names = []
if len(reference_node.xpath("name")) > 0:
for name_node in reference_node.xpath("name"):
name_str = " ".join(
[t.text for t in name_node.getchildren() if (t.text != None)]
)
names.append(name_str)
elif len(reference_node.xpath("person-group")) > 0:
for name_node in reference_node.xpath("person-group")[0]:
name_str = (
name_node.xpath("given-names")[0].text
+ " "
+ name_node.xpath("surname")[0].text
)
names.append(name_str)
reference["author_names"] = "; ".join(names)
# Title
if len(reference_node.xpath("article-title")) > 0:
reference["title"] = " ".join(
[
t.replace("\n", " ")
for t in reference_node.xpath("article-title")[0].itertext()
]
)
# Journal
if len(reference_node.xpath("source")) > 0:
reference["journal"] = reference_node.xpath("source")[0].text
# Year
if len(reference_node.xpath("year")) > 0:
reference["year"] = reference_node.xpath("year")[0].text
if (
not (reference_node.xpath("article-title"))
and not (reference_node.xpath("journal"))
and not (reference_node.xpath("year"))
):
reference["title"] = reference_node.text
references.append(reference)
return references
def _parse(self) -> XMLComponents:
"""Parsing PubMed document."""
xml_components: XMLComponents = {
"title": self._parse_title(),
"authors": self._parse_authors(),
"abstract": self._parse_abstract(),
"paragraphs": self._parse_main_text(),
"tables": self._parse_tables(),
"figure_captions": self._parse_figure_captions(),
"references": self._parse_references(),
}
return xml_components
def _populate_document(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> DoclingDocument:
self._add_title(doc, xml_components)
self._add_authors(doc, xml_components)
self._add_abstract(doc, xml_components)
self._add_main_text(doc, xml_components)
if xml_components["tables"]:
self._add_tables(doc, xml_components)
if xml_components["figure_captions"]:
self._add_figure_captions(doc, xml_components)
self._add_references(doc, xml_components)
return doc
def _add_figure_captions(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
self.parents["Figures"] = doc.add_heading(
parent=self.parents["Title"], text="Figures"
)
for figure_caption_xml_component in xml_components["figure_captions"]:
figure_caption_text = (
figure_caption_xml_component["label"]
+ ": "
+ figure_caption_xml_component["caption"].strip()
)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=figure_caption_text
)
doc.add_picture(
parent=self.parents["Figures"],
caption=fig_caption,
)
return
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
self.parents["Title"] = doc.add_text(
parent=None,
text=xml_components["title"],
label=DocItemLabel.TITLE,
)
return
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
authors_affiliations: list = []
for author in xml_components["authors"]:
authors_affiliations.append(author["name"])
authors_affiliations.append(", ".join(author["affiliation_names"]))
authors_affiliations_str = "; ".join(authors_affiliations)
doc.add_text(
parent=self.parents["Title"],
text=authors_affiliations_str,
label=DocItemLabel.PARAGRAPH,
)
return
def _add_abstract(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
abstract_text: str = xml_components["abstract"]
self.parents["Abstract"] = doc.add_heading(
parent=self.parents["Title"], text="Abstract"
)
doc.add_text(
parent=self.parents["Abstract"],
text=abstract_text,
label=DocItemLabel.TEXT,
)
return
def _add_main_text(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
added_headers: list = []
for paragraph in xml_components["paragraphs"]:
if not (paragraph["headers"]):
continue
# Header
for i, header in enumerate(reversed(paragraph["headers"])):
if header in added_headers:
continue
added_headers.append(header)
if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
i - 1
] in self.parents:
parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
else:
parent = self.parents["Title"]
self.parents[header] = doc.add_heading(parent=parent, text=header)
# Paragraph text
if paragraph["headers"][0] in self.parents:
parent = self.parents[paragraph["headers"][0]]
else:
parent = self.parents["Title"]
doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
return
def _add_references(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
self.parents["References"] = doc.add_heading(
parent=self.parents["Title"], text="References"
)
current_list = doc.add_group(
parent=self.parents["References"], label=GroupLabel.LIST, name="list"
)
for reference in xml_components["references"]:
reference_text: str = ""
if reference["author_names"]:
reference_text += reference["author_names"] + ". "
if reference["title"]:
reference_text += reference["title"]
if reference["title"][-1] != ".":
reference_text += "."
reference_text += " "
if reference["journal"]:
reference_text += reference["journal"]
if reference["year"]:
reference_text += " (" + reference["year"] + ")"
if not (reference_text):
_log.debug(f"Skipping reference for: {str(self.file)}")
continue
doc.add_list_item(
text=reference_text, enumerated=False, parent=current_list
)
return
def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
self.parents["Tables"] = doc.add_heading(
parent=self.parents["Title"], text="Tables"
)
for table_xml_component in xml_components["tables"]:
try:
self._add_table(doc, table_xml_component)
except Exception as e:
_log.debug(f"Skipping unsupported table for: {str(self.file)}")
pass
return
def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")
nested_tables = table_tag.find("table")
if nested_tables:
_log.debug(f"Skipping nested table for: {str(self.file)}")
return
# Count the number of rows (number of <tr> elements)
num_rows = len(table_tag.find_all("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
for row in table_tag.find_all("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
col_count += colspan
num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(table_tag.find_all("tr")):
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
col_header = False
# Extract and print the text content of each cell
col_idx = 0
for _, html_cell in enumerate(cells):
text = html_cell.text
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
while grid[row_idx][col_idx] != None:
col_idx += 1
for r in range(row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
table_caption = doc.add_text(
label=DocItemLabel.CAPTION,
text=table_xml_component["label"] + ": " + table_xml_component["caption"],
)
doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
return

View File

@@ -34,7 +34,6 @@ class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
@@ -42,6 +41,7 @@ class InputFormat(str, Enum):
CSV = "csv"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
@@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.XML_JATS: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
@@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"],
InputFormat.XML_JATS: ["application/xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",

View File

@@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
):
input_format = InputFormat.XML_USPTO
if (
InputFormat.XML_PUBMED in formats
and "/NLM//DTD JATS" in xml_doctype
if InputFormat.XML_JATS in formats and (
"JATS-journalpublishing" in xml_doctype
or "JATS-archive" in xml_doctype
):
input_format = InputFormat.XML_PUBMED
input_format = InputFormat.XML_JATS
elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):

View File

@@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
@@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class XMLPubMedFormatOption(FormatOption):
class XMLJatsFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
class ImageFormatOption(FormatOption):
@@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
InputFormat.XML_JATS: FormatOption(
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend