chore(xml-jats): rename PubMed objects to JATS

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-02-14 14:59:57 +01:00
parent 011dd6ce96
commit 93eb9de871
19 changed files with 95 additions and 209 deletions

View File

@ -66,11 +66,21 @@ class XMLComponents(TypedDict):
abstract: list[Abstract] abstract: list[Abstract]
class PubMedDocumentBackend(DeclarativeDocumentBackend): class JatsDocumentBackend(DeclarativeDocumentBackend):
""" """Backend to parse articles in XML format tagged according to JATS definition.
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
The Journal Article Tag Suite (JATS) is a definition standard for the
representation of journal articles in XML format. Several publishers and journal
archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
medRxiv, or Springer Nature.
Refer to https://jats.nlm.nih.gov for more details on JATS.
The code from this document backend has been developed by modifying parts of the
PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020). Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset. Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
Dataset.
Journal of Open Source Software, 5(46), 1979, Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979 https://doi.org/10.21105/joss.01979
""" """
@ -105,7 +115,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
return return
except Exception as exc: except Exception as exc:
raise RuntimeError( raise RuntimeError(
f"Could not initialize PubMed backend for file with hash {self.document_hash}." f"Could not initialize JATS backend for file with hash {self.document_hash}."
) from exc ) from exc
@override @override
@ -126,7 +136,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
@classmethod @classmethod
@override @override
def supported_formats(cls) -> set[InputFormat]: def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.XML_PUBMED} return {InputFormat.XML_JATS}
@override @override
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
@ -170,7 +180,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
for child in list(node): for child in list(node):
if child.tag not in skip_tags: if child.tag not in skip_tags:
# TODO: apply styling according to child.tag when supported by docling-core # TODO: apply styling according to child.tag when supported by docling-core
text += PubMedDocumentBackend._get_text(child, sep) text += JatsDocumentBackend._get_text(child, sep)
if sep: if sep:
text = text.rstrip(sep) + sep text = text.rstrip(sep) + sep
text += child.tail.replace("\n", " ") if child.tail else "" text += child.tail.replace("\n", " ") if child.tail else ""
@ -196,7 +206,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
abstract: Abstract = dict(label="", content="") abstract: Abstract = dict(label="", content="")
texts = [] texts = []
for abs_par in abs_node.xpath("p"): for abs_par in abs_node.xpath("p"):
texts.append(PubMedDocumentBackend._get_text(abs_par).strip()) texts.append(JatsDocumentBackend._get_text(abs_par).strip())
abstract["content"] = " ".join(texts) abstract["content"] = " ".join(texts)
label_node = abs_node.xpath("title|label") label_node = abs_node.xpath("title|label")
@ -280,7 +290,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
return text return text
def _parse_metadata(self) -> XMLComponents: def _parse_metadata(self) -> XMLComponents:
"""Parsing PubMed document metadata.""" """Parsing JATS document metadata."""
xml_components: XMLComponents = { xml_components: XMLComponents = {
"title": self._parse_title(), "title": self._parse_title(),
"authors": self._parse_authors(), "authors": self._parse_authors(),
@ -385,7 +395,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
title_node = name_node[0] title_node = name_node[0]
break break
citation["title"] = ( citation["title"] = (
PubMedDocumentBackend._get_text(title_node) JatsDocumentBackend._get_text(title_node)
if title_node is not None if title_node is not None
else node.text.replace("\n", " ").strip() else node.text.replace("\n", " ").strip()
) )
@ -415,7 +425,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
id_text = id_node.text id_text = id_node.text
if id_type and id_text: if id_type and id_text:
pub_id.append( pub_id.append(
f"{id_type.replace("\n", " ").strip().upper()}: {id_text.replace("\n", " ").strip()}" id_type.replace("\n", " ").strip().upper()
+ ": "
+ id_text.replace("\n", " ").strip()
) )
if pub_id: if pub_id:
citation["pub_id"] = ", ".join(pub_id) citation["pub_id"] = ", ".join(pub_id)
@ -428,9 +440,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
elif len(node.xpath("fpage")) > 0: elif len(node.xpath("fpage")) > 0:
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip() citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0: if len(node.xpath("lpage")) > 0:
citation[ citation["page"] += (
"page" "" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
] += f"{node.xpath('lpage')[0].text.replace("\n", " ").strip()}" )
# Flatten the citation to string # Flatten the citation to string
@ -447,7 +459,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
text += citation["publisher_name"] + ". " text += citation["publisher_name"] + ". "
if citation["volume"]: if citation["volume"]:
text = text.rstrip(". ") text = text.rstrip(". ")
text += f" {citation["volume"]}. " text += f" {citation['volume']}. "
if citation["page"]: if citation["page"]:
text = text.rstrip(". ") text = text.rstrip(". ")
if citation["volume"]: if citation["volume"]:
@ -480,7 +492,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
) -> None: ) -> None:
label_node = node.xpath("label") label_node = node.xpath("label")
label: Optional[str] = ( label: Optional[str] = (
PubMedDocumentBackend._get_text(label_node[0]).strip() if label_node else "" JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
) )
caption_node = node.xpath("caption") caption_node = node.xpath("caption")
@ -490,7 +502,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
for caption_par in list(caption_node[0]): for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"): if caption_par.xpath(".//supplementary-material"):
continue continue
caption += PubMedDocumentBackend._get_text(caption_par).strip() + " " caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip() caption = caption.strip()
else: else:
caption = None caption = None
@ -511,7 +523,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
# def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None: # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
# new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent) # new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
# for child in node.iterchildren(tag="fn"): # for child in node.iterchildren(tag="fn"):
# text = PubMedDocumentBackend._get_text(child) # text = JatsDocumentBackend._get_text(child)
# doc.add_list_item(text=text, parent=new_parent) # doc.add_list_item(text=text, parent=new_parent)
def _add_metadata( def _add_metadata(
@ -631,7 +643,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
for caption_par in list(caption_node[0]): for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"): if caption_par.xpath(".//supplementary-material"):
continue continue
caption += PubMedDocumentBackend._get_text(caption_par).strip() + " " caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip() caption = caption.strip()
else: else:
caption = None caption = None
@ -686,7 +698,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
header = child.xpath("title|label") header = child.xpath("title|label")
text: Optional[str] = None text: Optional[str] = None
if len(header) > 0: if len(header) > 0:
text = PubMedDocumentBackend._get_text(header[0]) text = JatsDocumentBackend._get_text(header[0])
elif child.tag == "ack": elif child.tag == "ack":
text = DEFAULT_HEADER_ACKNOWLEDGMENTS text = DEFAULT_HEADER_ACKNOWLEDGMENTS
if text: if text:
@ -698,7 +710,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
elif child.tag == "list-item": elif child.tag == "list-item":
# TODO: address any type of content (another list, formula,...) # TODO: address any type of content (another list, formula,...)
# TODO: address list type and item label # TODO: address list type and item label
text = PubMedDocumentBackend._get_text(child).strip() text = JatsDocumentBackend._get_text(child).strip()
new_parent = doc.add_list_item(text=text, parent=parent) new_parent = doc.add_list_item(text=text, parent=parent)
stop_walk = True stop_walk = True
elif child.tag == "fig": elif child.tag == "fig":
@ -712,14 +724,14 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
elif child.tag == "fn-group": elif child.tag == "fn-group":
# header = child.xpath(".//title") or child.xpath(".//label") # header = child.xpath(".//title") or child.xpath(".//label")
# if header: # if header:
# text = PubMedDocumentBackend._get_text(header[0]) # text = JatsDocumentBackend._get_text(header[0])
# fn_parent = doc.add_heading(text=text, parent=new_parent) # fn_parent = doc.add_heading(text=text, parent=new_parent)
# self._add_footnote_group(doc, fn_parent, child) # self._add_footnote_group(doc, fn_parent, child)
stop_walk = True stop_walk = True
elif child.tag == "ref-list" and node.tag != "ref-list": elif child.tag == "ref-list" and node.tag != "ref-list":
header = child.xpath("title|label") header = child.xpath("title|label")
text = ( text = (
PubMedDocumentBackend._get_text(header[0]) JatsDocumentBackend._get_text(header[0])
if len(header) > 0 if len(header) > 0
else DEFAULT_HEADER_REFERENCES else DEFAULT_HEADER_REFERENCES
) )
@ -732,7 +744,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
self._add_citation(doc, parent, text) self._add_citation(doc, parent, text)
stop_walk = True stop_walk = True
elif child.tag == "mixed-citation": elif child.tag == "mixed-citation":
text = PubMedDocumentBackend._get_text(child).strip() text = JatsDocumentBackend._get_text(child).strip()
self._add_citation(doc, parent, text) self._add_citation(doc, parent, text)
stop_walk = True stop_walk = True
elif child.tag == "tex-math": elif child.tag == "tex-math":

View File

@ -34,7 +34,6 @@ class InputFormat(str, Enum):
DOCX = "docx" DOCX = "docx"
PPTX = "pptx" PPTX = "pptx"
HTML = "html" HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image" IMAGE = "image"
PDF = "pdf" PDF = "pdf"
ASCIIDOC = "asciidoc" ASCIIDOC = "asciidoc"
@ -42,6 +41,7 @@ class InputFormat(str, Enum):
CSV = "csv" CSV = "csv"
XLSX = "xlsx" XLSX = "xlsx"
XML_USPTO = "xml_uspto" XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling" JSON_DOCLING = "json_docling"
@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"], InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"], InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"], InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml", "nxml"], InputFormat.XML_JATS: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"], InputFormat.CSV: ["csv"],
@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
], ],
InputFormat.HTML: ["text/html", "application/xhtml+xml"], InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"], InputFormat.XML_JATS: ["application/xml"],
InputFormat.IMAGE: [ InputFormat.IMAGE: [
"image/png", "image/png",
"image/jpeg", "image/jpeg",

View File

@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
): ):
input_format = InputFormat.XML_USPTO input_format = InputFormat.XML_USPTO
if InputFormat.XML_PUBMED in formats and ( if InputFormat.XML_JATS in formats and (
"JATS-journalpublishing" in xml_doctype "JATS-journalpublishing" in xml_doctype
or "JATS-archive" in xml_doctype or "JATS-archive" in xml_doctype
): ):
input_format = InputFormat.XML_PUBMED input_format = InputFormat.XML_JATS
elif mime == "text/plain": elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"): if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):

View File

@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class XMLPubMedFormatOption(FormatOption): class XMLJatsFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
class ImageFormatOption(FormatOption): class ImageFormatOption(FormatOption):
@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption( InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
), ),
InputFormat.XML_PUBMED: FormatOption( InputFormat.XML_JATS: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
), ),
InputFormat.IMAGE: FormatOption( InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend

View File

@ -82,7 +82,7 @@
"from docling.document_converter import DocumentConverter\n", "from docling.document_converter import DocumentConverter\n",
"\n", "\n",
"# a sample PMC article:\n", "# a sample PMC article:\n",
"source = \"../../tests/data/pubmed/elife-56337.nxml\"\n", "source = \"../../tests/data/jats/elife-56337.nxml\"\n",
"converter = DocumentConverter()\n", "converter = DocumentConverter()\n",
"result = converter.convert(source)\n", "result = converter.convert(source)\n",
"print(result.status)" "print(result.status)"
@ -97,7 +97,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -106,11 +106,11 @@
"text": [ "text": [
"# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n", "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
"\n", "\n",
"Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n", "Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
"\n",
"The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
"\n", "\n",
"## Abstract\n", "## Abstract\n",
"\n",
"The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
"\n" "\n"
] ]
} }
@ -131,7 +131,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -198,7 +198,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -224,7 +224,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -261,7 +261,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -313,7 +313,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -359,9 +359,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
"Parsing zip file, splitting into XML sections, and exporting to files...\n"
]
}
],
"source": [ "source": [
"import zipfile\n", "import zipfile\n",
"\n", "\n",
@ -407,7 +416,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -435,7 +444,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -449,7 +458,7 @@
{ {
"data": { "data": {
"application/vnd.jupyter.widget-view+json": { "application/vnd.jupyter.widget-view+json": {
"model_id": "3964d1ff30f74588a2f6b53ca8865a9f", "model_id": "316241ca89a843bda3170f2a5c76c639",
"version_major": 2, "version_major": 2,
"version_minor": 0 "version_minor": 0
}, },
@ -471,7 +480,7 @@
"source": [ "source": [
"from tqdm.notebook import tqdm\n", "from tqdm.notebook import tqdm\n",
"\n", "\n",
"from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n", "from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
"from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n", "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
"from docling.datamodel.base_models import InputFormat\n", "from docling.datamodel.base_models import InputFormat\n",
"from docling.datamodel.document import InputDocument\n", "from docling.datamodel.document import InputDocument\n",
@ -479,10 +488,10 @@
"# check PMC\n", "# check PMC\n",
"in_doc = InputDocument(\n", "in_doc = InputDocument(\n",
" path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n", " path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
" format=InputFormat.XML_PUBMED,\n", " format=InputFormat.XML_JATS,\n",
" backend=PubMedDocumentBackend,\n", " backend=JatsDocumentBackend,\n",
")\n", ")\n",
"backend = PubMedDocumentBackend(\n", "backend = JatsDocumentBackend(\n",
" in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n", " in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
")\n", ")\n",
"print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n", "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
@ -521,7 +530,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -543,7 +552,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files." "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
] ]
}, },
{ {
@ -579,7 +588,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -607,7 +616,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -625,144 +634,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
"Loading files: 51%|█████ | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e9208639f1a4418d97267a28305d18fa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Parsing nodes: 0%| | 0/99 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "88026613f6f44f0c8476dceaa1cb78cd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7522b8b434b54616b4cfc3d71e9556d7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5879d8161c2041f5b100959e69ff9017",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "557912b5e3c741f3a06127156bc46379",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "843bb145942b449aa55fc5b8208da734",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c7dba09a4aed422998e9b9c2c3a70317",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/425 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [ "source": [
"from llama_index.core import StorageContext, VectorStoreIndex\n", "from llama_index.core import StorageContext, VectorStoreIndex\n",
"from llama_index.vector_stores.milvus import MilvusVectorStore\n", "from llama_index.vector_stores.milvus import MilvusVectorStore\n",

View File

@ -21,7 +21,7 @@ Schema-specific support:
| Format | Description | | Format | Description |
|--------|-------------| |--------|-------------|
| USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents | | USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
| PMC XML | XML format followed by [PubMed Central®](https://pmc.ncbi.nlm.nih.gov/) articles | | JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
| Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) | | Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |
## Supported output formats ## Supported output formats

View File

@ -19,7 +19,7 @@ def get_pubmed_paths():
def get_converter(): def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.XML_PUBMED]) converter = DocumentConverter(allowed_formats=[InputFormat.XML_JATS])
return converter return converter

View File

@ -130,24 +130,24 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/uspto/pftaps057006474.txt") doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML PubMed # Valid XML JATS
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read()) buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf) stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/pubmed/elife-56337.xml") doc_path = Path("./tests/data/jats/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read()) buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf) stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/pubmed/elife-56337.nxml") doc_path = Path("./tests/data/jats/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read()) buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf) stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/pubmed/elife-56337.txt") doc_path = Path("./tests/data/jats/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED assert dci._guess_format(doc_path) == InputFormat.XML_JATS
# Valid XML, non-supported flavor # Valid XML, non-supported flavor
xml_content = ( xml_content = (