chore(xml-jats): rename PubMed objects to JATS

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-07-31 14:34:40 +00:00 · 2025-02-14 14:59:57 +01:00 · 2025-02-14 14:59:57 +01:00 · 93eb9de871
commit 93eb9de871
parent 011dd6ce96
19 changed files with 95 additions and 209 deletions
--- a/docling/backend/xml/pubmed_backend.py
+++ b/docling/backend/xml/pubmed_backend.py
@ -66,11 +66,21 @@ class XMLComponents(TypedDict):
    abstract: list[Abstract]


-class PubMedDocumentBackend(DeclarativeDocumentBackend):
-    """
-    The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
+class JatsDocumentBackend(DeclarativeDocumentBackend):
+    """Backend to parse articles in XML format tagged according to the JATS definition.
+
+    The Journal Article Tag Suite (JATS) is a definition standard for the
+    representation of journal articles in XML format. Several publishers and journal
+    archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
+    medRxiv, or Springer Nature.
+
+    Refer to https://jats.nlm.nih.gov for more details on JATS.
+
+    The code from this document backend has been developed by modifying parts of the
+    PubMed Parser library (version 0.5.0, released on 12.08.2024):
    Achakulvisut et al., (2020).
-    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
+    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
+      Dataset XML Dataset.
    Journal of Open Source Software, 5(46), 1979,
    https://doi.org/10.21105/joss.01979
    """
@ -105,7 +115,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                    return
        except Exception as exc:
            raise RuntimeError(
-                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
+                f"Could not initialize JATS backend for file with hash {self.document_hash}."
            ) from exc

    @override
@ -126,7 +136,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
-        return {InputFormat.XML_PUBMED}
+        return {InputFormat.XML_JATS}

    @override
    def convert(self) -> DoclingDocument:
@ -170,7 +180,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        for child in list(node):
            if child.tag not in skip_tags:
                # TODO: apply styling according to child.tag when supported by docling-core
-                text += PubMedDocumentBackend._get_text(child, sep)
+                text += JatsDocumentBackend._get_text(child, sep)
            if sep:
                text = text.rstrip(sep) + sep
            text += child.tail.replace("\n", " ") if child.tail else ""
@ -196,7 +206,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            abstract: Abstract = dict(label="", content="")
            texts = []
            for abs_par in abs_node.xpath("p"):
-                texts.append(PubMedDocumentBackend._get_text(abs_par).strip())
+                texts.append(JatsDocumentBackend._get_text(abs_par).strip())
            abstract["content"] = " ".join(texts)

            label_node = abs_node.xpath("title|label")
@ -280,7 +290,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        return text

    def _parse_metadata(self) -> XMLComponents:
-        """Parsing PubMed document metadata."""
+        """Parsing JATS document metadata."""
        xml_components: XMLComponents = {
            "title": self._parse_title(),
            "authors": self._parse_authors(),
@ -385,7 +395,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                title_node = name_node[0]
                break
        citation["title"] = (
-            PubMedDocumentBackend._get_text(title_node)
+            JatsDocumentBackend._get_text(title_node)
            if title_node is not None
            else node.text.replace("\n", " ").strip()
        )
@ -415,7 +425,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                id_text = id_node.text
                if id_type and id_text:
                    pub_id.append(
-                        f"{id_type.replace("\n", " ").strip().upper()}: {id_text.replace("\n", " ").strip()}"
+                        id_type.replace("\n", " ").strip().upper()
+                        + ": "
+                        + id_text.replace("\n", " ").strip()
                    )
            if pub_id:
                citation["pub_id"] = ", ".join(pub_id)
@ -428,9 +440,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        elif len(node.xpath("fpage")) > 0:
            citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
            if len(node.xpath("lpage")) > 0:
-                citation[
-                    "page"
-                ] += f"–{node.xpath('lpage')[0].text.replace("\n", " ").strip()}"
+                citation["page"] += (
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                )

        # Flatten the citation to string

@ -447,7 +459,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            text += citation["publisher_name"] + ". "
        if citation["volume"]:
            text = text.rstrip(". ")
-            text += f" {citation["volume"]}. "
+            text += f" {citation['volume']}. "
        if citation["page"]:
            text = text.rstrip(". ")
            if citation["volume"]:
@ -480,7 +492,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    ) -> None:
        label_node = node.xpath("label")
        label: Optional[str] = (
-            PubMedDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
+            JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
        )

        caption_node = node.xpath("caption")
@ -490,7 +502,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            for caption_par in list(caption_node[0]):
                if caption_par.xpath(".//supplementary-material"):
                    continue
-                caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
+                caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
            caption = caption.strip()
        else:
            caption = None
@ -511,7 +523,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
    #     new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
    #     for child in node.iterchildren(tag="fn"):
-    #         text = PubMedDocumentBackend._get_text(child)
+    #         text = JatsDocumentBackend._get_text(child)
    #         doc.add_list_item(text=text, parent=new_parent)

    def _add_metadata(
@ -631,7 +643,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            for caption_par in list(caption_node[0]):
                if caption_par.xpath(".//supplementary-material"):
                    continue
-                caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
+                caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
            caption = caption.strip()
        else:
            caption = None
@ -686,7 +698,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                header = child.xpath("title|label")
                text: Optional[str] = None
                if len(header) > 0:
-                    text = PubMedDocumentBackend._get_text(header[0])
+                    text = JatsDocumentBackend._get_text(header[0])
                elif child.tag == "ack":
                    text = DEFAULT_HEADER_ACKNOWLEDGMENTS
                if text:
@ -698,7 +710,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            elif child.tag == "list-item":
                # TODO: address any type of content (another list, formula,...)
                # TODO: address list type and item label
-                text = PubMedDocumentBackend._get_text(child).strip()
+                text = JatsDocumentBackend._get_text(child).strip()
                new_parent = doc.add_list_item(text=text, parent=parent)
                stop_walk = True
            elif child.tag == "fig":
@ -712,14 +724,14 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            elif child.tag == "fn-group":
                # header = child.xpath(".//title") or child.xpath(".//label")
                # if header:
-                #     text = PubMedDocumentBackend._get_text(header[0])
+                #     text = JatsDocumentBackend._get_text(header[0])
                #     fn_parent = doc.add_heading(text=text, parent=new_parent)
                # self._add_footnote_group(doc, fn_parent, child)
                stop_walk = True
            elif child.tag == "ref-list" and node.tag != "ref-list":
                header = child.xpath("title|label")
                text = (
-                    PubMedDocumentBackend._get_text(header[0])
+                    JatsDocumentBackend._get_text(header[0])
                    if len(header) > 0
                    else DEFAULT_HEADER_REFERENCES
                )
@ -732,7 +744,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                self._add_citation(doc, parent, text)
                stop_walk = True
            elif child.tag == "mixed-citation":
-                text = PubMedDocumentBackend._get_text(child).strip()
+                text = JatsDocumentBackend._get_text(child).strip()
                self._add_citation(doc, parent, text)
                stop_walk = True
            elif child.tag == "tex-math":
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -34,7 +34,6 @@ class InputFormat(str, Enum):
    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
-    XML_PUBMED = "xml_pubmed"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
@ -42,6 +41,7 @@ class InputFormat(str, Enum):
    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
+    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"


@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
-    InputFormat.XML_PUBMED: ["xml", "nxml"],
+    InputFormat.XML_JATS: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
-    InputFormat.XML_PUBMED: ["application/xml"],
+    InputFormat.XML_JATS: ["application/xml"],
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
                ):
                    input_format = InputFormat.XML_USPTO

-                if InputFormat.XML_PUBMED in formats and (
+                if InputFormat.XML_JATS in formats and (
                    "JATS-journalpublishing" in xml_doctype
                    or "JATS-archive" in xml_doctype
                ):
-                    input_format = InputFormat.XML_PUBMED
+                    input_format = InputFormat.XML_JATS

        elif mime == "text/plain":
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
+from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
    ConversionStatus,
@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


-class XMLPubMedFormatOption(FormatOption):
+class XMLJatsFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
-    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend


 class ImageFormatOption(FormatOption):
@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
        InputFormat.XML_USPTO: FormatOption(
            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
        ),
-        InputFormat.XML_PUBMED: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+        InputFormat.XML_JATS: FormatOption(
+            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
        ),
        InputFormat.IMAGE: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@ -82,7 +82,7 @@
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# a sample PMC article:\n",
-    "source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
+    "source = \"../../tests/data/jats/elife-56337.nxml\"\n",
    "converter = DocumentConverter()\n",
    "result = converter.convert(source)\n",
    "print(result.status)"
@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@ -106,11 +106,11 @@
     "text": [
      "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
      "\n",
-      "Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
+      "Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
+      "\n",
+      "The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
      "\n",
      "## Abstract\n",
-      "\n",
-      "The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
      "\n"
     ]
    }
@ -131,7 +131,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@ -198,7 +198,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -224,7 +224,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -261,7 +261,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -313,7 +313,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@ -359,9 +359,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
+      "Parsing zip file, splitting into XML sections, and exporting to files...\n"
+     ]
+    }
+   ],
   "source": [
    "import zipfile\n",
    "\n",
@ -407,7 +416,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -435,7 +444,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -449,7 +458,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
+       "model_id": "316241ca89a843bda3170f2a5c76c639",
       "version_major": 2,
       "version_minor": 0
      },
@ -471,7 +480,7 @@
   "source": [
    "from tqdm.notebook import tqdm\n",
    "\n",
-    "from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
+    "from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
    "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.document import InputDocument\n",
@ -479,10 +488,10 @@
    "# check PMC\n",
    "in_doc = InputDocument(\n",
    "    path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
-    "    format=InputFormat.XML_PUBMED,\n",
-    "    backend=PubMedDocumentBackend,\n",
+    "    format=InputFormat.XML_JATS,\n",
+    "    backend=JatsDocumentBackend,\n",
    ")\n",
-    "backend = PubMedDocumentBackend(\n",
+    "backend = JatsDocumentBackend(\n",
    "    in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
    ")\n",
    "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
@ -521,7 +530,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@ -543,7 +552,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
+    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
   ]
  },
  {
@ -579,7 +588,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@ -607,7 +616,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -625,144 +634,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
-      "Loading files:  51%|█████     | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e9208639f1a4418d97267a28305d18fa",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Parsing nodes:   0%|          | 0/99 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "88026613f6f44f0c8476dceaa1cb78cd",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7522b8b434b54616b4cfc3d71e9556d7",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5879d8161c2041f5b100959e69ff9017",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "557912b5e3c741f3a06127156bc46379",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "843bb145942b449aa55fc5b8208da734",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c7dba09a4aed422998e9b9c2c3a70317",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/425 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from llama_index.core import StorageContext, VectorStoreIndex\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
--- a/docs/supported_formats.md
+++ b/docs/supported_formats.md
@ -21,7 +21,7 @@ Schema-specific support:
 | Format | Description |
 |--------|-------------|
 | USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
-| PMC XML | XML format followed by [PubMed Central®](https://pmc.ncbi.nlm.nih.gov/) articles |
+| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
 | Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |

 ## Supported output formats
--- a/tests/data/pubmed/bmj_sample.xml
+++ b/tests/data/pubmed/bmj_sample.xml
--- a/tests/data/pubmed/elife-56337.nxml
+++ b/tests/data/pubmed/elife-56337.nxml
--- a/tests/data/pubmed/elife-56337.txt
+++ b/tests/data/pubmed/elife-56337.txt
--- a/tests/data/pubmed/elife-56337.xml
+++ b/tests/data/pubmed/elife-56337.xml
--- a/tests/data/pubmed/pnas_sample.xml
+++ b/tests/data/pubmed/pnas_sample.xml
--- a/tests/data/pubmed/pntd.0008301.nxml
+++ b/tests/data/pubmed/pntd.0008301.nxml
--- a/tests/data/pubmed/pntd.0008301.txt
+++ b/tests/data/pubmed/pntd.0008301.txt
--- a/tests/data/pubmed/pntd.0008301.xml
+++ b/tests/data/pubmed/pntd.0008301.xml
--- a/tests/data/pubmed/pone.0234687.nxml
+++ b/tests/data/pubmed/pone.0234687.nxml
--- a/tests/data/pubmed/pone.0234687.txt
+++ b/tests/data/pubmed/pone.0234687.txt
--- a/tests/data/pubmed/pone.0234687.xml
+++ b/tests/data/pubmed/pone.0234687.xml
--- a/tests/test_backend_pubmed.py
+++ b/tests/test_backend_pubmed.py
@ -19,7 +19,7 @@ def get_pubmed_paths():


 def get_converter():
-    converter = DocumentConverter(allowed_formats=[InputFormat.XML_PUBMED])
+    converter = DocumentConverter(allowed_formats=[InputFormat.XML_JATS])
    return converter


--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@ -130,24 +130,24 @@ def test_guess_format(tmp_path):
    doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
    assert dci._guess_format(doc_path) == InputFormat.XML_USPTO

-    # Valid XML PubMed
-    buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
+    # Valid XML JATS
+    buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
    stream = DocumentStream(name="elife-56337.xml", stream=buf)
-    assert dci._guess_format(stream) == InputFormat.XML_PUBMED
-    doc_path = Path("./tests/data/pubmed/elife-56337.xml")
-    assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
+    assert dci._guess_format(stream) == InputFormat.XML_JATS
+    doc_path = Path("./tests/data/jats/elife-56337.xml")
+    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

-    buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
+    buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
    stream = DocumentStream(name="elife-56337.nxml", stream=buf)
-    assert dci._guess_format(stream) == InputFormat.XML_PUBMED
-    doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
-    assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
+    assert dci._guess_format(stream) == InputFormat.XML_JATS
+    doc_path = Path("./tests/data/jats/elife-56337.nxml")
+    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

-    buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
+    buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
    stream = DocumentStream(name="elife-56337.txt", stream=buf)
-    assert dci._guess_format(stream) == InputFormat.XML_PUBMED
-    doc_path = Path("./tests/data/pubmed/elife-56337.txt")
-    assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
+    assert dci._guess_format(stream) == InputFormat.XML_JATS
+    doc_path = Path("./tests/data/jats/elife-56337.txt")
+    assert dci._guess_format(doc_path) == InputFormat.XML_JATS

    # Valid XML, non-supported flavor
    xml_content = (