chore(xml-jats): rename PubMed objects to JATS

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-08-01 23:12:20 +00:00 · 2025-02-14 14:59:57 +01:00 · 2025-02-14 14:59:57 +01:00 · 93eb9de871
commit 93eb9de871
parent 011dd6ce96
19 changed files with 95 additions and 209 deletions
--- a/docling/backend/xml/pubmed_backend.py
+++ b/docling/backend/xml/pubmed_backend.py
@ -66,11 +66,21 @@ class XMLComponents(TypedDict):
    abstract: list[Abstract]
-class PubMedDocumentBackend(DeclarativeDocumentBackend):
+class JatsDocumentBackend(DeclarativeDocumentBackend):
-    """
+    """Backend to parse articles in XML format tagged according to the JATS definition.
-    The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
+
    The Journal Article Tag Suite (JATS) is a definition standard for the
    representation of journal articles in XML format. Several publishers and journal
    archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
    medRxiv, or Springer Nature.
    Refer to https://jats.nlm.nih.gov for more details on JATS.
    The code from this document backend has been developed by modifying parts of the
    PubMed Parser library (version 0.5.0, released on 12.08.2024):
    Achakulvisut et al., (2020).
-    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
+    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
      Dataset XML Dataset.
    Journal of Open Source Software, 5(46), 1979,
    https://doi.org/10.21105/joss.01979
    """
@ -105,7 +115,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                    return
        except Exception as exc:
            raise RuntimeError(
-                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
+                f"Could not initialize JATS backend for file with hash {self.document_hash}."
            ) from exc
    @override
@ -126,7 +136,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
-        return {InputFormat.XML_PUBMED}
+        return {InputFormat.XML_JATS}
    @override
    def convert(self) -> DoclingDocument:
@ -170,7 +180,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        for child in list(node):
            if child.tag not in skip_tags:
                # TODO: apply styling according to child.tag when supported by docling-core
-                text += PubMedDocumentBackend._get_text(child, sep)
+                text += JatsDocumentBackend._get_text(child, sep)
            if sep:
                text = text.rstrip(sep) + sep
            text += child.tail.replace("\n", " ") if child.tail else ""
@ -196,7 +206,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            abstract: Abstract = dict(label="", content="")
            texts = []
            for abs_par in abs_node.xpath("p"):
-                texts.append(PubMedDocumentBackend._get_text(abs_par).strip())
+                texts.append(JatsDocumentBackend._get_text(abs_par).strip())
            abstract["content"] = " ".join(texts)
            label_node = abs_node.xpath("title|label")
@ -280,7 +290,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        return text
    def _parse_metadata(self) -> XMLComponents:
-        """Parsing PubMed document metadata."""
+        """Parsing JATS document metadata."""
        xml_components: XMLComponents = {
            "title": self._parse_title(),
            "authors": self._parse_authors(),
@ -385,7 +395,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                title_node = name_node[0]
                break
        citation["title"] = (
-            PubMedDocumentBackend._get_text(title_node)
+            JatsDocumentBackend._get_text(title_node)
            if title_node is not None
            else node.text.replace("\n", " ").strip()
        )
@ -415,7 +425,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                id_text = id_node.text
                if id_type and id_text:
                    pub_id.append(
-                        f"{id_type.replace("\n", " ").strip().upper()}: {id_text.replace("\n", " ").strip()}"
+                        id_type.replace("\n", " ").strip().upper()
                        + ": "
                        + id_text.replace("\n", " ").strip()
                    )
            if pub_id:
                citation["pub_id"] = ", ".join(pub_id)
@ -428,9 +440,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        elif len(node.xpath("fpage")) > 0:
            citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
            if len(node.xpath("lpage")) > 0:
-                citation[
+                citation["page"] += (
-                    "page"
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
-                ] += f"–{node.xpath('lpage')[0].text.replace("\n", " ").strip()}"
+                )
        # Flatten the citation to string
@ -447,7 +459,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            text += citation["publisher_name"] + ". "
        if citation["volume"]:
            text = text.rstrip(". ")
-            text += f" {citation["volume"]}. "
+            text += f" {citation['volume']}. "
        if citation["page"]:
            text = text.rstrip(". ")
            if citation["volume"]:
@ -480,7 +492,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    ) -> None:
        label_node = node.xpath("label")
        label: Optional[str] = (
-            PubMedDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
+            JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
        )
        caption_node = node.xpath("caption")
@ -490,7 +502,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            for caption_par in list(caption_node[0]):
                if caption_par.xpath(".//supplementary-material"):
                    continue
-                caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
+                caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
            caption = caption.strip()
        else:
            caption = None
@ -511,7 +523,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
    #     new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
    #     for child in node.iterchildren(tag="fn"):
-    #         text = PubMedDocumentBackend._get_text(child)
+    #         text = JatsDocumentBackend._get_text(child)
    #         doc.add_list_item(text=text, parent=new_parent)
    def _add_metadata(
@ -631,7 +643,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            for caption_par in list(caption_node[0]):
                if caption_par.xpath(".//supplementary-material"):
                    continue
-                caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
+                caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
            caption = caption.strip()
        else:
            caption = None
@ -686,7 +698,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                header = child.xpath("title|label")
                text: Optional[str] = None
                if len(header) > 0:
-                    text = PubMedDocumentBackend._get_text(header[0])
+                    text = JatsDocumentBackend._get_text(header[0])
                elif child.tag == "ack":
                    text = DEFAULT_HEADER_ACKNOWLEDGMENTS
                if text:
@ -698,7 +710,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            elif child.tag == "list-item":
                # TODO: address any type of content (another list, formula,...)
                # TODO: address list type and item label
-                text = PubMedDocumentBackend._get_text(child).strip()
+                text = JatsDocumentBackend._get_text(child).strip()
                new_parent = doc.add_list_item(text=text, parent=parent)
                stop_walk = True
            elif child.tag == "fig":
@ -712,14 +724,14 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            elif child.tag == "fn-group":
                # header = child.xpath(".//title") or child.xpath(".//label")
                # if header:
-                #     text = PubMedDocumentBackend._get_text(header[0])
+                #     text = JatsDocumentBackend._get_text(header[0])
                #     fn_parent = doc.add_heading(text=text, parent=new_parent)
                # self._add_footnote_group(doc, fn_parent, child)
                stop_walk = True
            elif child.tag == "ref-list" and node.tag != "ref-list":
                header = child.xpath("title|label")
                text = (
-                    PubMedDocumentBackend._get_text(header[0])
+                    JatsDocumentBackend._get_text(header[0])
                    if len(header) > 0
                    else DEFAULT_HEADER_REFERENCES
                )
@ -732,7 +744,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                self._add_citation(doc, parent, text)
                stop_walk = True
            elif child.tag == "mixed-citation":
-                text = PubMedDocumentBackend._get_text(child).strip()
+                text = JatsDocumentBackend._get_text(child).strip()
                self._add_citation(doc, parent, text)
                stop_walk = True
            elif child.tag == "tex-math":
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -34,7 +34,6 @@ class InputFormat(str, Enum):
    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    XML_PUBMED = "xml_pubmed"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
@ -42,6 +41,7 @@ class InputFormat(str, Enum):
    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"
@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
-    InputFormat.XML_PUBMED: ["xml", "nxml"],
+    InputFormat.XML_JATS: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
-    InputFormat.XML_PUBMED: ["application/xml"],
+    InputFormat.XML_JATS: ["application/xml"],
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
                ):
                    input_format = InputFormat.XML_USPTO
-                if InputFormat.XML_PUBMED in formats and (
+                if InputFormat.XML_JATS in formats and (
                    "JATS-journalpublishing" in xml_doctype
                    or "JATS-archive" in xml_doctype
                ):
-                    input_format = InputFormat.XML_PUBMED
+                    input_format = InputFormat.XML_JATS
        elif mime == "text/plain":
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
+from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
    ConversionStatus,
@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
-class XMLPubMedFormatOption(FormatOption):
+class XMLJatsFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
-    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
 class ImageFormatOption(FormatOption):
@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
        InputFormat.XML_USPTO: FormatOption(
            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
        ),
-        InputFormat.XML_PUBMED: FormatOption(
+        InputFormat.XML_JATS: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
        ),
        InputFormat.IMAGE: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@ -82,7 +82,7 @@
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# a sample PMC article:\n",
-    "source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
+    "source = \"../../tests/data/jats/elife-56337.nxml\"\n",
    "converter = DocumentConverter()\n",
    "result = converter.convert(source)\n",
    "print(result.status)"
@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@ -106,11 +106,11 @@
     "text": [
      "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
      "\n",
-      "Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
+      "Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
      "\n",
      "The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
      "\n",
      "## Abstract\n",
      "\n",
      "The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
      "\n"
     ]
    }
@ -131,7 +131,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@ -198,7 +198,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -224,7 +224,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -261,7 +261,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -313,7 +313,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@ -359,9 +359,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
      "Parsing zip file, splitting into XML sections, and exporting to files...\n"
     ]
    }
   ],
   "source": [
    "import zipfile\n",
    "\n",
@ -407,7 +416,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -435,7 +444,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -449,7 +458,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
+       "model_id": "316241ca89a843bda3170f2a5c76c639",
       "version_major": 2,
       "version_minor": 0
      },
@ -471,7 +480,7 @@
   "source": [
    "from tqdm.notebook import tqdm\n",
    "\n",
-    "from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
+    "from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
    "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.document import InputDocument\n",
@ -479,10 +488,10 @@
    "# check PMC\n",
    "in_doc = InputDocument(\n",
    "    path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
-    "    format=InputFormat.XML_PUBMED,\n",
+    "    format=InputFormat.XML_JATS,\n",
-    "    backend=PubMedDocumentBackend,\n",
+    "    backend=JatsDocumentBackend,\n",
    ")\n",
-    "backend = PubMedDocumentBackend(\n",
+    "backend = JatsDocumentBackend(\n",
    "    in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
    ")\n",
    "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
@ -521,7 +530,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@ -543,7 +552,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
+    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
   ]
  },
  {
@ -579,7 +588,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@ -607,7 +616,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -625,144 +634,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
      "Loading files:  51%|█████     | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e9208639f1a4418d97267a28305d18fa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Parsing nodes:   0%|          | 0/99 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "88026613f6f44f0c8476dceaa1cb78cd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7522b8b434b54616b4cfc3d71e9556d7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5879d8161c2041f5b100959e69ff9017",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "557912b5e3c741f3a06127156bc46379",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "843bb145942b449aa55fc5b8208da734",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c7dba09a4aed422998e9b9c2c3a70317",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating embeddings:   0%|          | 0/425 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from llama_index.core import StorageContext, VectorStoreIndex\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
--- a/docs/supported_formats.md
+++ b/docs/supported_formats.md
@ -21,7 +21,7 @@ Schema-specific support:
 | Format | Description |
 |--------|-------------|
 | USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
-| PMC XML | XML format followed by [PubMed Central®](https://pmc.ncbi.nlm.nih.gov/) articles |
+| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
 | Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |
 ## Supported output formats
--- a/tests/data/pubmed/bmj_sample.xml
+++ b/tests/data/pubmed/bmj_sample.xml
--- a/tests/data/pubmed/elife-56337.nxml
+++ b/tests/data/pubmed/elife-56337.nxml
--- a/tests/data/pubmed/elife-56337.txt
+++ b/tests/data/pubmed/elife-56337.txt
--- a/tests/data/pubmed/elife-56337.xml
+++ b/tests/data/pubmed/elife-56337.xml
--- a/tests/data/pubmed/pnas_sample.xml
+++ b/tests/data/pubmed/pnas_sample.xml
--- a/tests/data/pubmed/pntd.0008301.nxml
+++ b/tests/data/pubmed/pntd.0008301.nxml
--- a/tests/data/pubmed/pntd.0008301.txt
+++ b/tests/data/pubmed/pntd.0008301.txt
--- a/tests/data/pubmed/pntd.0008301.xml
+++ b/tests/data/pubmed/pntd.0008301.xml
--- a/tests/data/pubmed/pone.0234687.nxml
+++ b/tests/data/pubmed/pone.0234687.nxml
--- a/tests/data/pubmed/pone.0234687.txt
+++ b/tests/data/pubmed/pone.0234687.txt
--- a/tests/data/pubmed/pone.0234687.xml
+++ b/tests/data/pubmed/pone.0234687.xml
--- a/tests/test_backend_pubmed.py
+++ b/tests/test_backend_pubmed.py
@ -19,7 +19,7 @@ def get_pubmed_paths():
 def get_converter():
-    converter = DocumentConverter(allowed_formats=[InputFormat.XML_PUBMED])
+    converter = DocumentConverter(allowed_formats=[InputFormat.XML_JATS])
    return converter
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@ -130,24 +130,24 @@ def test_guess_format(tmp_path):
    doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
    assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
-    # Valid XML PubMed
+    # Valid XML JATS
-    buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
+    buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
    stream = DocumentStream(name="elife-56337.xml", stream=buf)
-    assert dci._guess_format(stream) == InputFormat.XML_PUBMED
+    assert dci._guess_format(stream) == InputFormat.XML_JATS
-    doc_path = Path("./tests/data/pubmed/elife-56337.xml")
+    doc_path = Path("./tests/data/jats/elife-56337.xml")
-    assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
+    assert dci._guess_format(doc_path) == InputFormat.XML_JATS
-    buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
+    buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
    stream = DocumentStream(name="elife-56337.nxml", stream=buf)
-    assert dci._guess_format(stream) == InputFormat.XML_PUBMED
+    assert dci._guess_format(stream) == InputFormat.XML_JATS
-    doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
+    doc_path = Path("./tests/data/jats/elife-56337.nxml")
-    assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
+    assert dci._guess_format(doc_path) == InputFormat.XML_JATS
-    buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
+    buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
    stream = DocumentStream(name="elife-56337.txt", stream=buf)
-    assert dci._guess_format(stream) == InputFormat.XML_PUBMED
+    assert dci._guess_format(stream) == InputFormat.XML_JATS
-    doc_path = Path("./tests/data/pubmed/elife-56337.txt")
+    doc_path = Path("./tests/data/jats/elife-56337.txt")
-    assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
+    assert dci._guess_format(doc_path) == InputFormat.XML_JATS
    # Valid XML, non-supported flavor
    xml_content = (