diff --git a/docling/backend/xml/pubmed_backend.py b/docling/backend/xml/jats_backend.py similarity index 92% rename from docling/backend/xml/pubmed_backend.py rename to docling/backend/xml/jats_backend.py index ab28b4bd..1d7091cd 100755 --- a/docling/backend/xml/pubmed_backend.py +++ b/docling/backend/xml/jats_backend.py @@ -66,11 +66,21 @@ class XMLComponents(TypedDict): abstract: list[Abstract] -class PubMedDocumentBackend(DeclarativeDocumentBackend): - """ - The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024): +class JatsDocumentBackend(DeclarativeDocumentBackend): + """Backend to parse articles in XML format tagged according to JATS definition. + + The Journal Article Tag Suite (JATS) is a definition standard for the + representation of journal articles in XML format. Several publishers and journal + archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv, + medRxiv, or Springer Nature. + + Refer to https://jats.nlm.nih.gov for more details on JATS. + + The code from this document backend has been developed by modifying parts of the + PubMed Parser library (version 0.5.0, released on 12.08.2024): Achakulvisut et al., (2020). - Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset. + Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML + Dataset XML Dataset. Journal of Open Source Software, 5(46), 1979, https://doi.org/10.21105/joss.01979 """ @@ -105,7 +115,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): return except Exception as exc: raise RuntimeError( - f"Could not initialize PubMed backend for file with hash {self.document_hash}." + f"Could not initialize JATS backend for file with hash {self.document_hash}." 
) from exc @override @@ -126,7 +136,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): @classmethod @override def supported_formats(cls) -> set[InputFormat]: - return {InputFormat.XML_PUBMED} + return {InputFormat.XML_JATS} @override def convert(self) -> DoclingDocument: @@ -170,7 +180,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): for child in list(node): if child.tag not in skip_tags: # TODO: apply styling according to child.tag when supported by docling-core - text += PubMedDocumentBackend._get_text(child, sep) + text += JatsDocumentBackend._get_text(child, sep) if sep: text = text.rstrip(sep) + sep text += child.tail.replace("\n", " ") if child.tail else "" @@ -196,7 +206,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): abstract: Abstract = dict(label="", content="") texts = [] for abs_par in abs_node.xpath("p"): - texts.append(PubMedDocumentBackend._get_text(abs_par).strip()) + texts.append(JatsDocumentBackend._get_text(abs_par).strip()) abstract["content"] = " ".join(texts) label_node = abs_node.xpath("title|label") @@ -280,7 +290,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): return text def _parse_metadata(self) -> XMLComponents: - """Parsing PubMed document metadata.""" + """Parsing JATS document metadata.""" xml_components: XMLComponents = { "title": self._parse_title(), "authors": self._parse_authors(), @@ -385,7 +395,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): title_node = name_node[0] break citation["title"] = ( - PubMedDocumentBackend._get_text(title_node) + JatsDocumentBackend._get_text(title_node) if title_node is not None else node.text.replace("\n", " ").strip() ) @@ -415,7 +425,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): id_text = id_node.text if id_type and id_text: pub_id.append( - f"{id_type.replace("\n", " ").strip().upper()}: {id_text.replace("\n", " ").strip()}" + id_type.replace("\n", " ").strip().upper() + + ": " + + id_text.replace("\n", " 
").strip() ) if pub_id: citation["pub_id"] = ", ".join(pub_id) @@ -428,9 +440,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): elif len(node.xpath("fpage")) > 0: citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip() if len(node.xpath("lpage")) > 0: - citation[ - "page" - ] += f"–{node.xpath('lpage')[0].text.replace("\n", " ").strip()}" + citation["page"] += ( + "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() + ) # Flatten the citation to string @@ -447,7 +459,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): text += citation["publisher_name"] + ". " if citation["volume"]: text = text.rstrip(". ") - text += f" {citation["volume"]}. " + text += f" {citation['volume']}. " if citation["page"]: text = text.rstrip(". ") if citation["volume"]: @@ -480,7 +492,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): ) -> None: label_node = node.xpath("label") label: Optional[str] = ( - PubMedDocumentBackend._get_text(label_node[0]).strip() if label_node else "" + JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else "" ) caption_node = node.xpath("caption") @@ -490,7 +502,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): for caption_par in list(caption_node[0]): if caption_par.xpath(".//supplementary-material"): continue - caption += PubMedDocumentBackend._get_text(caption_par).strip() + " " + caption += JatsDocumentBackend._get_text(caption_par).strip() + " " caption = caption.strip() else: caption = None @@ -511,7 +523,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None: # new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent) # for child in node.iterchildren(tag="fn"): - # text = PubMedDocumentBackend._get_text(child) + # text = JatsDocumentBackend._get_text(child) # doc.add_list_item(text=text, parent=new_parent) def _add_metadata( @@ -631,7 
+643,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): for caption_par in list(caption_node[0]): if caption_par.xpath(".//supplementary-material"): continue - caption += PubMedDocumentBackend._get_text(caption_par).strip() + " " + caption += JatsDocumentBackend._get_text(caption_par).strip() + " " caption = caption.strip() else: caption = None @@ -686,7 +698,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): header = child.xpath("title|label") text: Optional[str] = None if len(header) > 0: - text = PubMedDocumentBackend._get_text(header[0]) + text = JatsDocumentBackend._get_text(header[0]) elif child.tag == "ack": text = DEFAULT_HEADER_ACKNOWLEDGMENTS if text: @@ -698,7 +710,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): elif child.tag == "list-item": # TODO: address any type of content (another list, formula,...) # TODO: address list type and item label - text = PubMedDocumentBackend._get_text(child).strip() + text = JatsDocumentBackend._get_text(child).strip() new_parent = doc.add_list_item(text=text, parent=parent) stop_walk = True elif child.tag == "fig": @@ -712,14 +724,14 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): elif child.tag == "fn-group": # header = child.xpath(".//title") or child.xpath(".//label") # if header: - # text = PubMedDocumentBackend._get_text(header[0]) + # text = JatsDocumentBackend._get_text(header[0]) # fn_parent = doc.add_heading(text=text, parent=new_parent) # self._add_footnote_group(doc, fn_parent, child) stop_walk = True elif child.tag == "ref-list" and node.tag != "ref-list": header = child.xpath("title|label") text = ( - PubMedDocumentBackend._get_text(header[0]) + JatsDocumentBackend._get_text(header[0]) if len(header) > 0 else DEFAULT_HEADER_REFERENCES ) @@ -732,7 +744,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): self._add_citation(doc, parent, text) stop_walk = True elif child.tag == "mixed-citation": - text = PubMedDocumentBackend._get_text(child).strip() + 
text = JatsDocumentBackend._get_text(child).strip() self._add_citation(doc, parent, text) stop_walk = True elif child.tag == "tex-math": diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index a2b9428b..da512aa1 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -34,7 +34,6 @@ class InputFormat(str, Enum): DOCX = "docx" PPTX = "pptx" HTML = "html" - XML_PUBMED = "xml_pubmed" IMAGE = "image" PDF = "pdf" ASCIIDOC = "asciidoc" @@ -42,6 +41,7 @@ class InputFormat(str, Enum): CSV = "csv" XLSX = "xlsx" XML_USPTO = "xml_uspto" + XML_JATS = "xml_jats" JSON_DOCLING = "json_docling" @@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.PDF: ["pdf"], InputFormat.MD: ["md"], InputFormat.HTML: ["html", "htm", "xhtml"], - InputFormat.XML_PUBMED: ["xml", "nxml"], + InputFormat.XML_JATS: ["xml", "nxml"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.CSV: ["csv"], @@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { "application/vnd.openxmlformats-officedocument.presentationml.presentation", ], InputFormat.HTML: ["text/html", "application/xhtml+xml"], - InputFormat.XML_PUBMED: ["application/xml"], + InputFormat.XML_JATS: ["application/xml"], InputFormat.IMAGE: [ "image/png", "image/jpeg", diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index cd460ce8..43894b07 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel): ): input_format = InputFormat.XML_USPTO - if InputFormat.XML_PUBMED in formats and ( + if InputFormat.XML_JATS in formats and ( "JATS-journalpublishing" in xml_doctype or "JATS-archive" in xml_doctype ): - input_format = InputFormat.XML_PUBMED + input_format = InputFormat.XML_JATS elif mime == "text/plain": if InputFormat.XML_USPTO in formats and 
content_str.startswith("PATN\r\n"): diff --git a/docling/document_converter.py b/docling/document_converter.py index 27f31acb..d52efcea 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend -from docling.backend.xml.pubmed_backend import PubMedDocumentBackend +from docling.backend.xml.jats_backend import JatsDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, @@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption): backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend -class XMLPubMedFormatOption(FormatOption): +class XMLJatsFormatOption(FormatOption): pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend + backend: Type[AbstractDocumentBackend] = JatsDocumentBackend class ImageFormatOption(FormatOption): @@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.XML_USPTO: FormatOption( pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend ), - InputFormat.XML_PUBMED: FormatOption( - pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend + InputFormat.XML_JATS: FormatOption( + pipeline_cls=SimplePipeline, backend=JatsDocumentBackend ), InputFormat.IMAGE: FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb index 78c603c8..0b2227f4 100644 --- a/docs/examples/backend_xml_rag.ipynb +++ b/docs/examples/backend_xml_rag.ipynb @@ -82,7 +82,7 @@ "from docling.document_converter import DocumentConverter\n", "\n", "# a sample PMC 
article:\n", - "source = \"../../tests/data/pubmed/elife-56337.nxml\"\n", + "source = \"../../tests/data/jats/elife-56337.nxml\"\n", "converter = DocumentConverter()\n", "result = converter.convert(source)\n", "print(result.status)" @@ -97,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -106,11 +106,11 @@ "text": [ "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n", "\n", - "Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National 
Institutes of Health: Bethesda: United States\n", + "Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n", + "\n", + "The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n", "\n", "## Abstract\n", - "\n", - "The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. 
Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n", "\n" ] } @@ -131,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -198,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -224,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -261,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -313,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -359,9 +359,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n", + "Parsing zip file, splitting into XML sections, and exporting to files...\n" + ] + } + ], "source": [ "import zipfile\n", "\n", @@ -407,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -435,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -449,7 +458,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3964d1ff30f74588a2f6b53ca8865a9f", + "model_id": "316241ca89a843bda3170f2a5c76c639", "version_major": 2, "version_minor": 0 }, @@ -471,7 +480,7 @@ "source": [ "from tqdm.notebook import tqdm\n", "\n", - "from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n", + "from docling.backend.xml.jats_backend import 
JatsDocumentBackend\n", "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n", "from docling.datamodel.base_models import InputFormat\n", "from docling.datamodel.document import InputDocument\n", @@ -479,10 +488,10 @@ "# check PMC\n", "in_doc = InputDocument(\n", " path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n", - " format=InputFormat.XML_PUBMED,\n", - " backend=PubMedDocumentBackend,\n", + " format=InputFormat.XML_JATS,\n", + " backend=JatsDocumentBackend,\n", ")\n", - "backend = PubMedDocumentBackend(\n", + "backend = JatsDocumentBackend(\n", " in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n", ")\n", "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n", @@ -521,7 +530,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -543,7 +552,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files." + "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files." 
] }, { @@ -579,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -607,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -625,144 +634,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n", - "Loading files: 51%|█████ | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e9208639f1a4418d97267a28305d18fa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Parsing nodes: 0%| | 0/99 [00:00