chore(xml-jats): rename PubMed objects to JATS

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-02-14 14:59:57 +01:00
parent 011dd6ce96
commit 93eb9de871
19 changed files with 95 additions and 209 deletions

View File

@ -66,11 +66,21 @@ class XMLComponents(TypedDict):
abstract: list[Abstract]
class PubMedDocumentBackend(DeclarativeDocumentBackend):
"""
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
class JatsDocumentBackend(DeclarativeDocumentBackend):
"""Backend to parse articles in XML format tagged according to JATS definition.
The Journal Article Tag Suite (JATS) is a definition standard for the
representation of journal articles in XML format. Several publishers and journal
archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
medRxiv, or Springer Nature.
Refer to https://jats.nlm.nih.gov for more details on JATS.
The code from this document backend has been developed by modifying parts of the
PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
Dataset XML Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@ -105,7 +115,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
return
except Exception as exc:
raise RuntimeError(
f"Could not initialize PubMed backend for file with hash {self.document_hash}."
f"Could not initialize JATS backend for file with hash {self.document_hash}."
) from exc
@override
@ -126,7 +136,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.XML_PUBMED}
return {InputFormat.XML_JATS}
@override
def convert(self) -> DoclingDocument:
@ -170,7 +180,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
for child in list(node):
if child.tag not in skip_tags:
# TODO: apply styling according to child.tag when supported by docling-core
text += PubMedDocumentBackend._get_text(child, sep)
text += JatsDocumentBackend._get_text(child, sep)
if sep:
text = text.rstrip(sep) + sep
text += child.tail.replace("\n", " ") if child.tail else ""
@ -196,7 +206,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
abstract: Abstract = dict(label="", content="")
texts = []
for abs_par in abs_node.xpath("p"):
texts.append(PubMedDocumentBackend._get_text(abs_par).strip())
texts.append(JatsDocumentBackend._get_text(abs_par).strip())
abstract["content"] = " ".join(texts)
label_node = abs_node.xpath("title|label")
@ -280,7 +290,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
return text
def _parse_metadata(self) -> XMLComponents:
"""Parsing PubMed document metadata."""
"""Parsing JATS document metadata."""
xml_components: XMLComponents = {
"title": self._parse_title(),
"authors": self._parse_authors(),
@ -385,7 +395,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
title_node = name_node[0]
break
citation["title"] = (
PubMedDocumentBackend._get_text(title_node)
JatsDocumentBackend._get_text(title_node)
if title_node is not None
else node.text.replace("\n", " ").strip()
)
@ -415,7 +425,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
id_text = id_node.text
if id_type and id_text:
pub_id.append(
f"{id_type.replace("\n", " ").strip().upper()}: {id_text.replace("\n", " ").strip()}"
id_type.replace("\n", " ").strip().upper()
+ ": "
+ id_text.replace("\n", " ").strip()
)
if pub_id:
citation["pub_id"] = ", ".join(pub_id)
@ -428,9 +440,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
elif len(node.xpath("fpage")) > 0:
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation[
"page"
] += f"{node.xpath('lpage')[0].text.replace("\n", " ").strip()}"
citation["page"] += (
"" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
)
# Flatten the citation to string
@ -447,7 +459,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
text += citation["publisher_name"] + ". "
if citation["volume"]:
text = text.rstrip(". ")
text += f" {citation["volume"]}. "
text += f" {citation['volume']}. "
if citation["page"]:
text = text.rstrip(". ")
if citation["volume"]:
@ -480,7 +492,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
) -> None:
label_node = node.xpath("label")
label: Optional[str] = (
PubMedDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
)
caption_node = node.xpath("caption")
@ -490,7 +502,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"):
continue
caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip()
else:
caption = None
@ -511,7 +523,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
# def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
# new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
# for child in node.iterchildren(tag="fn"):
# text = PubMedDocumentBackend._get_text(child)
# text = JatsDocumentBackend._get_text(child)
# doc.add_list_item(text=text, parent=new_parent)
def _add_metadata(
@ -631,7 +643,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"):
continue
caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip()
else:
caption = None
@ -686,7 +698,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
header = child.xpath("title|label")
text: Optional[str] = None
if len(header) > 0:
text = PubMedDocumentBackend._get_text(header[0])
text = JatsDocumentBackend._get_text(header[0])
elif child.tag == "ack":
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
if text:
@ -698,7 +710,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
elif child.tag == "list-item":
# TODO: address any type of content (another list, formula,...)
# TODO: address list type and item label
text = PubMedDocumentBackend._get_text(child).strip()
text = JatsDocumentBackend._get_text(child).strip()
new_parent = doc.add_list_item(text=text, parent=parent)
stop_walk = True
elif child.tag == "fig":
@ -712,14 +724,14 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
elif child.tag == "fn-group":
# header = child.xpath(".//title") or child.xpath(".//label")
# if header:
# text = PubMedDocumentBackend._get_text(header[0])
# text = JatsDocumentBackend._get_text(header[0])
# fn_parent = doc.add_heading(text=text, parent=new_parent)
# self._add_footnote_group(doc, fn_parent, child)
stop_walk = True
elif child.tag == "ref-list" and node.tag != "ref-list":
header = child.xpath("title|label")
text = (
PubMedDocumentBackend._get_text(header[0])
JatsDocumentBackend._get_text(header[0])
if len(header) > 0
else DEFAULT_HEADER_REFERENCES
)
@ -732,7 +744,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
self._add_citation(doc, parent, text)
stop_walk = True
elif child.tag == "mixed-citation":
text = PubMedDocumentBackend._get_text(child).strip()
text = JatsDocumentBackend._get_text(child).strip()
self._add_citation(doc, parent, text)
stop_walk = True
elif child.tag == "tex-math":

View File

@ -34,7 +34,6 @@ class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
@ -42,6 +41,7 @@ class InputFormat(str, Enum):
CSV = "csv"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.XML_JATS: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"],
InputFormat.XML_JATS: ["application/xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",

View File

@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
):
input_format = InputFormat.XML_USPTO
if InputFormat.XML_PUBMED in formats and (
if InputFormat.XML_JATS in formats and (
"JATS-journalpublishing" in xml_doctype
or "JATS-archive" in xml_doctype
):
input_format = InputFormat.XML_PUBMED
input_format = InputFormat.XML_JATS
elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):

View File

@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class XMLPubMedFormatOption(FormatOption):
class XMLJatsFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
class ImageFormatOption(FormatOption):
@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
InputFormat.XML_JATS: FormatOption(
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend

View File

@ -82,7 +82,7 @@
"from docling.document_converter import DocumentConverter\n",
"\n",
"# a sample PMC article:\n",
"source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
"source = \"../../tests/data/jats/elife-56337.nxml\"\n",
"converter = DocumentConverter()\n",
"result = converter.convert(source)\n",
"print(result.status)"
@ -97,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -106,11 +106,11 @@
"text": [
"# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
"\n",
"Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
"Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
"\n",
"The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
"\n",
"## Abstract\n",
"\n",
"The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
"\n"
]
}
@ -131,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -198,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -224,7 +224,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -261,7 +261,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@ -313,7 +313,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -359,9 +359,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
"Parsing zip file, splitting into XML sections, and exporting to files...\n"
]
}
],
"source": [
"import zipfile\n",
"\n",
@ -407,7 +416,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -435,7 +444,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -449,7 +458,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
"model_id": "316241ca89a843bda3170f2a5c76c639",
"version_major": 2,
"version_minor": 0
},
@ -471,7 +480,7 @@
"source": [
"from tqdm.notebook import tqdm\n",
"\n",
"from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
"from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
"from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
"from docling.datamodel.base_models import InputFormat\n",
"from docling.datamodel.document import InputDocument\n",
@ -479,10 +488,10 @@
"# check PMC\n",
"in_doc = InputDocument(\n",
" path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
" format=InputFormat.XML_PUBMED,\n",
" backend=PubMedDocumentBackend,\n",
" format=InputFormat.XML_JATS,\n",
" backend=JatsDocumentBackend,\n",
")\n",
"backend = PubMedDocumentBackend(\n",
"backend = JatsDocumentBackend(\n",
" in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
")\n",
"print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
@ -521,7 +530,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -543,7 +552,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
]
},
{
@ -579,7 +588,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -607,7 +616,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -625,144 +634,9 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
"Loading files: 51%|█████ | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e9208639f1a4418d97267a28305d18fa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Parsing nodes: 0%| | 0/99 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "88026613f6f44f0c8476dceaa1cb78cd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7522b8b434b54616b4cfc3d71e9556d7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5879d8161c2041f5b100959e69ff9017",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "557912b5e3c741f3a06127156bc46379",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "843bb145942b449aa55fc5b8208da734",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c7dba09a4aed422998e9b9c2c3a70317",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating embeddings: 0%| | 0/425 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from llama_index.core import StorageContext, VectorStoreIndex\n",
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",

View File

@ -21,7 +21,7 @@ Schema-specific support:
| Format | Description |
|--------|-------------|
| USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
| PMC XML | XML format followed by [PubMed Central®](https://pmc.ncbi.nlm.nih.gov/) articles |
| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
| Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |
## Supported output formats

View File

@ -19,7 +19,7 @@ def get_pubmed_paths():
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.XML_PUBMED])
converter = DocumentConverter(allowed_formats=[InputFormat.XML_JATS])
return converter

View File

@ -130,24 +130,24 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML PubMed
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
# Valid XML JATS
buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
assert dci._guess_format(stream) == InputFormat.XML_JATS
doc_path = Path("./tests/data/jats/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
# Valid XML, non-supported flavor
xml_content = (