mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
chore(xml-jats): rename PubMed objects to JATS
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
011dd6ce96
commit
93eb9de871
@ -66,11 +66,21 @@ class XMLComponents(TypedDict):
|
||||
abstract: list[Abstract]
|
||||
|
||||
|
||||
class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
"""
|
||||
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
|
||||
class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
"""Backend to parse articles in XML format tagged according to JATS definition.
|
||||
|
||||
The Journal Article Tag Suite (JATS) is a definition standard for the
|
||||
representation of journal articles in XML format. Several publishers and journal
|
||||
archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
|
||||
medRxiv, and Springer Nature.
|
||||
|
||||
Refer to https://jats.nlm.nih.gov for more details on JATS.
|
||||
|
||||
The code from this document backend has been developed by modifying parts of the
|
||||
PubMed Parser library (version 0.5.0, released on 12.08.2024):
|
||||
Achakulvisut et al., (2020).
|
||||
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
|
||||
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
|
||||
Dataset XML Dataset.
|
||||
Journal of Open Source Software, 5(46), 1979,
|
||||
https://doi.org/10.21105/joss.01979
|
||||
"""
|
||||
@ -105,7 +115,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize PubMed backend for file with hash {self.document_hash}."
|
||||
f"Could not initialize JATS backend for file with hash {self.document_hash}."
|
||||
) from exc
|
||||
|
||||
@override
|
||||
@ -126,7 +136,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
@classmethod
|
||||
@override
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.XML_PUBMED}
|
||||
return {InputFormat.XML_JATS}
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
@ -170,7 +180,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
for child in list(node):
|
||||
if child.tag not in skip_tags:
|
||||
# TODO: apply styling according to child.tag when supported by docling-core
|
||||
text += PubMedDocumentBackend._get_text(child, sep)
|
||||
text += JatsDocumentBackend._get_text(child, sep)
|
||||
if sep:
|
||||
text = text.rstrip(sep) + sep
|
||||
text += child.tail.replace("\n", " ") if child.tail else ""
|
||||
@ -196,7 +206,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
abstract: Abstract = dict(label="", content="")
|
||||
texts = []
|
||||
for abs_par in abs_node.xpath("p"):
|
||||
texts.append(PubMedDocumentBackend._get_text(abs_par).strip())
|
||||
texts.append(JatsDocumentBackend._get_text(abs_par).strip())
|
||||
abstract["content"] = " ".join(texts)
|
||||
|
||||
label_node = abs_node.xpath("title|label")
|
||||
@ -280,7 +290,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
return text
|
||||
|
||||
def _parse_metadata(self) -> XMLComponents:
|
||||
"""Parsing PubMed document metadata."""
|
||||
"""Parsing JATS document metadata."""
|
||||
xml_components: XMLComponents = {
|
||||
"title": self._parse_title(),
|
||||
"authors": self._parse_authors(),
|
||||
@ -385,7 +395,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
title_node = name_node[0]
|
||||
break
|
||||
citation["title"] = (
|
||||
PubMedDocumentBackend._get_text(title_node)
|
||||
JatsDocumentBackend._get_text(title_node)
|
||||
if title_node is not None
|
||||
else node.text.replace("\n", " ").strip()
|
||||
)
|
||||
@ -415,7 +425,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
id_text = id_node.text
|
||||
if id_type and id_text:
|
||||
pub_id.append(
|
||||
f"{id_type.replace("\n", " ").strip().upper()}: {id_text.replace("\n", " ").strip()}"
|
||||
id_type.replace("\n", " ").strip().upper()
|
||||
+ ": "
|
||||
+ id_text.replace("\n", " ").strip()
|
||||
)
|
||||
if pub_id:
|
||||
citation["pub_id"] = ", ".join(pub_id)
|
||||
@ -428,9 +440,9 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif len(node.xpath("fpage")) > 0:
|
||||
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
||||
if len(node.xpath("lpage")) > 0:
|
||||
citation[
|
||||
"page"
|
||||
] += f"–{node.xpath('lpage')[0].text.replace("\n", " ").strip()}"
|
||||
citation["page"] += (
|
||||
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
|
||||
)
|
||||
|
||||
# Flatten the citation to string
|
||||
|
||||
@ -447,7 +459,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
text += citation["publisher_name"] + ". "
|
||||
if citation["volume"]:
|
||||
text = text.rstrip(". ")
|
||||
text += f" {citation["volume"]}. "
|
||||
text += f" {citation['volume']}. "
|
||||
if citation["page"]:
|
||||
text = text.rstrip(". ")
|
||||
if citation["volume"]:
|
||||
@ -480,7 +492,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
) -> None:
|
||||
label_node = node.xpath("label")
|
||||
label: Optional[str] = (
|
||||
PubMedDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
|
||||
JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
|
||||
)
|
||||
|
||||
caption_node = node.xpath("caption")
|
||||
@ -490,7 +502,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
for caption_par in list(caption_node[0]):
|
||||
if caption_par.xpath(".//supplementary-material"):
|
||||
continue
|
||||
caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
|
||||
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
|
||||
caption = caption.strip()
|
||||
else:
|
||||
caption = None
|
||||
@ -511,7 +523,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
# def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
|
||||
# new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
|
||||
# for child in node.iterchildren(tag="fn"):
|
||||
# text = PubMedDocumentBackend._get_text(child)
|
||||
# text = JatsDocumentBackend._get_text(child)
|
||||
# doc.add_list_item(text=text, parent=new_parent)
|
||||
|
||||
def _add_metadata(
|
||||
@ -631,7 +643,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
for caption_par in list(caption_node[0]):
|
||||
if caption_par.xpath(".//supplementary-material"):
|
||||
continue
|
||||
caption += PubMedDocumentBackend._get_text(caption_par).strip() + " "
|
||||
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
|
||||
caption = caption.strip()
|
||||
else:
|
||||
caption = None
|
||||
@ -686,7 +698,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
header = child.xpath("title|label")
|
||||
text: Optional[str] = None
|
||||
if len(header) > 0:
|
||||
text = PubMedDocumentBackend._get_text(header[0])
|
||||
text = JatsDocumentBackend._get_text(header[0])
|
||||
elif child.tag == "ack":
|
||||
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
|
||||
if text:
|
||||
@ -698,7 +710,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif child.tag == "list-item":
|
||||
# TODO: address any type of content (another list, formula,...)
|
||||
# TODO: address list type and item label
|
||||
text = PubMedDocumentBackend._get_text(child).strip()
|
||||
text = JatsDocumentBackend._get_text(child).strip()
|
||||
new_parent = doc.add_list_item(text=text, parent=parent)
|
||||
stop_walk = True
|
||||
elif child.tag == "fig":
|
||||
@ -712,14 +724,14 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif child.tag == "fn-group":
|
||||
# header = child.xpath(".//title") or child.xpath(".//label")
|
||||
# if header:
|
||||
# text = PubMedDocumentBackend._get_text(header[0])
|
||||
# text = JatsDocumentBackend._get_text(header[0])
|
||||
# fn_parent = doc.add_heading(text=text, parent=new_parent)
|
||||
# self._add_footnote_group(doc, fn_parent, child)
|
||||
stop_walk = True
|
||||
elif child.tag == "ref-list" and node.tag != "ref-list":
|
||||
header = child.xpath("title|label")
|
||||
text = (
|
||||
PubMedDocumentBackend._get_text(header[0])
|
||||
JatsDocumentBackend._get_text(header[0])
|
||||
if len(header) > 0
|
||||
else DEFAULT_HEADER_REFERENCES
|
||||
)
|
||||
@ -732,7 +744,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._add_citation(doc, parent, text)
|
||||
stop_walk = True
|
||||
elif child.tag == "mixed-citation":
|
||||
text = PubMedDocumentBackend._get_text(child).strip()
|
||||
text = JatsDocumentBackend._get_text(child).strip()
|
||||
self._add_citation(doc, parent, text)
|
||||
stop_walk = True
|
||||
elif child.tag == "tex-math":
|
@ -34,7 +34,6 @@ class InputFormat(str, Enum):
|
||||
DOCX = "docx"
|
||||
PPTX = "pptx"
|
||||
HTML = "html"
|
||||
XML_PUBMED = "xml_pubmed"
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
ASCIIDOC = "asciidoc"
|
||||
@ -42,6 +41,7 @@ class InputFormat(str, Enum):
|
||||
CSV = "csv"
|
||||
XLSX = "xlsx"
|
||||
XML_USPTO = "xml_uspto"
|
||||
XML_JATS = "xml_jats"
|
||||
JSON_DOCLING = "json_docling"
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.PDF: ["pdf"],
|
||||
InputFormat.MD: ["md"],
|
||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||
InputFormat.XML_PUBMED: ["xml", "nxml"],
|
||||
InputFormat.XML_JATS: ["xml", "nxml"],
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
InputFormat.CSV: ["csv"],
|
||||
@ -79,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
],
|
||||
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
||||
InputFormat.XML_PUBMED: ["application/xml"],
|
||||
InputFormat.XML_JATS: ["application/xml"],
|
||||
InputFormat.IMAGE: [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
|
@ -333,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
|
||||
):
|
||||
input_format = InputFormat.XML_USPTO
|
||||
|
||||
if InputFormat.XML_PUBMED in formats and (
|
||||
if InputFormat.XML_JATS in formats and (
|
||||
"JATS-journalpublishing" in xml_doctype
|
||||
or "JATS-archive" in xml_doctype
|
||||
):
|
||||
input_format = InputFormat.XML_PUBMED
|
||||
input_format = InputFormat.XML_JATS
|
||||
|
||||
elif mime == "text/plain":
|
||||
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
||||
|
@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
||||
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
|
||||
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
||||
|
||||
|
||||
class XMLPubMedFormatOption(FormatOption):
|
||||
class XMLJatsFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
|
||||
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.XML_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
),
|
||||
InputFormat.XML_PUBMED: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
|
||||
InputFormat.XML_JATS: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
|
@ -82,7 +82,7 @@
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# a sample PMC article:\n",
|
||||
"source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
|
||||
"source = \"../../tests/data/jats/elife-56337.nxml\"\n",
|
||||
"converter = DocumentConverter()\n",
|
||||
"result = converter.convert(source)\n",
|
||||
"print(result.status)"
|
||||
@ -97,7 +97,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -106,11 +106,11 @@
|
||||
"text": [
|
||||
"# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
|
||||
"\n",
|
||||
"Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
|
||||
"Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
|
||||
"\n",
|
||||
"The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
|
||||
"\n",
|
||||
"## Abstract\n",
|
||||
"\n",
|
||||
"The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
@ -131,7 +131,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -198,7 +198,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -224,7 +224,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -261,7 +261,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -313,7 +313,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -359,9 +359,18 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
|
||||
"Parsing zip file, splitting into XML sections, and exporting to files...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import zipfile\n",
|
||||
"\n",
|
||||
@ -407,7 +416,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -435,7 +444,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -449,7 +458,7 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
|
||||
"model_id": "316241ca89a843bda3170f2a5c76c639",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@ -471,7 +480,7 @@
|
||||
"source": [
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"\n",
|
||||
"from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
|
||||
"from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
|
||||
"from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
|
||||
"from docling.datamodel.base_models import InputFormat\n",
|
||||
"from docling.datamodel.document import InputDocument\n",
|
||||
@ -479,10 +488,10 @@
|
||||
"# check PMC\n",
|
||||
"in_doc = InputDocument(\n",
|
||||
" path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
|
||||
" format=InputFormat.XML_PUBMED,\n",
|
||||
" backend=PubMedDocumentBackend,\n",
|
||||
" format=InputFormat.XML_JATS,\n",
|
||||
" backend=JatsDocumentBackend,\n",
|
||||
")\n",
|
||||
"backend = PubMedDocumentBackend(\n",
|
||||
"backend = JatsDocumentBackend(\n",
|
||||
" in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
|
||||
")\n",
|
||||
"print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
|
||||
@ -521,7 +530,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -543,7 +552,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
|
||||
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -579,7 +588,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -607,7 +616,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -625,144 +634,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
|
||||
"Loading files: 51%|█████ | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e9208639f1a4418d97267a28305d18fa",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Parsing nodes: 0%| | 0/99 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "88026613f6f44f0c8476dceaa1cb78cd",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7522b8b434b54616b4cfc3d71e9556d7",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5879d8161c2041f5b100959e69ff9017",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "557912b5e3c741f3a06127156bc46379",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "843bb145942b449aa55fc5b8208da734",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c7dba09a4aed422998e9b9c2c3a70317",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/425 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import StorageContext, VectorStoreIndex\n",
|
||||
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
||||
|
@ -21,7 +21,7 @@ Schema-specific support:
|
||||
| Format | Description |
|
||||
|--------|-------------|
|
||||
| USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
|
||||
| PMC XML | XML format followed by [PubMed Central®](https://pmc.ncbi.nlm.nih.gov/) articles |
|
||||
| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
|
||||
| Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |
|
||||
|
||||
## Supported output formats
|
||||
|
@ -19,7 +19,7 @@ def get_pubmed_paths():
|
||||
|
||||
|
||||
def get_converter():
|
||||
converter = DocumentConverter(allowed_formats=[InputFormat.XML_PUBMED])
|
||||
converter = DocumentConverter(allowed_formats=[InputFormat.XML_JATS])
|
||||
return converter
|
||||
|
||||
|
@ -130,24 +130,24 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
|
||||
|
||||
# Valid XML PubMed
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
|
||||
# Valid XML JATS
|
||||
buf = BytesIO(Path("./tests/data/jats/elife-56337.xml").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.xml", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.xml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
assert dci._guess_format(stream) == InputFormat.XML_JATS
|
||||
doc_path = Path("./tests/data/jats/elife-56337.xml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
|
||||
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
|
||||
buf = BytesIO(Path("./tests/data/jats/elife-56337.nxml").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
assert dci._guess_format(stream) == InputFormat.XML_JATS
|
||||
doc_path = Path("./tests/data/jats/elife-56337.nxml")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
|
||||
|
||||
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
|
||||
buf = BytesIO(Path("./tests/data/jats/elife-56337.txt").open("rb").read())
|
||||
stream = DocumentStream(name="elife-56337.txt", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
|
||||
doc_path = Path("./tests/data/pubmed/elife-56337.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
|
||||
assert dci._guess_format(stream) == InputFormat.XML_JATS
|
||||
doc_path = Path("./tests/data/jats/elife-56337.txt")
|
||||
assert dci._guess_format(doc_path) == InputFormat.XML_JATS
|
||||
|
||||
# Valid XML, non-supported flavor
|
||||
xml_content = (
|
||||
|
Loading…
Reference in New Issue
Block a user