mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
Updates for DoclingParseV3DocumentBackend
Some checks failed
Run CI / code-checks (push) Failing after 7m55s
Run Docs CI / build-docs (push) Failing after 13m31s
Some checks failed
Run CI / code-checks (push) Failing after 7m55s
Run Docs CI / build-docs (push) Failing after 13m31s
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
80
docs/examples/backend_csv.ipynb
Normal file
80
docs/examples/backend_csv.ipynb
Normal file
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Conversion of CSV files\n",
|
||||
"\n",
|
||||
"This example shows how to convert CSV files to a structured Docling Document.\n",
|
||||
"\n",
|
||||
"* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
|
||||
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Example Code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# Convert CSV to Docling document\n",
|
||||
"converter = DocumentConverter()\n",
|
||||
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
|
||||
"output = result.document.export_to_markdown()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This code generates the following output:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n",
|
||||
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
|
||||
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |\n",
|
||||
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |\n",
|
||||
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |\n",
|
||||
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |\n",
|
||||
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "docling-TtEIaPrw-py3.12",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -82,7 +82,7 @@
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# a sample PMC article:\n",
|
||||
"source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
|
||||
"source = \"../../tests/data/jats/elife-56337.nxml\"\n",
|
||||
"converter = DocumentConverter()\n",
|
||||
"result = converter.convert(source)\n",
|
||||
"print(result.status)"
|
||||
@@ -97,7 +97,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -106,11 +106,11 @@
|
||||
"text": [
|
||||
"# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
|
||||
"\n",
|
||||
"Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
|
||||
"Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
|
||||
"\n",
|
||||
"The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
|
||||
"\n",
|
||||
"## Abstract\n",
|
||||
"\n",
|
||||
"The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
@@ -131,7 +131,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -198,7 +198,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -224,7 +224,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -261,7 +261,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -313,7 +313,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -359,9 +359,18 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
|
||||
"Parsing zip file, splitting into XML sections, and exporting to files...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import zipfile\n",
|
||||
"\n",
|
||||
@@ -407,7 +416,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -435,7 +444,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -449,7 +458,7 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
|
||||
"model_id": "316241ca89a843bda3170f2a5c76c639",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@@ -471,7 +480,7 @@
|
||||
"source": [
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"\n",
|
||||
"from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
|
||||
"from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
|
||||
"from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
|
||||
"from docling.datamodel.base_models import InputFormat\n",
|
||||
"from docling.datamodel.document import InputDocument\n",
|
||||
@@ -479,10 +488,10 @@
|
||||
"# check PMC\n",
|
||||
"in_doc = InputDocument(\n",
|
||||
" path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
|
||||
" format=InputFormat.XML_PUBMED,\n",
|
||||
" backend=PubMedDocumentBackend,\n",
|
||||
" format=InputFormat.XML_JATS,\n",
|
||||
" backend=JatsDocumentBackend,\n",
|
||||
")\n",
|
||||
"backend = PubMedDocumentBackend(\n",
|
||||
"backend = JatsDocumentBackend(\n",
|
||||
" in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
|
||||
")\n",
|
||||
"print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
|
||||
@@ -521,7 +530,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -543,7 +552,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
|
||||
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -579,7 +588,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -607,7 +616,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -625,144 +634,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
|
||||
"Loading files: 51%|█████ | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e9208639f1a4418d97267a28305d18fa",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Parsing nodes: 0%| | 0/99 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "88026613f6f44f0c8476dceaa1cb78cd",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7522b8b434b54616b4cfc3d71e9556d7",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5879d8161c2041f5b100959e69ff9017",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "557912b5e3c741f3a06127156bc46379",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "843bb145942b449aa55fc5b8208da734",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c7dba09a4aed422998e9b9c2c3a70317",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/2048 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating embeddings: 0%| | 0/425 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import StorageContext, VectorStoreIndex\n",
|
||||
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
|
||||
|
||||
@@ -5,17 +5,19 @@ from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import yaml
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
USE_V2 = True
|
||||
USE_LEGACY = True
|
||||
USE_LEGACY = False
|
||||
|
||||
|
||||
def export_documents(
|
||||
@@ -34,9 +36,26 @@ def export_documents(
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
if USE_V2:
|
||||
# Export Docling document format to JSON:
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
||||
conv_res.document.save_as_json(
|
||||
output_dir / f"{doc_filename}.json",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
conv_res.document.save_as_html(
|
||||
output_dir / f"{doc_filename}.html",
|
||||
image_mode=ImageRefMode.EMBEDDED,
|
||||
)
|
||||
conv_res.document.save_as_document_tokens(
|
||||
output_dir / f"{doc_filename}.doctags.txt"
|
||||
)
|
||||
conv_res.document.save_as_markdown(
|
||||
output_dir / f"{doc_filename}.md",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
conv_res.document.save_as_markdown(
|
||||
output_dir / f"{doc_filename}.txt",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
strict_text=True,
|
||||
)
|
||||
|
||||
# Export Docling document format to YAML:
|
||||
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
|
||||
@@ -104,11 +123,10 @@ def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("tests/data/redp5110_sampled.pdf"),
|
||||
# Path("./tests/data/2206.01062.pdf"),
|
||||
# Path("./tests/data/2203.01017v2.pdf"),
|
||||
# Path("./tests/data/2305.03393v1.pdf"),
|
||||
# Path("./tests/data/redp5110_sampled.pdf"),
|
||||
Path("./tests/data/pdf/2206.01062.pdf"),
|
||||
Path("./tests/data/pdf/2203.01017v2.pdf"),
|
||||
Path("./tests/data/pdf/2305.03393v1.pdf"),
|
||||
Path("./tests/data/pdf/redp5110_sampled.pdf"),
|
||||
]
|
||||
|
||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||
@@ -121,9 +139,14 @@ def main():
|
||||
# settings.debug.visualize_tables = True
|
||||
# settings.debug.visualize_cells = True
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.generate_page_images = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(backend=DoclingParseV3DocumentBackend)
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
|
||||
###########################################################################
|
||||
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# WARNING
|
||||
# This example demonstrates only how to develop a new enrichment model.
|
||||
# It does not run the actual formula understanding model.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
@@ -68,7 +72,7 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2203.01017v2.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
|
||||
|
||||
pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
|
||||
pipeline_options.do_formula_understanding = True
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# WARNING
|
||||
# This example demonstrates only how to develop a new enrichment model.
|
||||
# It does not run the actual picture classifier model.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
@@ -71,7 +75,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
|
||||
pipeline_options = ExamplePictureClassifierPipelineOptions()
|
||||
pipeline_options.images_scale = 2.0
|
||||
|
||||
@@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
|
||||
@@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
|
||||
@@ -12,7 +12,7 @@ _log = logging.getLogger(__name__)
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
output_dir = Path("scratch")
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
|
||||
@@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
input_doc = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
|
||||
@@ -83,7 +83,15 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from docling.chunking import HybridChunker\n",
|
||||
"\n",
|
||||
@@ -91,6 +99,13 @@
|
||||
"chunk_iter = chunker.chunk(dl_doc=doc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -337,11 +352,11 @@
|
||||
"source": [
|
||||
"for i, chunk in enumerate(chunks):\n",
|
||||
" print(f\"=== {i} ===\")\n",
|
||||
" txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
|
||||
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
|
||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
|
||||
"\n",
|
||||
" ser_txt = chunker.serialize(chunk=chunk)\n",
|
||||
" ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
|
||||
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
|
||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
|
||||
"\n",
|
||||
" print()"
|
||||
|
||||
@@ -4,7 +4,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
source = "tests/data/amt_handbook_sample.pdf"
|
||||
source = "tests/data/pdf/amt_handbook_sample.pdf"
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.images_scale = 2
|
||||
|
||||
96
docs/examples/minimal_vlm_pipeline.py
Normal file
96
docs/examples/minimal_vlm_pipeline.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
VlmPipelineOptions,
|
||||
granite_vision_vlm_conversion_options,
|
||||
smoldocling_vlm_conversion_options,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
sources = [
|
||||
"tests/data/2305.03393v1-pg9-img.png",
|
||||
]
|
||||
|
||||
## Use experimental VlmPipeline
|
||||
pipeline_options = VlmPipelineOptions()
|
||||
# If force_backend_text = True, text from backend will be used instead of generated text
|
||||
pipeline_options.force_backend_text = False
|
||||
|
||||
## On GPU systems, enable flash_attention_2 with CUDA:
|
||||
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
|
||||
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
||||
|
||||
## Pick a VLM model. We choose SmolDocling-256M by default
|
||||
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
||||
|
||||
## Alternative VLM models:
|
||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
||||
## Set up pipeline for PDF or image inputs
|
||||
converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=VlmPipeline,
|
||||
pipeline_options=pipeline_options,
|
||||
),
|
||||
InputFormat.IMAGE: PdfFormatOption(
|
||||
pipeline_cls=VlmPipeline,
|
||||
pipeline_options=pipeline_options,
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
out_path = Path("scratch")
|
||||
out_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for source in sources:
|
||||
start_time = time.time()
|
||||
print("================================================")
|
||||
print("Processing... {}".format(source))
|
||||
print("================================================")
|
||||
print("")
|
||||
|
||||
res = converter.convert(source)
|
||||
|
||||
print("------------------------------------------------")
|
||||
print("MD:")
|
||||
print("------------------------------------------------")
|
||||
print("")
|
||||
print(res.document.export_to_markdown())
|
||||
|
||||
for page in res.pages:
|
||||
print("")
|
||||
print("Predicted page in DOCTAGS:")
|
||||
print(page.predictions.vlm_response.text)
|
||||
|
||||
res.document.save_as_html(
|
||||
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
|
||||
image_mode=ImageRefMode.REFERENCED,
|
||||
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
||||
)
|
||||
|
||||
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
|
||||
fp.write(json.dumps(res.document.export_to_dict()))
|
||||
|
||||
pg_num = res.document.num_pages()
|
||||
|
||||
print("")
|
||||
inference_time = time.time() - start_time
|
||||
print(
|
||||
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
|
||||
)
|
||||
|
||||
print("================================================")
|
||||
print("done!")
|
||||
print("================================================")
|
||||
343
docs/examples/pictures_description.ipynb
Normal file
343
docs/examples/pictures_description.ipynb
Normal file
File diff suppressed because one or more lines are too long
118
docs/examples/pictures_description_api.py
Normal file
118
docs/examples/pictures_description_api.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from docling_core.types.doc import PictureItem
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
PictureDescriptionApiOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def vllm_local_options(model: str):
|
||||
options = PictureDescriptionApiOptions(
|
||||
url="http://localhost:8000/v1/chat/completions",
|
||||
params=dict(
|
||||
model=model,
|
||||
seed=42,
|
||||
max_completion_tokens=200,
|
||||
),
|
||||
prompt="Describe the image in three sentences. Be consise and accurate.",
|
||||
timeout=90,
|
||||
)
|
||||
return options
|
||||
|
||||
|
||||
def watsonx_vlm_options():
|
||||
load_dotenv()
|
||||
api_key = os.environ.get("WX_API_KEY")
|
||||
project_id = os.environ.get("WX_PROJECT_ID")
|
||||
|
||||
def _get_iam_access_token(api_key: str) -> str:
|
||||
res = requests.post(
|
||||
url="https://iam.cloud.ibm.com/identity/token",
|
||||
headers={
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
},
|
||||
data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
|
||||
)
|
||||
res.raise_for_status()
|
||||
api_out = res.json()
|
||||
print(f"{api_out=}")
|
||||
return api_out["access_token"]
|
||||
|
||||
options = PictureDescriptionApiOptions(
|
||||
url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
|
||||
params=dict(
|
||||
model_id="meta-llama/llama-3-2-11b-vision-instruct",
|
||||
project_id=project_id,
|
||||
parameters=dict(
|
||||
max_new_tokens=400,
|
||||
),
|
||||
),
|
||||
headers={
|
||||
"Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
|
||||
},
|
||||
prompt="Describe the image in three sentences. Be consise and accurate.",
|
||||
timeout=60,
|
||||
)
|
||||
return options
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
enable_remote_services=True # <-- this is required!
|
||||
)
|
||||
pipeline_options.do_picture_description = True
|
||||
|
||||
# The PictureDescriptionApiOptions() allows to interface with APIs supporting
|
||||
# the multi-modal chat interface. Here follow a few example on how to configure those.
|
||||
#
|
||||
# One possibility is self-hosting model, e.g. via VLLM.
|
||||
# $ vllm serve MODEL_NAME
|
||||
# Then PictureDescriptionApiOptions can point to the localhost endpoint.
|
||||
#
|
||||
# Example for the Granite Vision model: (uncomment the following lines)
|
||||
# pipeline_options.picture_description_options = vllm_local_options(
|
||||
# model="ibm-granite/granite-vision-3.1-2b-preview"
|
||||
# )
|
||||
#
|
||||
# Example for the SmolVLM model: (uncomment the following lines)
|
||||
pipeline_options.picture_description_options = vllm_local_options(
|
||||
model="HuggingFaceTB/SmolVLM-256M-Instruct"
|
||||
)
|
||||
#
|
||||
# Another possibility is using online services, e.g. watsonx.ai.
|
||||
# Using requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
|
||||
# Uncomment the following line for this option:
|
||||
# pipeline_options.picture_description_options = watsonx_vlm_options()
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
)
|
||||
}
|
||||
)
|
||||
result = doc_converter.convert(input_doc_path)
|
||||
|
||||
for element, _level in result.document.iterate_items():
|
||||
if isinstance(element, PictureItem):
|
||||
print(
|
||||
f"Picture {element.self_ref}\n"
|
||||
f"Caption: {element.caption_text(doc=result.document)}\n"
|
||||
f"Annotations: {element.annotations}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
input_doc = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
|
||||
# Explicitly set the accelerator
|
||||
# accelerator_options = AcceleratorOptions(
|
||||
@@ -30,6 +30,9 @@ def main():
|
||||
# num_threads=8, device=AcceleratorDevice.CUDA
|
||||
# )
|
||||
|
||||
# easyocr doesnt support cuda:N allocation, defaults to cuda:0
|
||||
# accelerator_options = AcceleratorOptions(num_threads=8, device="cuda:1")
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.accelerator_options = accelerator_options
|
||||
pipeline_options.do_ocr = True
|
||||
|
||||
@@ -25,9 +25,8 @@ def main():
|
||||
Path("tests/data/docx/lorem_ipsum.docx"),
|
||||
Path("tests/data/pptx/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
Path("tests/data/test_01.asciidoc"),
|
||||
Path("tests/data/test_01.asciidoc"),
|
||||
Path("tests/data/pdf/2206.01062.pdf"),
|
||||
Path("tests/data/asciidoc/test_01.asciidoc"),
|
||||
]
|
||||
|
||||
## for defaults use:
|
||||
@@ -44,6 +43,7 @@ def main():
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
InputFormat.ASCIIDOC,
|
||||
InputFormat.CSV,
|
||||
InputFormat.MD,
|
||||
], # whitelist formats, non-matching files are ignored.
|
||||
format_options={
|
||||
|
||||
@@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
input_doc = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
|
||||
# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
|
||||
# ocr_options = TesseractOcrOptions(lang=["auto"])
|
||||
|
||||
@@ -32,7 +32,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
|
||||
Reference in New Issue
Block a user