feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Christoph Auer
2024-10-16 21:02:03 +02:00
committed by GitHub
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions

View File

@@ -4,12 +4,17 @@ import time
from pathlib import Path
from typing import Iterable
import yaml
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
USE_V2 = True
USE_LEGACY = False
def export_documents(
conv_results: Iterable[ConversionResult],
@@ -26,25 +31,53 @@ def export_documents(
success_count += 1
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
if USE_V2:
# Export Docling document format to JSON:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Docling document format to YAML:
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Docling document format to doctags:
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
fp.write(conv_res.document.export_to_document_tokens())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
# Export Docling document format to markdown:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.document.export_to_markdown())
# Export Docling document format to text:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.document.export_to_markdown(strict_text=True))
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(
conv_res.legacy_document.export_to_markdown(strict_text=True)
)
# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.legacy_document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.legacy_document.export_to_doctags())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
@@ -77,23 +110,24 @@ def main():
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_all(
input_doc_paths,
raises_on_error=False,  # let conversion run through all documents and examine results at the end
)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
conv_results, output_dir=Path("scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
_log.info(f"Document conversion complete in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
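
For reference, the v2 flow above condenses to the following sketch (paths and option names are taken from the example; error handling beyond the status check is omitted):

import json
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
results = converter.convert_all(
    [Path("./tests/data/2206.01062.pdf")],
    raises_on_error=False,  # examine per-document status at the end instead of failing fast
)

output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
for res in results:
    if res.status != ConversionStatus.SUCCESS:
        continue
    stem = res.input.file.stem
    # DoclingDocument JSON and Markdown exports, as in the USE_V2 branch above
    (output_dir / f"{stem}.json").write_text(json.dumps(res.document.export_to_dict()), encoding="utf-8")
    (output_dir / f"{stem}.md").write_text(res.document.export_to_markdown(), encoding="utf-8")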

View File

@@ -2,72 +2,18 @@ import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
for conv_res in conv_results:
if conv_res.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
return success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
input_doc_path = Path("./tests/data/2206.01062.pdf")
###########################################################################
@@ -101,14 +47,15 @@ def main():
# Docling Parse without EasyOCR
# -------------------------
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
# Docling Parse with EasyOCR
@@ -151,24 +98,32 @@ def main():
###########################################################################
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
conv_results = doc_converter.convert(input)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
conv_result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
_log.info(f"Document converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
## Export results
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
fp.write(json.dumps(conv_result.document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
fp.write(conv_result.document.export_to_document_tokens())
if __name__ == "__main__":
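
The central migration in this example is how pipeline options and the PDF backend reach the converter: the v1 pipeline_options/pdf_backend keyword arguments are replaced by per-format options. A condensed sketch of that wiring, reusing names from the example (the explicit backend argument is shown only for illustration):

from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# v2: pipeline options (and optionally the backend) are scoped to an input format.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseDocumentBackend,
        )
    }
)
conv_result = doc_converter.convert(Path("./tests/data/2206.01062.pdf"))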

View File

@@ -0,0 +1,100 @@
import logging
from pathlib import Path
from typing import Any, Iterable
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureClassificationClass,
PictureClassificationData,
PictureItem,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
do_picture_classifier: bool = True
class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
def __init__(self, enabled: bool):
self.enabled = enabled
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
return self.enabled and isinstance(element, PictureItem)
def __call__(
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]:
if not self.enabled:
return
for element in element_batch:
assert isinstance(element, PictureItem)
# uncomment this to interactively visualize the image
# element.image.pil_image.show()
element.annotations.append(
PictureClassificationData(
provenance="example_classifier-0.0.1",
predicted_classes=[
PictureClassificationClass(class_name="dummy", confidence=0.42)
],
)
)
yield element
class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: ExamplePictureClassifierPipelineOptions
self.enrichment_pipe = [
ExamplePictureClassifierEnrichmentModel(
enabled=pipeline_options.do_picture_classifier
)
]
@classmethod
def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
return ExamplePictureClassifierPipelineOptions()
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=ExamplePictureClassifierPipeline,
pipeline_options=pipeline_options,
)
}
)
result = doc_converter.convert(input_doc_path)
for element, _level in result.document.iterate_items():
if isinstance(element, PictureItem):
print(
f"The model populated the `data` portion of picture {element.self_ref}:\n{element.annotations}"
)
if __name__ == "__main__":
main()

View File

@@ -1,17 +1,12 @@
import logging
import time
from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
TableElement,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling_core.types.doc import PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
@@ -21,64 +16,64 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
input_doc_path = Path("./tests/data/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will discard them to free up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
# The PdfPipelineOptions.generate_* options select which document elements will be enriched
# with the image field
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_table_images = True
pipeline_options.generate_picture_images = True
doc_converter = DocumentConverter(assemble_options=assemble_options)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
start_time = time.time()
conv_results = doc_converter.convert(input_files)
conv_res = doc_converter.convert(input_doc_path)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
doc_filename = conv_res.input.file.stem
# Save page images
for page_no, page in conv_res.document.pages.items():
page_no = page.page_no
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.pil_image.save(fp, format="PNG")
# Export page images
for page in conv_res.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")
# Export figures and tables
for element, image in conv_res.render_element_images(
element_types=(FigureElement, TableElement)
):
# Save images of figures and tables
table_counter = 0
picture_counter = 0
for element, _level in conv_res.document.iterate_items():
if isinstance(element, TableItem):
table_counter += 1
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
output_dir / f"{doc_filename}-table-{table_counter}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
element.image.pil_image.save(fp, "PNG")
success_count += 1
if isinstance(element, PictureItem):
picture_counter += 1
element_image_filename = (
output_dir / f"{doc_filename}-picture-{picture_counter}.png"
)
with element_image_filename.open("wb") as fp:
element.image.pil_image.save(fp, "PNG")
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
_log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
if __name__ == "__main__":
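
Consolidated, the v2 image-export flow shown in this hunk reads roughly as follows; a sketch reusing the example's options and filenames:

from pathlib import Path

from docling_core.types.doc import PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2.0  # scale relative to the default 72 DPI rendering
pipeline_options.generate_page_images = True
pipeline_options.generate_table_images = True
pipeline_options.generate_picture_images = True

conv_res = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
).convert(Path("./tests/data/2206.01062.pdf"))

output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_res.input.file.stem

# Page images are keyed by page number on the converted document.
for page_no, page in conv_res.document.pages.items():
    with (output_dir / f"{doc_filename}-{page_no}.png").open("wb") as fp:
        page.image.pil_image.save(fp, format="PNG")

# Table and picture images are collected by walking the document tree.
table_counter = picture_counter = 0
for element, _level in conv_res.document.iterate_items():
    if isinstance(element, TableItem):
        table_counter += 1
        with (output_dir / f"{doc_filename}-table-{table_counter}.png").open("wb") as fp:
            element.image.pil_image.save(fp, "PNG")
    elif isinstance(element, PictureItem):
        picture_counter += 1
        with (output_dir / f"{doc_filename}-picture-{picture_counter}.png").open("wb") as fp:
            element.image.pil_image.save(fp, "PNG")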

View File

@@ -5,10 +5,11 @@ from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash
_log = logging.getLogger(__name__)
@@ -18,71 +19,66 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
input_doc_path = Path("./tests/data/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will discard them to free up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
doc_converter = DocumentConverter(assemble_options=assemble_options)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
start_time = time.time()
converted_docs = doc_converter.convert(input_files)
conv_res = doc_converter.convert(input_doc_path)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
continue
rows = []
for (
content_text,
content_md,
content_dt,
page_cells,
page_segments,
page,
) in generate_multimodal_pages(doc):
rows = []
for (
content_text,
content_md,
content_dt,
page_cells,
page_segments,
page,
) in generate_multimodal_pages(conv_res):
dpi = page._default_image_scale * 72
dpi = page._default_image_scale * 72
rows.append(
{
"document": doc.input.file.name,
"hash": doc.input.document_hash,
"page_hash": page.page_hash,
"image": {
"width": page.image.width,
"height": page.image.height,
"bytes": page.image.tobytes(),
},
"cells": page_cells,
"contents": content_text,
"contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments,
"extra": {
"page_num": page.page_no + 1,
"width_in_points": page.size.width,
"height_in_points": page.size.height,
"dpi": dpi,
},
}
)
success_count += 1
rows.append(
{
"document": conv_res.input.file.name,
"hash": conv_res.input.document_hash,
"page_hash": create_hash(
conv_res.input.document_hash + ":" + str(page.page_no - 1)
),
"image": {
"width": page.image.width,
"height": page.image.height,
"bytes": page.image.tobytes(),
},
"cells": page_cells,
"contents": content_text,
"contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments,
"extra": {
"page_num": page.page_no + 1,
"width_in_points": page.size.width,
"height_in_points": page.size.height,
"dpi": dpi,
},
}
)
# Generate one parquet from all documents
df = pd.json_normalize(rows)
@@ -92,12 +88,9 @@ def main():
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
_log.info(
f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
)
# This block demonstrates how the file can be opened with the HF datasets library
# from datasets import Dataset
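
As the trailing comment hints, the generated parquet can be opened with the Hugging Face datasets library; a minimal sketch, where the parquet filename is an assumption (this hunk does not show the exact path the example writes under the scratch directory):

from datasets import Dataset

# Hypothetical path; use whatever parquet file the example produced under "scratch".
ds = Dataset.from_parquet("scratch/multimodal.parquet")
print(ds)  # columns include document, hash, page_hash, image, cells, contents, segments, extra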

View File

@@ -1,12 +1,9 @@
import logging
import time
from pathlib import Path
from typing import Tuple
import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
@@ -15,59 +12,39 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
input_doc_path = Path("./tests/data/2206.01062.pdf")
output_dir = Path("scratch")
doc_converter = DocumentConverter()
start_time = time.time()
conv_results = doc_converter.convert(input_files)
conv_res = doc_converter.convert(input_doc_path)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
doc_filename = conv_res.input.file.stem
# Export tables
for table_ix, table in enumerate(conv_res.output.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())
# Export tables
for table_ix, table in enumerate(conv_res.document.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
element_html_filename = (
output_dir / f"{doc_filename}-table-{table_ix+1}.html"
)
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html())
success_count += 1
# Save the table as html
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html())
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
_log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")
if __name__ == "__main__":
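
The table-export migration follows the same shape: tables are read from conv_res.document.tables and each TableItem exports to a pandas DataFrame or to HTML. A compact sketch with the example's paths:

from pathlib import Path

import pandas as pd

from docling.document_converter import DocumentConverter

conv_res = DocumentConverter().convert(Path("./tests/data/2206.01062.pdf"))

output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_res.input.file.stem

for table_ix, table in enumerate(conv_res.document.tables):
    table_df: pd.DataFrame = table.export_to_dataframe()
    table_df.to_csv(output_dir / f"{doc_filename}-table-{table_ix + 1}.csv")
    (output_dir / f"{doc_filename}-table-{table_ix + 1}.html").write_text(table.export_to_html())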

View File

@@ -2,5 +2,9 @@ from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"
result = converter.convert(source)
print(
result.document.export_to_markdown()
) # output: ## Docling Technical Report [...]"
# if the legacy output is needed, use this version
# print(result.legacy_document.export_to_markdown()) # output: ## Docling Technical Report [...]"

View File

@@ -49,18 +49,6 @@
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -86,54 +74,37 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from enum import Enum\n",
"from typing import Iterator\n",
"\n",
"from langchain_core.document_loaders import BaseLoader\n",
"from langchain_core.documents import Document as LCDocument\n",
"from pydantic import BaseModel\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"\n",
"class DocumentMetadata(BaseModel):\n",
" dl_doc_hash: str\n",
" # source: str\n",
"\n",
"\n",
"class DoclingPDFLoader(BaseLoader):\n",
" class ParseType(str, Enum):\n",
" MARKDOWN = \"markdown\"\n",
" # JSON = \"json\"\n",
"\n",
" def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
" def __init__(self, file_path: str | list[str]) -> None:\n",
" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
" self._parse_type = parse_type\n",
" self._converter = DocumentConverter()\n",
"\n",
" def lazy_load(self) -> Iterator[LCDocument]:\n",
" for source in self._file_paths:\n",
" dl_doc = self._converter.convert_single(source).output\n",
" match self._parse_type:\n",
" case self.ParseType.MARKDOWN:\n",
" text = dl_doc.export_to_markdown()\n",
" # case self.ParseType.JSON:\n",
" # text = dl_doc.model_dump_json()\n",
" case _:\n",
" raise RuntimeError(\n",
" f\"Unexpected parse type encountered: {self._parse_type}\"\n",
" )\n",
" lc_doc = LCDocument(\n",
" page_content=text,\n",
" metadata=DocumentMetadata(\n",
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
" ).model_dump(),\n",
" )\n",
" yield lc_doc"
" dl_doc = self._converter.convert(source).document\n",
" text = dl_doc.export_to_markdown()\n",
" yield LCDocument(page_content=text)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = \"https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf\" # DocLayNet paper"
]
},
{
@@ -141,37 +112,10 @@
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1b38d07d5fed4618a44ecf261e1e5c44",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"\n",
"loader = DoclingPDFLoader(\n",
" file_path=FILE_PATH,\n",
" parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
")\n",
"loader = DoclingPDFLoader(file_path=FILE_PATH)\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=1000,\n",
" chunk_overlap=200,\n",
@@ -187,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -204,7 +148,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -223,7 +167,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -232,7 +176,7 @@
"from langchain_milvus import Milvus\n",
"\n",
"MILVUS_URI = os.environ.get(\n",
" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
" \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
")\n",
"\n",
"vectorstore = Milvus.from_documents(\n",
@@ -252,7 +196,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -287,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -319,16 +263,16 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"
"'- 80,863 pages were human annotated for DocLayNet.'"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -336,13 +280,6 @@
"source": [
"rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/examples/rag_llamaindex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_llamaindex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
@@ -14,6 +14,13 @@
"# RAG with LlamaIndex 🦙"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 👉 **The LlamaIndex Docling extension update to Docling v2 is ongoing; in the meanwhile, this notebook is showing current extension output, based on Docling v1.**"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -0,0 +1,76 @@
import json
import logging
from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__)
def main():
input_paths = [
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
]
## for defaults use:
# doc_converter = DocumentConverter()
## to customize use:
doc_converter = (
DocumentConverter(  # all of the below is optional and has internal defaults.
allowed_formats=[
InputFormat.PDF,
InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
],  # whitelist formats; non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
),
},
)
)
conv_results = doc_converter.convert_all(input_paths)
for res in conv_results:
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}"
)
# print(res.document.export_to_markdown())
# Export Docling document format to markdown:
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.document.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))
with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(res.document.export_to_dict()))
if __name__ == "__main__":
main()
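
For the defaults mentioned in the comment near the top of this example, no format options are needed at all; a minimal sketch using one of the example's input files:

from docling.document_converter import DocumentConverter

# Default converter: all supported formats enabled with their built-in pipelines and backends.
converter = DocumentConverter()
result = converter.convert("tests/data/word_sample.docx")
print(result.document.export_to_markdown())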