From 8a252302404a217e108d190bb71ebb9e10cb405e Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 16 Oct 2024 10:28:40 +0200 Subject: [PATCH] Update v2 documentation --- docs/v2.md | 76 ++++++++++++++++++++++++++---------- tests/test_e2e_conversion.py | 7 +++- 2 files changed, 60 insertions(+), 23 deletions(-) diff --git a/docs/v2.md b/docs/v2.md index 0965115e..04260875 100644 --- a/docs/v2.md +++ b/docs/v2.md @@ -5,7 +5,33 @@ Docling v2 introduces several new features: - Produces a new, universal document representation which can encapsulate document hierarchy - Comes with a fresh new API and CLI -## Migration from v1 +## Changes in Docling v2 + +### CLI + +We updated the command line syntax of Docling v2 to support many formats. Examples are seen below. +```shell +# Convert a single file to Markdown (default) +docling myfile.pdf + +# Convert a single file to Markdown and JSON, without OCR +docling myfile.pdf --to json --to md --no-ocr + +# Convert PDF files in input directory to Markdown (default) +docling ./input/dir --from pdf + +# Convert PDF and Word files in input directory to Markdown and JSON +docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch + +# Convert all supported files in input directory to Markdown, but abort on first error +docling ./input/dir --output ./scratch --abort-on-error + +``` + +**Notable changes from Docling v1:** +- The standalone switches for different export formats are removed, and replaced with `--from` and `--to` arguments, to define input and output formats respectively. +- The new `--abort-on-error` will abort any batch conversion as soon as an error is encountered +- The `--backend` option for PDFs was removed ### Setting up a `DocumentConverter` @@ -81,8 +107,9 @@ or `DocumentStream` objects, without constructing a `DocumentConversionInput` ob ```python ... 
+from docling.datamodel.document import ConversionResult ## Convert a single file (from URL or local path) -conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` +conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` ## Convert several files at once: @@ -116,7 +143,7 @@ is now available in conversion results as a `DoclingDocument` object. `DoclingDocument` provides a neat set of APIs to construct, iterate and export content in the document, as shown below. ```python -conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` +conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` ## Inspect the converted document: conv_result.document.print_element_tree() @@ -132,6 +159,11 @@ for item, level in conv_result.document.iterate_items: #... ``` +**Note**: While it is deprecated, you can _still_ work with the Docling v1 document representation; it is available as: +```python +conv_result.legacy_document # provides the representation in the previous ExportedCCSDocument type +``` + ## Export into JSON, Markdown, Doctags **Note**: All `render_...` methods in `ConversionResult` have been removed in Docling v2, and are now available on `DoclingDocument` as: @@ -140,7 +172,7 @@ and are now available on `DoclingDocument` as: - `DoclingDocument.export_to_document_tokens` ```python -conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` +conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` ## Export to desired format: print(json.dumps(conv_res.document.export_to_dict())) @@ -148,28 +180,30 @@ print(conv_res.document.export_to_markdown()) print(conv_res.document.export_to_document_tokens()) ``` -### CLI - -We updated the command line syntax of Docling v2 to 
support many formats. Examples are seen below. +**Note**: While it is deprecated, you can _still_ export Docling v1 JSON format. This is available through the same +methods as on the `DoclingDocument` type: ```shell -# Convert a single file to Markdown (default) -docling myfile.pdf +## Export legacy document representation to desired format, for v1 compatibility: +print(json.dumps(conv_res.legacy_document.export_to_dict())) +print(conv_res.legacy_document.export_to_markdown()) +print(conv_res.legacy_document.export_to_document_tokens()) +``` -# Convert a single file to Markdown and JSON, without OCR -docling myfile.pdf --to json --to md --no-ocr +## Reload a `DoclingDocument` stored as JSON -# Convert PDF files in input directory to Markdown (default) -docling ./input/dir --from pdf +You can save and reload a `DoclingDocument` to disk in JSON format using the following code: -# Convert PDF and Word files in input directory to Markdown and JSON -docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch +```python +# Save to disk: +doc: DoclingDocument = conv_res.document # produced from conversion result... -# Convert all supported files in input directory to Markdown, but abort on first error -docling ./input/dir --output ./scratch --abort-on-error + +with Path("./doc.json").open("w") as fp: + fp.write(json.dumps(doc.export_to_dict())) # use `export_to_dict` to ensure consistency + +# Load from disk: +with Path("./doc.json").open("r") as fp: + doc_dict = json.loads(fp.read()) + doc = DoclingDocument.model_validate(doc_dict) # use standard pydantic API to populate doc ``` -Notable changes from v1: -- The standalone switches for different export formats are removed, and replaced with `--from` and `--to` arguments, to define input and output formats respectively. 
-- The new `--abort-on-error` will abort any batch conversion as soon an error is encountered -- The `--backend` option for PDFs was removed \ No newline at end of file diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 81a3771c..1e166116 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -1,5 +1,8 @@ from pathlib import Path +import yaml +from docling_core.types.doc import DoclingDocument + from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult @@ -8,8 +11,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE_V1 = True -GENERATE_V2 = True +GENERATE_V1 = False +GENERATE_V2 = False def get_pdf_paths():