mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Update v2 documentation
This commit is contained in:
parent
df3ff47914
commit
8a25230240
76
docs/v2.md
76
docs/v2.md
@ -5,7 +5,33 @@ Docling v2 introduces several new features:
|
|||||||
- Produces a new, universal document representation which can encapsulate document hierarchy
|
- Produces a new, universal document representation which can encapsulate document hierarchy
|
||||||
- Comes with a fresh new API and CLI
|
- Comes with a fresh new API and CLI
|
||||||
|
|
||||||
## Migration from v1
|
## Changes in Docling v2
|
||||||
|
|
||||||
|
### CLI
|
||||||
|
|
||||||
|
We updated the command line syntax of Docling v2 to support many formats. Examples are shown below.
|
||||||
|
```shell
|
||||||
|
# Convert a single file to Markdown (default)
|
||||||
|
docling myfile.pdf
|
||||||
|
|
||||||
|
# Convert a single file to Markdown and JSON, without OCR
|
||||||
|
docling myfile.pdf --to json --to md --no-ocr
|
||||||
|
|
||||||
|
# Convert PDF files in input directory to Markdown (default)
|
||||||
|
docling ./input/dir --from pdf
|
||||||
|
|
||||||
|
# Convert PDF and Word files in input directory to Markdown and JSON
|
||||||
|
docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch
|
||||||
|
|
||||||
|
# Convert all supported files in input directory to Markdown, but abort on first error
|
||||||
|
docling ./input/dir --output ./scratch --abort-on-error
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Notable changes from Docling v1:**
|
||||||
|
- The standalone switches for different export formats are removed, and replaced with `--from` and `--to` arguments, to define input and output formats respectively.
|
||||||
|
- The new `--abort-on-error` will abort any batch conversion as soon as an error is encountered
|
||||||
|
- The `--backend` option for PDFs was removed
|
||||||
|
|
||||||
### Setting up a `DocumentConverter`
|
### Setting up a `DocumentConverter`
|
||||||
|
|
||||||
@ -81,8 +107,9 @@ or `DocumentStream` objects, without constructing a `DocumentConversionInput` ob
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
...
|
...
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
## Convert a single file (from URL or local path)
|
## Convert a single file (from URL or local path)
|
||||||
conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
||||||
|
|
||||||
## Convert several files at once:
|
## Convert several files at once:
|
||||||
|
|
||||||
@ -116,7 +143,7 @@ is now available in conversion results as a `DoclingDocument` object.
|
|||||||
`DoclingDocument` provides a neat set of APIs to construct, iterate and export content in the document, as shown below.
|
`DoclingDocument` provides a neat set of APIs to construct, iterate and export content in the document, as shown below.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
||||||
|
|
||||||
## Inspect the converted document:
|
## Inspect the converted document:
|
||||||
conv_result.document.print_element_tree()
|
conv_result.document.print_element_tree()
|
||||||
@ -132,6 +159,11 @@ for item, level in conv_result.document.iterate_items:
|
|||||||
#...
|
#...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Note**: While it is deprecated, you can _still_ work with the Docling v1 document representation. It is available as:
|
||||||
|
```python
|
||||||
|
conv_result.legacy_document # provides the representation in previous ExportedCCSDocument type
|
||||||
|
```
|
||||||
|
|
||||||
## Export into JSON, Markdown, Doctags
|
## Export into JSON, Markdown, Doctags
|
||||||
**Note**: All `render_...` methods in `ConversionResult` have been removed in Docling v2,
|
**Note**: All `render_...` methods in `ConversionResult` have been removed in Docling v2,
|
||||||
and are now available on `DoclingDocument` as:
|
and are now available on `DoclingDocument` as:
|
||||||
@ -140,7 +172,7 @@ and are now available on `DoclingDocument` as:
|
|||||||
- `DoclingDocument.export_to_document_tokens`
|
- `DoclingDocument.export_to_document_tokens`
|
||||||
|
|
||||||
```python
|
```python
|
||||||
conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
||||||
|
|
||||||
## Export to desired format:
|
## Export to desired format:
|
||||||
print(json.dumps(conv_res.document.export_to_dict()))
|
print(json.dumps(conv_res.document.export_to_dict()))
|
||||||
@ -148,28 +180,30 @@ print(conv_res.document.export_to_markdown())
|
|||||||
print(conv_res.document.export_to_document_tokens())
|
print(conv_res.document.export_to_document_tokens())
|
||||||
```
|
```
|
||||||
|
|
||||||
### CLI
|
**Note**: While it is deprecated, you can _still_ export Docling v1 JSON format. This is available through the same
|
||||||
|
methods as on the `DoclingDocument` type:
|
||||||
We updated the command line syntax of Docling v2 to support many formats. Examples are shown below.
|
|
||||||
```shell
|
```shell
|
||||||
# Convert a single file to Markdown (default)
|
## Export legacy document representation to desired format, for v1 compatibility:
|
||||||
docling myfile.pdf
|
print(json.dumps(conv_res.legacy_document.export_to_dict()))
|
||||||
|
print(conv_res.legacy_document.export_to_markdown())
|
||||||
|
print(conv_res.legacy_document.export_to_document_tokens())
|
||||||
|
```
|
||||||
|
|
||||||
# Convert a single file to Markdown and JSON, without OCR
|
## Reload a `DoclingDocument` stored as JSON
|
||||||
docling myfile.pdf --to json --to md --no-ocr
|
|
||||||
|
|
||||||
# Convert PDF files in input directory to Markdown (default)
|
You can save a `DoclingDocument` to disk in JSON format and reload it using the following code:
|
||||||
docling ./input/dir --from pdf
|
|
||||||
|
|
||||||
# Convert PDF and Word files in input directory to Markdown and JSON
|
```python
|
||||||
docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch
|
# Save to disk:
|
||||||
|
doc: DoclingDocument = conv_res.document # produced from conversion result...
|
||||||
|
|
||||||
# Convert all supported files in input directory to Markdown, but abort on first error
|
with Path("./doc.json").open("w") as fp:
|
||||||
docling ./input/dir --output ./scratch --abort-on-error
|
fp.write(json.dumps(doc.export_to_dict())) # use `export_to_dict` to ensure consistency
|
||||||
|
|
||||||
|
# Load from disk:
|
||||||
|
with Path("./doc.json").open("r") as fp:
|
||||||
|
doc_dict = json.loads(fp.read())
|
||||||
|
doc = DoclingDocument.model_validate(doc_dict) # use standard pydantic API to populate doc
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Notable changes from v1:
|
|
||||||
- The standalone switches for different export formats are removed, and replaced with `--from` and `--to` arguments, to define input and output formats respectively.
|
|
||||||
- The new `--abort-on-error` will abort any batch conversion as soon as an error is encountered
|
|
||||||
- The `--backend` option for PDFs was removed
|
|
@ -1,5 +1,8 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from docling_core.types.doc import DoclingDocument
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -8,8 +11,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
GENERATE_V1 = True
|
GENERATE_V1 = False
|
||||||
GENERATE_V2 = True
|
GENERATE_V2 = False
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_paths():
|
def get_pdf_paths():
|
||||||
|
Loading…
Reference in New Issue
Block a user