diff --git a/docs/v2.md b/docs/v2.md index 7e5e9ad9..0965115e 100644 --- a/docs/v2.md +++ b/docs/v2.md @@ -27,11 +27,19 @@ from docling.document_converter import ( ) from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend ## Default initialization still works as before: # doc_converter = DocumentConverter() + +# previous `PipelineOptions` is now `PdfPipelineOptions` +pipeline_options = PdfPipelineOptions() +pipeline_options.do_ocr = False +pipeline_options.do_table_structure = True +#... + ## Custom options are now defined per format. doc_converter = ( DocumentConverter( # all of the below is optional, has internal defaults. @@ -44,17 +52,22 @@ doc_converter = ( ], # whitelist formats, non-matching files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend + pipeline_options=pipeline_options, # pipeline options go here. + backend=PyPdfiumDocumentBackend # optional: pick an alternative backend ), InputFormat.DOCX: WordFormatOption( - pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend + pipeline_cls=SimplePipeline # default for office formats and HTML ), }, ) ) ``` -Note: If you work only with defaults, all remains the same as in Docling v1. +**Note**: If you work only with defaults, all remains the same as in Docling v1. 
+ +More options are shown in the following example units: +- [run_with_formats.py](docs/examples/run_with_formats.py) +- [custom_convert.py](docs/examples/custom_convert.py) ### Converting documents @@ -82,6 +95,7 @@ input_files = [ "tests/data/2206.01062.pdf", ] +# Directly pass list of files or streams to `convert_all` conv_results_iter = doc_converter.convert_all(input_files) # previously `convert_batch` ``` @@ -95,13 +109,67 @@ conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False ``` -### Exporting documents into JSON, Markdown, Doctags +### Access document structures -We have simplified how you can access and export the converted document data, too. +We have simplified how you can access and export the converted document data, too. Our universal document representation +is now available in conversion results as a `DoclingDocument` object. +`DoclingDocument` provides a neat set of APIs to construct, iterate and export content in the document, as shown below. -TBD. +```python +conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` +## Inspect the converted document: +conv_result.document.print_element_tree() + +## Iterate the elements in reading order, including hierarchy level: +for item, level in conv_result.document.iterate_items(): + if isinstance(item, TextItem): + print(item.text) + elif isinstance(item, TableItem): + table_df: pd.DataFrame = item.export_to_dataframe() + print(table_df.to_markdown()) + elif ...: + #... 
+``` + +## Export into JSON, Markdown, Doctags +**Note**: All `render_...` methods in `ConversionResult` have been removed in Docling v2, +and are now available on `DoclingDocument` as: +- `DoclingDocument.export_to_dict` +- `DoclingDocument.export_to_markdown` +- `DoclingDocument.export_to_document_tokens` + +```python +conv_result = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` + +## Export to desired format: +print(json.dumps(conv_result.document.export_to_dict())) +print(conv_result.document.export_to_markdown()) +print(conv_result.document.export_to_document_tokens()) +``` ### CLI -TBD. +We updated the command line syntax of Docling v2 to support many formats. Examples are seen below. +```shell +# Convert a single file to Markdown (default) +docling myfile.pdf + +# Convert a single file to Markdown and JSON, without OCR +docling myfile.pdf --to json --to md --no-ocr + +# Convert PDF files in input directory to Markdown (default) +docling ./input/dir --from pdf + +# Convert PDF and Word files in input directory to Markdown and JSON +docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch + +# Convert all supported files in input directory to Markdown, but abort on first error +docling ./input/dir --output ./scratch --abort-on-error + +``` + +Notable changes from v1: +- The standalone switches for different export formats are removed, and replaced with `--from` and `--to` arguments, to define input and output formats respectively. +- The new `--abort-on-error` will abort any batch conversion as soon as an error is encountered +- The `--backend` option for PDFs was removed \ No newline at end of file