diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f8772f2..d3f59878 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +## [v1.14.0](https://github.com/DS4SD/docling/releases/tag/v1.14.0) - 2024-09-24 + +### Feature + +* Add URL support to CLI ([#99](https://github.com/DS4SD/docling/issues/99)) ([`3c46e42`](https://github.com/DS4SD/docling/commit/3c46e4266cf1ad8d3a99aa33636d84d34222b4fe)) + +### Fix + +* Fix OCR setting for pypdfium, minor refactor ([#102](https://github.com/DS4SD/docling/issues/102)) ([`d96b96c`](https://github.com/DS4SD/docling/commit/d96b96c8481a8ae68545a34aaf9b8d5a6637a6be)) + +### Documentation + +* Document CLI, minor README revamp ([#100](https://github.com/DS4SD/docling/issues/100)) ([`f8f2303`](https://github.com/DS4SD/docling/commit/f8f2303348c4bbcb7903ff172746a69607e53271)) + +## [v1.13.1](https://github.com/DS4SD/docling/releases/tag/v1.13.1) - 2024-09-23 + +### Fix + +* Updated the render_as_doctags with the new arguments from docling-core ([#93](https://github.com/DS4SD/docling/issues/93)) ([`4794ce4`](https://github.com/DS4SD/docling/commit/4794ce460a542a730fd5a72a7be7f94a07ed5d12)) + ## [v1.13.0](https://github.com/DS4SD/docling/releases/tag/v1.13.0) - 2024-09-18 ### Feature diff --git a/README.md b/README.md index 2631d3ca..2fd199cd 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast * 📑 Understands detailed page layout, reading order and recovers table structures * 📝 Extracts metadata from the document, such as title, authors, references and language -* 🔍 Optionally applies OCR (use with scanned PDFs) +* 🔍 Includes OCR support for scanned PDFs * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain +* 💻 Provides a simple and convenient CLI ## Installation @@ -35,31 +36,33 @@ pip install docling > [!NOTE] > Works on macOS and 
Linux environments. Windows platforms are currently not tested. +
+ Alternative PyTorch distributions -### Use alternative PyTorch distributions + The Docling models depend on the [PyTorch](https://pytorch.org/) library. + Depending on your architecture, you might want to use a different distribution of `torch`. + For example, you might want support for a different accelerator or for a cpu-only version. + All the different ways for installing `torch` are listed on [their website](https://pytorch.org/). -The Docling models depend on the [PyTorch](https://pytorch.org/) library. -Depending on your architecture, you might want to use a different distribution of `torch`. -For example, you might want support for different accelerator or for a cpu-only version. -All the different ways for installing `torch` are listed on their website . + One common situation is the installation on Linux systems with cpu-only support. + In this case, we suggest the installation of Docling with the following options: -One common situation is the installation on Linux systems with cpu-only support. -In this case, we suggest the installation of Docling with the following options + ```bash + # Example for installing on the Linux cpu-only version + pip install docling --extra-index-url https://download.pytorch.org/whl/cpu + ``` +
-```bash -# Example for installing on the Linux cpu-only version -pip install docling --extra-index-url https://download.pytorch.org/whl/cpu -``` +
+ Docling development setup + To develop for Docling (features, bug fixes, etc.), install as follows from your local clone's root directory: + ```bash + poetry install --all-extras + ``` +
-### Development setup - -To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir: -```bash -poetry install --all-extras -``` - -## Usage +## Getting started ### Convert a single document @@ -70,7 +73,6 @@ from docling.document_converter import DocumentConverter source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) - print(result.render_as_markdown()) # output: "## Docling Technical Report[...]" print(result.render_as_doctags()) # output: "<page_1><loc_20>..." ``` @@ -86,6 +88,51 @@ python examples/batch_convert.py ``` The output of the above command will be written to `./scratch`. +### CLI + +You can also use Docling directly from your command line to convert individual files —be it local or by URL— or whole directories. + +A simple example would look like this: +```console +docling https://arxiv.org/pdf/2206.01062 +``` + +To see all available options (export formats etc.) run `docling --help`. + +<details> + <summary><b>CLI reference</b></summary> + + Here are the available options as of this writing (for an up-to-date listing, run `docling --help`): + + ```console + $ docling --help + + Usage: docling [OPTIONS] source + + ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ + │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │ + ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ + │ --json --no-json If enabled the document is exported as JSON. [default: no-json] │ + │ --md --no-md If enabled the document is exported as Markdown. 
[default: md] │ + │ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │ + │ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │ + │ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │ + │ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │ + │ --output PATH Output directory where results are saved. [default: .] │ + │ --version Show version information. │ + │ --help Show this message and exit. │ + ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + ``` +</details> + +### RAG +Check out the following examples showcasing RAG using Docling with standard LLM application frameworks: +- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb) +- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb) + +## Advanced features + ### Adjust pipeline features The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways @@ -144,11 +191,6 @@ results = doc_converter.convert(conv_input) You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads. -### RAG -Check out the following examples showcasing RAG using Docling with standard LLM application frameworks: -- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb) -- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb) - ## Technical report For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869). 
diff --git a/docling/cli/main.py b/docling/cli/main.py index c8a8f3ac..894e9ab1 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Annotated, Iterable, List, Optional import typer -from pydantic import AnyUrl +from docling_core.utils.file import resolve_file_source from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend @@ -109,11 +109,11 @@ def export_documents( @app.command(no_args_is_help=True) def convert( input_sources: Annotated[ - List[Path], + List[str], typer.Argument( ..., metavar="source", - help="PDF files to convert. Directories are also accepted.", + help="PDF files to convert. Can be local file / directory paths or URL.", ), ], export_json: Annotated[ @@ -167,7 +167,8 @@ def convert( logging.basicConfig(level=logging.INFO) input_doc_paths: List[Path] = [] - for source in input_sources: + for src in input_sources: + source = resolve_file_source(source=src) if not source.exists(): err_console.print( f"[red]Error: The input file {source} does not exist.[/red]" @@ -179,58 +180,25 @@ def convert( else: input_doc_paths.append(source) - ########################################################################### + match backend: + case Backend.PYPDFIUM2: + do_cell_matching = ocr # only do cell matching when OCR enabled + pdf_backend = PyPdfiumDocumentBackend + case Backend.DOCLING: + do_cell_matching = True + pdf_backend = DoclingParseDocumentBackend + case _: + raise RuntimeError(f"Unexpected backend type {backend}") - # The following sections contain a combination of PipelineOptions - # and PDF Backends for various configurations. - # Uncomment one section at the time to see the differences in the output. 
- - doc_converter = None - if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR - pipeline_options = PipelineOptions() - pipeline_options.do_ocr = False - pipeline_options.do_table_structure = True - pipeline_options.table_structure_options.do_cell_matching = False - - doc_converter = DocumentConverter( - pipeline_options=pipeline_options, - pdf_backend=PyPdfiumDocumentBackend, - ) - - elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR - pipeline_options = PipelineOptions() - pipeline_options.do_ocr = False - pipeline_options.do_table_structure = True - pipeline_options.table_structure_options.do_cell_matching = True - - doc_converter = DocumentConverter( - pipeline_options=pipeline_options, - pdf_backend=PyPdfiumDocumentBackend, - ) - - elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR - pipeline_options = PipelineOptions() - pipeline_options.do_ocr = False - pipeline_options.do_table_structure = True - pipeline_options.table_structure_options.do_cell_matching = True - - doc_converter = DocumentConverter( - pipeline_options=pipeline_options, - pdf_backend=DoclingParseDocumentBackend, - ) - - elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR - pipeline_options = PipelineOptions() - pipeline_options.do_ocr = True - pipeline_options.do_table_structure = True - pipeline_options.table_structure_options.do_cell_matching = True - - doc_converter = DocumentConverter( - pipeline_options=pipeline_options, - pdf_backend=DoclingParseDocumentBackend, - ) - - ########################################################################### + pipeline_options = PipelineOptions( + do_ocr=ocr, + do_table_structure=True, + ) + pipeline_options.table_structure_options.do_cell_matching = do_cell_matching + doc_converter = DocumentConverter( + pipeline_options=pipeline_options, + pdf_backend=pdf_backend, + ) # Define input files input = DocumentConversionInput.from_paths(input_doc_paths) diff --git 
a/examples/rag_langchain.ipynb b/examples/rag_langchain.ipynb index fd26908a..30e38329 100644 --- a/examples/rag_langchain.ipynb +++ b/examples/rag_langchain.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG with Docling and 🦜🔗 LangChain" + ] + }, { "cell_type": "code", "execution_count": 1, diff --git a/examples/rag_llamaindex.ipynb b/examples/rag_llamaindex.ipynb index a8703cc2..6dd9e0f4 100644 --- a/examples/rag_llamaindex.ipynb +++ b/examples/rag_llamaindex.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Quackling — Basic Pipeline" + "# RAG with Docling and 🦙 LlamaIndex" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 28e7435d..a69ea26c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "1.13.0" # DO NOT EDIT, updated automatically +version = "1.14.0" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] license = "MIT"