diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f8772f2..d3f59878 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,23 @@
+## [v1.14.0](https://github.com/DS4SD/docling/releases/tag/v1.14.0) - 2024-09-24
+
+### Feature
+
+* Add URL support to CLI ([#99](https://github.com/DS4SD/docling/issues/99)) ([`3c46e42`](https://github.com/DS4SD/docling/commit/3c46e4266cf1ad8d3a99aa33636d84d34222b4fe))
+
+### Fix
+
+* Fix OCR setting for pypdfium, minor refactor ([#102](https://github.com/DS4SD/docling/issues/102)) ([`d96b96c`](https://github.com/DS4SD/docling/commit/d96b96c8481a8ae68545a34aaf9b8d5a6637a6be))
+
+### Documentation
+
+* Document CLI, minor README revamp ([#100](https://github.com/DS4SD/docling/issues/100)) ([`f8f2303`](https://github.com/DS4SD/docling/commit/f8f2303348c4bbcb7903ff172746a69607e53271))
+
+## [v1.13.1](https://github.com/DS4SD/docling/releases/tag/v1.13.1) - 2024-09-23
+
+### Fix
+
+* Updated the render_as_doctags with the new arguments from docling-core ([#93](https://github.com/DS4SD/docling/issues/93)) ([`4794ce4`](https://github.com/DS4SD/docling/commit/4794ce460a542a730fd5a72a7be7f94a07ed5d12))
+
## [v1.13.0](https://github.com/DS4SD/docling/releases/tag/v1.13.0) - 2024-09-18
### Feature
diff --git a/README.md b/README.md
index 2631d3ca..2fd199cd 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
-* 🔍 Optionally applies OCR (use with scanned PDFs)
+* 🔍 Includes OCR support for scanned PDFs
* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
+* 💻 Provides a simple and convenient CLI
## Installation
@@ -35,31 +36,33 @@ pip install docling
> [!NOTE]
> Works on macOS and Linux environments. Windows platforms are currently not tested.
+
+ Alternative PyTorch distributions
-### Use alternative PyTorch distributions
+ The Docling models depend on the [PyTorch](https://pytorch.org/) library.
+ Depending on your architecture, you might want to use a different distribution of `torch`.
+ For example, you might want support for a different accelerator or for a cpu-only version.
+ All the different ways for installing `torch` are listed on their [website](https://pytorch.org/).
-The Docling models depend on the [PyTorch](https://pytorch.org/) library.
-Depending on your architecture, you might want to use a different distribution of `torch`.
-For example, you might want support for different accelerator or for a cpu-only version.
-All the different ways for installing `torch` are listed on their website .
+ One common situation is the installation on Linux systems with cpu-only support.
+ In this case, we suggest the installation of Docling with the following options:
-One common situation is the installation on Linux systems with cpu-only support.
-In this case, we suggest the installation of Docling with the following options
+ ```bash
+ # Example for installing on the Linux cpu-only version
+ pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
+ ```
+
-```bash
-# Example for installing on the Linux cpu-only version
-pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
-```
+
+ Docling development setup
+ To develop for Docling (features, bug fixes, etc.), install as follows from your local clone's root dir:
+ ```bash
+ poetry install --all-extras
+ ```
+
-### Development setup
-
-To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
-```bash
-poetry install --all-extras
-```
-
-## Usage
+## Getting started
### Convert a single document
@@ -70,7 +73,6 @@ from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
result = converter.convert_single(source)
-
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
print(result.render_as_doctags()) # output: "..."
```
@@ -86,6 +88,51 @@ python examples/batch_convert.py
```
The output of the above command will be written to `./scratch`.
+### CLI
+
+You can also use Docling directly from your command line to convert individual files — whether local or given by URL — or whole directories.
+
+A simple example would look like this:
+```console
+docling https://arxiv.org/pdf/2206.01062
+```
+
+To see all available options (export formats etc.) run `docling --help`.
+
+
+ CLI reference
+
+ Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
+
+ ```console
+ $ docling --help
+
+ Usage: docling [OPTIONS] source
+
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+ │ --json --no-json If enabled the document is exported as JSON. [default: no-json] │
+ │ --md --no-md If enabled the document is exported as Markdown. [default: md] │
+ │ --txt --no-txt If enabled the document is exported as Text. [default: no-txt] │
+ │ --doctags --no-doctags If enabled the document is exported as Doc Tags. [default: no-doctags] │
+ │ --ocr --no-ocr If enabled, the bitmap content will be processed using OCR. [default: ocr] │
+ │ --backend [pypdfium2|docling] The PDF backend to use. [default: docling] │
+ │ --output PATH Output directory where results are saved. [default: .] │
+ │ --version Show version information. │
+ │ --help Show this message and exit. │
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+ ```
+
+
+### RAG
+Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
+- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+
+## Advanced features
+
### Adjust pipeline features
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -144,11 +191,6 @@ results = doc_converter.convert(conv_input)
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
-### RAG
-Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
-
## Technical report
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
diff --git a/docling/cli/main.py b/docling/cli/main.py
index c8a8f3ac..894e9ab1 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -8,7 +8,7 @@ from pathlib import Path
from typing import Annotated, Iterable, List, Optional
import typer
-from pydantic import AnyUrl
+from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -109,11 +109,11 @@ def export_documents(
@app.command(no_args_is_help=True)
def convert(
input_sources: Annotated[
- List[Path],
+ List[str],
typer.Argument(
...,
metavar="source",
- help="PDF files to convert. Directories are also accepted.",
+ help="PDF files to convert. Can be local file / directory paths or URL.",
),
],
export_json: Annotated[
@@ -167,7 +167,8 @@ def convert(
logging.basicConfig(level=logging.INFO)
input_doc_paths: List[Path] = []
- for source in input_sources:
+ for src in input_sources:
+ source = resolve_file_source(source=src)
if not source.exists():
err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]"
@@ -179,58 +180,25 @@ def convert(
else:
input_doc_paths.append(source)
- ###########################################################################
+ match backend:
+ case Backend.PYPDFIUM2:
+ do_cell_matching = ocr # only do cell matching when OCR enabled
+ pdf_backend = PyPdfiumDocumentBackend
+ case Backend.DOCLING:
+ do_cell_matching = True
+ pdf_backend = DoclingParseDocumentBackend
+ case _:
+ raise RuntimeError(f"Unexpected backend type {backend}")
- # The following sections contain a combination of PipelineOptions
- # and PDF Backends for various configurations.
- # Uncomment one section at the time to see the differences in the output.
-
- doc_converter = None
- if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
- pipeline_options = PipelineOptions()
- pipeline_options.do_ocr = False
- pipeline_options.do_table_structure = True
- pipeline_options.table_structure_options.do_cell_matching = False
-
- doc_converter = DocumentConverter(
- pipeline_options=pipeline_options,
- pdf_backend=PyPdfiumDocumentBackend,
- )
-
- elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
- pipeline_options = PipelineOptions()
- pipeline_options.do_ocr = False
- pipeline_options.do_table_structure = True
- pipeline_options.table_structure_options.do_cell_matching = True
-
- doc_converter = DocumentConverter(
- pipeline_options=pipeline_options,
- pdf_backend=PyPdfiumDocumentBackend,
- )
-
- elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
- pipeline_options = PipelineOptions()
- pipeline_options.do_ocr = False
- pipeline_options.do_table_structure = True
- pipeline_options.table_structure_options.do_cell_matching = True
-
- doc_converter = DocumentConverter(
- pipeline_options=pipeline_options,
- pdf_backend=DoclingParseDocumentBackend,
- )
-
- elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
- pipeline_options = PipelineOptions()
- pipeline_options.do_ocr = True
- pipeline_options.do_table_structure = True
- pipeline_options.table_structure_options.do_cell_matching = True
-
- doc_converter = DocumentConverter(
- pipeline_options=pipeline_options,
- pdf_backend=DoclingParseDocumentBackend,
- )
-
- ###########################################################################
+ pipeline_options = PipelineOptions(
+ do_ocr=ocr,
+ do_table_structure=True,
+ )
+ pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+ doc_converter = DocumentConverter(
+ pipeline_options=pipeline_options,
+ pdf_backend=pdf_backend,
+ )
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
diff --git a/examples/rag_langchain.ipynb b/examples/rag_langchain.ipynb
index fd26908a..30e38329 100644
--- a/examples/rag_langchain.ipynb
+++ b/examples/rag_langchain.ipynb
@@ -1,5 +1,12 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RAG with Docling and 🦜🔗 LangChain"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
diff --git a/examples/rag_llamaindex.ipynb b/examples/rag_llamaindex.ipynb
index a8703cc2..6dd9e0f4 100644
--- a/examples/rag_llamaindex.ipynb
+++ b/examples/rag_llamaindex.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Quackling — Basic Pipeline"
+ "# RAG with Docling and 🦙 LlamaIndex"
]
},
{
diff --git a/pyproject.toml b/pyproject.toml
index 28e7435d..a69ea26c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
-version = "1.13.0" # DO NOT EDIT, updated automatically
+version = "1.14.0" # DO NOT EDIT, updated automatically
description = "Docling PDF conversion package"
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "]
license = "MIT"