Merge remote-tracking branch 'origin/main' into feat-figure-in-markdown

2025-07-26 20:14:47 +00:00 · 2024-09-24 15:41:22 +02:00 · 2024-09-24 15:41:22 +02:00 · b1a3a7a56c
commit b1a3a7a56c
parent 1571e1e17d 001d214a13
6 changed files with 120 additions and 83 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,23 @@
+## [v1.14.0](https://github.com/DS4SD/docling/releases/tag/v1.14.0) - 2024-09-24
+
+### Feature
+
+* Add URL support to CLI ([#99](https://github.com/DS4SD/docling/issues/99)) ([`3c46e42`](https://github.com/DS4SD/docling/commit/3c46e4266cf1ad8d3a99aa33636d84d34222b4fe))
+
+### Fix
+
+* Fix OCR setting for pypdfium, minor refactor ([#102](https://github.com/DS4SD/docling/issues/102)) ([`d96b96c`](https://github.com/DS4SD/docling/commit/d96b96c8481a8ae68545a34aaf9b8d5a6637a6be))
+
+### Documentation
+
+* Document CLI, minor README revamp ([#100](https://github.com/DS4SD/docling/issues/100)) ([`f8f2303`](https://github.com/DS4SD/docling/commit/f8f2303348c4bbcb7903ff172746a69607e53271))
+
+## [v1.13.1](https://github.com/DS4SD/docling/releases/tag/v1.13.1) - 2024-09-23
+
+### Fix
+
+* Updated the render_as_doctags with the new arguments from docling-core ([#93](https://github.com/DS4SD/docling/issues/93)) ([`4794ce4`](https://github.com/DS4SD/docling/commit/4794ce460a542a730fd5a72a7be7f94a07ed5d12))
+
 ## [v1.13.0](https://github.com/DS4SD/docling/releases/tag/v1.13.0) - 2024-09-18

 ### Feature
--- a/README.md
+++ b/README.md
@@ -22,8 +22,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
 * 📑 Understands detailed page layout, reading order and recovers table structures
 * 📝 Extracts metadata from the document, such as title, authors, references and language
-* 🔍 Optionally applies OCR (use with scanned PDFs)
+* 🔍 Includes OCR support for scanned PDFs
 * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
+* 💻 Provides a simple and convenient CLI

 ## Installation

@@ -35,8 +36,8 @@ pip install docling
 > [!NOTE]
 > Works on macOS and Linux environments. Windows platforms are currently not tested.

-
-### Use alternative PyTorch distributions
+<details>
+  <summary><b>Alternative PyTorch distributions</b></summary>

  The Docling models depend on the [PyTorch](https://pytorch.org/) library.
  Depending on your architecture, you might want to use a different distribution of `torch`.
@@ -50,16 +51,18 @@ In this case, we suggest the installation of Docling with the following options
  # Example for installing on the Linux cpu-only version
  pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
  ```
+</details>

+<details>
+  <summary><b>Docling development setup</b></summary>

-### Development setup
-
-To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+  To develop for Docling (features, bug fixes, etc.), install as follows from your local clone's root dir:
  ```bash
  poetry install --all-extras
  ```
+</details>

-## Usage
+## Getting started

 ### Convert a single document

@@ -70,7 +73,6 @@ from docling.document_converter import DocumentConverter
 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
 result = converter.convert_single(source)
-
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
 print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
 ```
@@ -86,6 +88,51 @@ python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.

+### CLI
+
+You can also use Docling directly from your command line to convert individual files — whether local or given by URL — or whole directories.
+
+A simple example would look like this:
+```console
+docling https://arxiv.org/pdf/2206.01062
+```
+
+To see all available options (export formats, etc.), run `docling --help`.
+
+<details>
+  <summary><b>CLI reference</b></summary>
+
+  Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
+
+  ```console
+  $ docling --help
+
+  Usage: docling [OPTIONS] source
+
+  ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+  │ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
+  ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+  │ --json       --no-json                            If enabled the document is exported as JSON. [default: no-json]            │
+  │ --md         --no-md                              If enabled the document is exported as Markdown. [default: md]             │
+  │ --txt        --no-txt                             If enabled the document is exported as Text. [default: no-txt]             │
+  │ --doctags    --no-doctags                         If enabled the document is exported as Doc Tags. [default: no-doctags]     │
+  │ --ocr        --no-ocr                             If enabled, the bitmap content will be processed using OCR. [default: ocr] │
+  │ --backend                    [pypdfium2|docling]  The PDF backend to use. [default: docling]                                 │
+  │ --output                     PATH                 Output directory where results are saved. [default: .]                     │
+  │ --version                                         Show version information.                                                  │
+  │ --help                                            Show this message and exit.                                                │
+  ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+  ```
+</details>
+
+### RAG
+Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
+- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+
+## Advanced features
+
 ### Adjust pipeline features

 The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -144,11 +191,6 @@ results = doc_converter.convert(conv_input)

 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. By default, Docling uses 4 CPU threads.

-### RAG
-Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
-
 ## Technical report

 For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -8,7 +8,7 @@ from pathlib import Path
 from typing import Annotated, Iterable, List, Optional

 import typer
-from pydantic import AnyUrl
+from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -109,11 +109,11 @@ def export_documents(
@app.command(no_args_is_help=True)
 def convert(
    input_sources: Annotated[
-        List[Path],
+        List[str],
        typer.Argument(
            ...,
            metavar="source",
-            help="PDF files to convert. Directories are also accepted.",
+            help="PDF files to convert. Can be local file / directory paths or URL.",
        ),
    ],
    export_json: Annotated[
@@ -167,7 +167,8 @@ def convert(
    logging.basicConfig(level=logging.INFO)

    input_doc_paths: List[Path] = []
-    for source in input_sources:
+    for src in input_sources:
+        source = resolve_file_source(source=src)
        if not source.exists():
            err_console.print(
                f"[red]Error: The input file {source} does not exist.[/red]"
@@ -179,59 +180,26 @@ def convert(
        else:
            input_doc_paths.append(source)

-    ###########################################################################
-
-    # The following sections contain a combination of PipelineOptions
-    # and PDF Backends for various configurations.
-    # Uncomment one section at the time to see the differences in the output.
-
-    doc_converter = None
-    if backend == Backend.PYPDFIUM2 and not ocr:  # PyPdfium without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = False
+    match backend:
+        case Backend.PYPDFIUM2:
+            do_cell_matching = ocr  # only do cell matching when OCR enabled
+            pdf_backend = PyPdfiumDocumentBackend
+        case Backend.DOCLING:
+            do_cell_matching = True
+            pdf_backend = DoclingParseDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected backend type {backend}")

+    pipeline_options = PipelineOptions(
+        do_ocr=ocr,
+        do_table_structure=True,
+    )
+    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
+        pdf_backend=pdf_backend,
    )

-    elif backend == Backend.PYPDFIUM2.value and ocr:  # PyPdfium with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-
-    elif backend == Backend.DOCLING.value and not ocr:  # Docling Parse without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-
-    elif backend == Backend.DOCLING.value and ocr:  # Docling Parse with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = True
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-
-    ###########################################################################
-
    # Define input files
    input = DocumentConversionInput.from_paths(input_doc_paths)

--- a/examples/rag_langchain.ipynb
+++ b/examples/rag_langchain.ipynb
@@ -1,5 +1,12 @@
 {
 "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAG with Docling and 🦜🔗 LangChain"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 1,
--- a/examples/rag_llamaindex.ipynb
+++ b/examples/rag_llamaindex.ipynb
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Quackling — Basic Pipeline"
+    "# RAG with Docling and 🦙 LlamaIndex"
   ]
  },
  {
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.13.0"  # DO NOT EDIT, updated automatically
+version = "1.14.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"