From 440c16ff2004eb674242c37f9e2705212c5b0dfe Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:06:47 +0100
Subject: [PATCH 1/5] fix: Call into docling-core for legacy document transform
 (#551)

Call into docling-core for legacy document transform

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/datamodel/document.py | 255 +---------------------------------
 poetry.lock                   |  13 +-
 pyproject.toml                |   2 +-
 3 files changed, 12 insertions(+), 258 deletions(-)

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index e5b49343..f8dec5cb 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
     @property
     @deprecated("Use document instead.")
     def legacy_document(self):
-        reverse_label_mapping = {
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-
-                    # skip captions of they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-
-        return ds_doc
+        return docling_document_to_legacy(self.document)
 
 
 class _DummyBackend(AbstractDocumentBackend):
diff --git a/poetry.lock b/poetry.lock
index 67ff0fee..a09e995a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -888,13 +888,13 @@ files = [
 
 [[package]]
 name = "docling-core"
-version = "2.8.0"
+version = "2.9.0"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-2.8.0-py3-none-any.whl", hash = "sha256:392aad49e25f5fd1d279410118fbd91d9aaab9dd92d043738d20c10c57193d86"},
-    {file = "docling_core-2.8.0.tar.gz", hash = "sha256:6ac5cbc6f0abcbdf599c2a4b1a3f7b52fd8baebf3c4ebf94d7b7e2ee061a654e"},
+    {file = "docling_core-2.9.0-py3-none-any.whl", hash = "sha256:b44b077db5d2ac8a900f30a15abe329c165b1f2eb7f1c90d1275c423c1c3d668"},
+    {file = "docling_core-2.9.0.tar.gz", hash = "sha256:1bf12fe67ee4852330e9bac33fe62b45598ff885481e03a88fa8e1bf48252424"},
 ]
 
 [package.dependencies]
@@ -6061,6 +6061,11 @@ files = [
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
     {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
     {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -7597,4 +7602,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "621f8de238fd1f82cfd783531b6ab7c1598378a499c0dcfac323d66bc7ab32ea"
+content-hash = "3e66a54bd0433581e4909003124e2b79b42bdd1fb90d17c037f3294aeff56aa9"
diff --git a/pyproject.toml b/pyproject.toml
index b2593d77..649f07b2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ packages = [{include = "docling"}]
 # actual dependencies:
 ######################
 python = "^3.9"
-docling-core = { version = "^2.8.0", extras = ["chunking"] }
+docling-core = { version = "^2.9.0", extras = ["chunking"] }
 pydantic = "^2.0.0"
 docling-ibm-models = "^2.0.6"
 deepsearch-glm = "^1.0.0"

From ca83a1f0c9e9e95597da558d95c7eca3082c6810 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Mon, 9 Dec 2024 16:28:46 +0000
Subject: [PATCH 2/5] chore: bump version to 2.10.0 [skip ci]

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 CHANGELOG.md   | 11 +++++++++++
 pyproject.toml |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4e31b769..3561c135 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
+
+### Feature
+
+* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
+
+### Fix
+
+* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
+* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
+
 ## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09
 
 ### Feature
diff --git a/pyproject.toml b/pyproject.toml
index 649f07b2..1d545937 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.9.0"  # DO NOT EDIT, updated automatically
+version = "2.10.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"

From 1a3daf2ffb9615a4e42177f966eb2e7472f7273c Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Tue, 10 Dec 2024 13:12:44 +0100
Subject: [PATCH 3/5] fix: make enum serializable with human-readable value
 (#555)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/cli/main.py                   | 18 ++----------------
 docling/datamodel/base_models.py      | 24 ++++++++++++------------
 docling/datamodel/pipeline_options.py | 20 ++++++++++++++++++++
 3 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index b06354c8..260d8152 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrEngine,
     OcrMacOptions,
     OcrOptions,
+    PdfBackend,
     PdfPipelineOptions,
     RapidOcrOptions,
     TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
         raise typer.Exit()
 
 
-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-
-
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
-
-
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index b71c0f97..dd6291ab 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 
 
 class ConversionStatus(str, Enum):
-    PENDING = auto()
-    STARTED = auto()
-    FAILURE = auto()
-    SUCCESS = auto()
-    PARTIAL_SUCCESS = auto()
-    SKIPPED = auto()
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"
 
 
 class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
 
 
 class DocInputType(str, Enum):
-    PATH = auto()
-    STREAM = auto()
+    PATH = "path"
+    STREAM = "stream"
 
 
 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND = auto()
-    MODEL = auto()
-    DOC_ASSEMBLER = auto()
-    USER_INPUT = auto()
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"
 
 
 class ErrorItem(BaseModel):
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 9be3ee82..235b5b7f 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -126,6 +126,26 @@ class OcrMacOptions(OcrOptions):
     )
 
 
+# PDF backend identifiers; docling/cli/main.py imports this enum from this module.
+class PdfBackend(str, Enum):
+    """Valid PDF backends; each value is the backend's lowercase string name."""
+
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+
+
+# OCR engine identifiers; docling/cli/main.py imports this enum from this module.
+class OcrEngine(str, Enum):
+    """Valid OCR engines; each value is the engine's lowercase string name."""
+
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
+    RAPIDOCR = "rapidocr"
+
+
 class PipelineOptions(BaseModel):
     """Base pipeline options."""
 

From 6f986d26e149dfb6705faf4ba2f0bac1d33d940c Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 10 Dec 2024 16:03:02 +0100
Subject: [PATCH 4/5] docs: update chunking usage docs, minor reorg (#550)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docs/concepts/architecture.md                 |  2 +-
 docs/{ => reference}/cli.md                   |  4 +--
 .../docling_document.md                       |  0
 .../document_converter.md                     |  0
 .../pipeline_options.md                       |  0
 docs/usage.md                                 | 32 +++++++++++--------
 mkdocs.yml                                    | 19 ++++++-----
 7 files changed, 31 insertions(+), 26 deletions(-)
 rename docs/{ => reference}/cli.md (82%)
 rename docs/{api_reference => reference}/docling_document.md (100%)
 rename docs/{api_reference => reference}/document_converter.md (100%)
 rename docs/{api_reference => reference}/pipeline_options.md (100%)

diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md
index 07aa1b30..00e81db0 100644
--- a/docs/concepts/architecture.md
+++ b/docs/concepts/architecture.md
@@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific *
 
 The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
 
-Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*.
+Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).
 
 For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
 
diff --git a/docs/cli.md b/docs/reference/cli.md
similarity index 82%
rename from docs/cli.md
rename to docs/reference/cli.md
index 3f67df0d..25612267 100644
--- a/docs/cli.md
+++ b/docs/reference/cli.md
@@ -1,4 +1,4 @@
-# CLI Reference
+# CLI reference
 
 This page provides documentation for our command line tools.
 
@@ -6,4 +6,4 @@ This page provides documentation for our command line tools.
     :module: docling.cli.main
     :command: click_app
     :prog_name: docling
-    :style: table
\ No newline at end of file
+    :style: table
diff --git a/docs/api_reference/docling_document.md b/docs/reference/docling_document.md
similarity index 100%
rename from docs/api_reference/docling_document.md
rename to docs/reference/docling_document.md
diff --git a/docs/api_reference/document_converter.md b/docs/reference/document_converter.md
similarity index 100%
rename from docs/api_reference/document_converter.md
rename to docs/reference/document_converter.md
diff --git a/docs/api_reference/pipeline_options.md b/docs/reference/pipeline_options.md
similarity index 100%
rename from docs/api_reference/pipeline_options.md
rename to docs/reference/pipeline_options.md
diff --git a/docs/usage.md b/docs/usage.md
index e7a214a0..9a5b555a 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -22,9 +22,7 @@ A simple example would look like this:
 docling https://arxiv.org/pdf/2206.01062
 ```
 
-To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md).
-
-
+To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
 
 ### Advanced options
 
@@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl
 
 ## Chunking
 
-You can perform a hierarchy-aware chunking of a Docling document as follows:
+You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
+`HybridChunker`, as shown below (for more details check out
+[this example](examples/hybrid_chunking.ipynb)):
 
 ```python
 from docling.document_converter import DocumentConverter
-from docling_core.transforms.chunker import HierarchicalChunker
+from docling.chunking import HybridChunker
 
 conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
 doc = conv_res.document
-chunks = list(HierarchicalChunker().chunk(doc))
 
-print(chunks[30])
+chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")  # set tokenizer as needed
+chunk_iter = chunker.chunk(doc)
+```
+
+An example chunk would look like this:
+
+```python
+print(list(chunk_iter)[11])
 # {
-#   "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
+#   "text": "In this paper, we present the DocLayNet dataset. [...]",
 #   "meta": {
 #     "doc_items": [{
-#       "self_ref": "#/texts/40",
+#       "self_ref": "#/texts/28",
 #       "label": "text",
 #       "prov": [{
 #         "page_no": 2,
-#         "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
-#       }]
-#     }],
-#     "headings": ["2 RELATED WORK"],
+#         "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...},
+#       }], ...,
+#     }, ...],
+#     "headings": ["1 INTRODUCTION"],
 #   }
 # }
 ```
diff --git a/mkdocs.yml b/mkdocs.yml
index 81abcc6a..6973824d 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -56,7 +56,6 @@ nav:
     - "Docling": index.md
     - Installation: installation.md
     - Usage: usage.md
-    - CLI: cli.md
     - FAQ: faq.md
     - Docling v2: v2.md
   - Concepts:
@@ -76,15 +75,12 @@ nav:
       - "Table export": examples/export_tables.py
       - "Multimodal export": examples/export_multimodal.py
       - "Force full page OCR": examples/full_page_ocr.py
+    - Chunking:
+      - "Hybrid chunking": examples/hybrid_chunking.ipynb
     - RAG / QA:
       - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
       - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
       - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
-    - Chunking:
-      - "Hybrid chunking": examples/hybrid_chunking.ipynb
-    #   - Chunking: examples/chunking.md
-    # - CLI:
-    #   - CLI: examples/cli.md
   - Integrations:
     - Integrations: integrations/index.md
     - "🐝 Bee": integrations/bee.md
@@ -99,10 +95,13 @@ nav:
     - "spaCy": integrations/spacy.md
     - "txtai": integrations/txtai.md
     # - "LangChain 🦜🔗": integrations/langchain.md
-  - API reference:
-    - Document Converter: api_reference/document_converter.md
-    - Pipeline options: api_reference/pipeline_options.md
-    - Docling Document: api_reference/docling_document.md
+  - Reference:
+    - Python API:
+      - Document Converter: reference/document_converter.md
+      - Pipeline options: reference/pipeline_options.md
+      - Docling Document: reference/docling_document.md
+    - CLI:
+      - CLI reference: reference/cli.md
 
 markdown_extensions:
   - pymdownx.superfences

From 861e6fa90cca4bb21d08721078c606193626a149 Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Tue, 10 Dec 2024 16:25:05 +0100
Subject: [PATCH 5/5] fix: Handle no result from RapidOcr reader (#558)

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/models/rapid_ocr_model.py | 35 ++++++++++++++++---------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py
index 7fd5a3d4..b40dbf6a 100644
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
                         del high_res_image
                         del im
 
-                        cells = [
-                            OcrCell(
-                                id=ix,
-                                text=line[1],
-                                confidence=line[2],
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (line[0][0][0] / self.scale) + ocr_rect.l,
-                                        (line[0][0][1] / self.scale) + ocr_rect.t,
-                                        (line[0][2][0] / self.scale) + ocr_rect.l,
-                                        (line[0][2][1] / self.scale) + ocr_rect.t,
+                        if result is not None:
+                            cells = [
+                                OcrCell(
+                                    id=ix,
+                                    text=line[1],
+                                    confidence=line[2],
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
                                     ),
-                                    origin=CoordOrigin.TOPLEFT,
-                                ),
-                            )
-                            for ix, line in enumerate(result)
-                        ]
-                        all_ocr_cells.extend(cells)
+                                )
+                                for ix, line in enumerate(result)
+                            ]
+                            all_ocr_cells.extend(cells)
 
                     # Post-process the cells
                     page.cells = self.post_process_cells(all_ocr_cells, page.cells)