Mirror of https://github.com/DS4SD/docling.git, synced 2025-08-02 07:22:14 +00:00

Rebase from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

Commit 4aecf689aa
CHANGELOG.md (11 changes)

@@ -1,3 +1,14 @@
+## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
+
+### Feature
+
+* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
+
+### Fix
+
+* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
+* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
+
 ## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09
 
 ### Feature
@@ -29,8 +29,10 @@ from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
+    OcrEngine,
     OcrMacOptions,
     OcrOptions,
+    PdfBackend,
     PdfPipelineOptions,
     RapidOcrOptions,
     TableFormerMode,
@@ -70,22 +72,6 @@ def version_callback(value: bool):
         raise typer.Exit()
 
 
-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-
-
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
-
-
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 
 
 class ConversionStatus(str, Enum):
-    PENDING = auto()
-    STARTED = auto()
-    FAILURE = auto()
-    SUCCESS = auto()
-    PARTIAL_SUCCESS = auto()
-    SKIPPED = auto()
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"
 
 
 class InputFormat(str, Enum):
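For context on why the hunk above swaps `auto()` for explicit strings: in a `(str, Enum)` mixin, `auto()` feeds the integer counter through `str()`, so the serialized values become "1", "2", and so on rather than readable names. A minimal sketch of the difference (illustrative, not part of the commit):

```python
from enum import Enum, auto


class StatusAuto(str, Enum):
    PENDING = auto()


class StatusExplicit(str, Enum):
    PENDING = "pending"


# auto() with a str mixin yields the stringified counter, not the member name:
assert StatusAuto.PENDING.value == "1"
# explicit strings give stable, self-describing wire values, as in the hunk above:
assert StatusExplicit.PENDING.value == "pending"
```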
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
 
 
 class DocInputType(str, Enum):
-    PATH = auto()
-    STREAM = auto()
+    PATH = "path"
+    STREAM = "stream"
 
 
 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND = auto()
-    MODEL = auto()
-    DOC_ASSEMBLER = auto()
-    USER_INPUT = auto()
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"
 
 
 class ErrorItem(BaseModel):
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -191,259 +192,7 @@ class ConversionResult(BaseModel):
     @property
     @deprecated("Use document instead.")
     def legacy_document(self):
-        reverse_label_mapping = {
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-
-                    # skip captions of they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-
-        return ds_doc
+        return docling_document_to_legacy(self.document)
 
 
 class _DummyBackend(AbstractDocumentBackend):
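The net effect of the hunk above: the deprecated `legacy_document` property now delegates the whole transform to docling-core. A sketch of the two equivalent call paths (the converter call is illustrative; both names are taken from the diff):

```python
from docling.document_converter import DocumentConverter
from docling_core.utils.legacy import docling_document_to_legacy

result = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")

legacy_doc = result.legacy_document  # deprecated accessor, kept for compatibility
legacy_doc = docling_document_to_legacy(result.document)  # direct call into docling-core
```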
@@ -190,6 +190,26 @@ class OcrMacOptions(OcrOptions):
     )
 
 
+# Define an enum for the backend options
+class PdfBackend(str, Enum):
+    """Enum of valid PDF backends."""
+
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+
+
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """Enum of valid OCR engines."""
+
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
+    RAPIDOCR = "rapidocr"
+
+
 class PipelineOptions(BaseModel):
     """Base pipeline options."""
 
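Because both enums mix in `str`, members compare equal to their raw values, which is what lets the CLI map an argument such as `--pdf-backend dlparse_v2` straight onto a member (the flag spelling is an assumption; the enum behavior is standard Python):

```python
from docling.datamodel.pipeline_options import OcrEngine, PdfBackend

# a raw CLI string resolves to the corresponding member...
assert PdfBackend("dlparse_v2") is PdfBackend.DLPARSE_V2
# ...and members compare equal to plain strings
assert OcrEngine.EASYOCR == "easyocr"
```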
@@ -97,24 +97,25 @@ class RapidOcrModel(BaseOcrModel):
                 del high_res_image
                 del im
 
-                cells = [
-                    OcrCell(
-                        id=ix,
-                        text=line[1],
-                        confidence=line[2],
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (line[0][0][0] / self.scale) + ocr_rect.l,
-                                (line[0][0][1] / self.scale) + ocr_rect.t,
-                                (line[0][2][0] / self.scale) + ocr_rect.l,
-                                (line[0][2][1] / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
-                    )
-                    for ix, line in enumerate(result)
-                ]
-                all_ocr_cells.extend(cells)
+                if result is not None:
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1],
+                            confidence=line[2],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)
 
         # Post-process the cells
         page.cells = self.post_process_cells(all_ocr_cells, page.cells)
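Background for the guard added above (an assumption based on the RapidOCR API as I understand it): the engine returns `None` rather than an empty list when it recognizes no text, so iterating the result unguarded raises `TypeError` on blank regions. A minimal sketch of the pattern:

```python
from rapidocr_onnxruntime import RapidOCR  # assumed import path for the engine docling wraps

engine = RapidOCR()
result, elapse = engine("blank_region.png")  # result may be None when nothing is found
cells = [] if result is None else [line[1] for line in result]  # mirror the guard above
```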
@@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific *
 
 The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
 
-Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*.
+Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).
 
 For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
 
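As a hedged illustration of the *export methods* referenced above (method names per the docling-core `DoclingDocument` API, to the best of my knowledge):

```python
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("https://arxiv.org/pdf/2408.09869").document
markdown = doc.export_to_markdown()  # markdown export
as_dict = doc.export_to_dict()       # dictionary export
```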
@@ -1,4 +1,4 @@
-# CLI Reference
+# CLI reference
 
 This page provides documentation for our command line tools.
 
@@ -6,4 +6,4 @@ This page provides documentation for our command line tools.
     :module: docling.cli.main
     :command: click_app
     :prog_name: docling
-    :style: table
+    :style: table
@@ -22,9 +22,7 @@ A simple example would look like this:
 docling https://arxiv.org/pdf/2206.01062
 ```
 
-To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md).
-
-
+To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
 
 ### Advanced options
 
@@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl
 
 ## Chunking
 
-You can perform a hierarchy-aware chunking of a Docling document as follows:
+You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
+`HybridChunker`, as shown below (for more details check out
+[this example](examples/hybrid_chunking.ipynb)):
 
 ```python
 from docling.document_converter import DocumentConverter
-from docling_core.transforms.chunker import HierarchicalChunker
+from docling.chunking import HybridChunker
 
 conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
 doc = conv_res.document
-chunks = list(HierarchicalChunker().chunk(doc))
 
-print(chunks[30])
+chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")  # set tokenizer as needed
+chunk_iter = chunker.chunk(doc)
+```
+
+An example chunk would look like this:
+
+```python
+print(list(chunk_iter)[11])
 # {
-#   "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
+#   "text": "In this paper, we present the DocLayNet dataset. [...]",
 #   "meta": {
 #     "doc_items": [{
-#       "self_ref": "#/texts/40",
+#       "self_ref": "#/texts/28",
 #       "label": "text",
 #       "prov": [{
 #         "page_no": 2,
-#         "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
-#       }]
-#     }],
-#     "headings": ["2 RELATED WORK"],
+#         "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...},
+#       }], ...,
+#     }, ...],
+#     "headings": ["1 INTRODUCTION"],
 #   }
 # }
 ```
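A usage note on the updated snippet (illustrative; field names follow the dump shown in the diff): the chunker yields chunk objects exposing `text` and `meta`, so the iterator can be consumed directly:

```python
for i, chunk in enumerate(chunk_iter):  # chunk_iter from the snippet above
    print(f"chunk {i} under {chunk.meta.headings}: {chunk.text[:60]}...")
```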
mkdocs.yml (19 changes)

@@ -56,7 +56,6 @@ nav:
   - "Docling": index.md
   - Installation: installation.md
   - Usage: usage.md
-  - CLI: cli.md
   - FAQ: faq.md
   - Docling v2: v2.md
   - Concepts:
@@ -77,15 +76,12 @@ nav:
     - "Multimodal export": examples/export_multimodal.py
     - "Force full page OCR": examples/full_page_ocr.py
     - "Accelerator options": examples/run_with_acclerators.py
+  - Chunking:
+    - "Hybrid chunking": examples/hybrid_chunking.ipynb
   - RAG / QA:
     - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
     - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
     - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
-  - Chunking:
-    - "Hybrid chunking": examples/hybrid_chunking.ipynb
-  # - Chunking: examples/chunking.md
-  # - CLI:
-  #   - CLI: examples/cli.md
   - Integrations:
     - Integrations: integrations/index.md
     - "🐝 Bee": integrations/bee.md
@@ -100,10 +96,13 @@ nav:
     - "spaCy": integrations/spacy.md
     - "txtai": integrations/txtai.md
   # - "LangChain 🦜🔗": integrations/langchain.md
-  - API reference:
-    - Document Converter: api_reference/document_converter.md
-    - Pipeline options: api_reference/pipeline_options.md
-    - Docling Document: api_reference/docling_document.md
+  - Reference:
+    - Python API:
+      - Document Converter: reference/document_converter.md
+      - Pipeline options: reference/pipeline_options.md
+      - Docling Document: reference/docling_document.md
+    - CLI:
+      - CLI reference: reference/cli.md
 
 markdown_extensions:
   - pymdownx.superfences
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.9.0" # DO NOT EDIT, updated automatically
+version = "2.10.0" # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"