Merge branch 'docling-project:main' into main

Commit 2b6fd251a7 by Ulan Yisaev, 2025-03-20 13:15:22 +02:00 (committed by GitHub)
13 changed files with 449 additions and 100 deletions

CHANGELOG.md

@@ -1,3 +1,21 @@
## [v2.28.0](https://github.com/docling-project/docling/releases/tag/v2.28.0) - 2025-03-19
### Feature
* **SmolDocling:** Support MLX acceleration in VLM pipeline ([#1199](https://github.com/docling-project/docling/issues/1199)) ([`1c26769`](https://github.com/docling-project/docling/commit/1c26769785bcd17c0b8b621c5182ad81134d3915))
* Add PPTX notes slides ([#474](https://github.com/docling-project/docling/issues/474)) ([`b454aa1`](https://github.com/docling-project/docling/commit/b454aa1551b891644ce4028ed2d7ec8f82c167ab))
* Updated vlm pipeline (with latest changes from docling-core) ([#1158](https://github.com/docling-project/docling/issues/1158)) ([`2f72167`](https://github.com/docling-project/docling/commit/2f72167ff6421424dea4d93018b0d43af16ec153))
### Fix
* Determine correct page size in DoclingParseV4Backend ([#1196](https://github.com/docling-project/docling/issues/1196)) ([`f5adfb9`](https://github.com/docling-project/docling/commit/f5adfb9724aae1207f23e21d74033f331e6e1ffb))
* **msword:** Fixing function return in equations handling ([#1194](https://github.com/docling-project/docling/issues/1194)) ([`0b707d0`](https://github.com/docling-project/docling/commit/0b707d0882f5be42505871799387d0b1882bffbf))
### Documentation
* Linux Foundation AI & Data ([#1183](https://github.com/docling-project/docling/issues/1183)) ([`1d680b0`](https://github.com/docling-project/docling/commit/1d680b0a321d95fc6bd65b7bb4d5e15005a0250a))
* Move apify to docs ([#1182](https://github.com/docling-project/docling/issues/1182)) ([`54a78c3`](https://github.com/docling-project/docling/commit/54a78c307de833b93f9b84cf1f8ed6dace8573cb))
## [v2.27.0](https://github.com/docling-project/docling/releases/tag/v2.27.0) - 2025-03-18
### Feature

README.md

@@ -35,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
* 💻 Simple and convenient CLI
### Coming soon
@@ -57,7 +57,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
## Getting started
To convert individual documents, use `convert()`, for example:
To convert individual documents with python, use `convert()`, for example:
```python
from docling.document_converter import DocumentConverter
@@ -71,6 +71,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
the docs.
## CLI
Docling has a built-in CLI to run conversions.
```bash
docling https://arxiv.org/pdf/2206.01062
```
You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
```bash
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
```
This will use MLX acceleration on supported Apple Silicon hardware.
Read more [here](https://docling-project.github.io/docling/usage/).
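MLX acceleration relies on the optional `mlx-vlm` package; without it, the CLI falls back to the default Transformers implementation. A minimal setup sketch, using the install command from this release's own warning message:
```bash
# Optional dependency for MLX-accelerated SmolDocling on Apple Silicon
pip install mlx-vlm
```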
## Documentation
Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on

docling/backend/mspowerpoint_backend.py

@@ -16,6 +16,7 @@ from docling_core.types.doc import (
    TableCell,
    TableData,
)
from docling_core.types.doc.document import ContentLayer
from PIL import Image, UnidentifiedImageError
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        for shape in slide.shapes:
            handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)

        # Handle notes slide
        if slide.has_notes_slide:
            notes_slide = slide.notes_slide
            notes_text = notes_slide.notes_text_frame.text.strip()
            if notes_text:
                bbox = BoundingBox(l=0, t=0, r=0, b=0)
                prov = ProvenanceItem(
                    page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
                )
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    parent=parent_slide,
                    text=notes_text,
                    prov=prov,
                    content_layer=ContentLayer.FURNITURE,
                )

        return doc
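The notes extraction above builds directly on python-pptx's notes-slide API. A standalone sketch of the same access pattern (the `sample.pptx` path is hypothetical):
```python
from pptx import Presentation

# Walk the slides and print any speaker notes, mirroring the backend's checks.
prs = Presentation("sample.pptx")  # hypothetical input file
for slide_ind, slide in enumerate(prs.slides):
    if slide.has_notes_slide:  # a notes slide may or may not exist
        notes_text = slide.notes_slide.notes_text_frame.text.strip()
        if notes_text:
            print(f"slide {slide_ind + 1}: {notes_text}")
```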

docling/cli/main.py

@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    EasyOcrOptions,
    OcrOptions,
    PaginatedPipelineOptions,
    PdfBackend,
    PdfPipeline,
    PdfPipelineOptions,
    TableFormerMode,
    VlmModelType,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
from docling.pipeline.vlm_pipeline import VlmPipeline

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
            help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
        ),
    ] = ImageRefMode.EMBEDDED,
    pipeline: Annotated[
        PdfPipeline,
        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
    ] = PdfPipeline.STANDARD,
    vlm_model: Annotated[
        VlmModelType,
        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
    ] = VlmModelType.SMOLDOCLING,
    ocr: Annotated[
        bool,
        typer.Option(
@@ -420,6 +436,9 @@ def convert(
        ocr_options.lang = ocr_lang_list

    accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
    pipeline_options: PaginatedPipelineOptions

    if pipeline == PdfPipeline.STANDARD:
        pipeline_options = PdfPipelineOptions(
            allow_external_plugins=allow_external_plugins,
            enable_remote_services=enable_remote_services,
@@ -445,9 +464,6 @@ def convert(
        )
        pipeline_options.images_scale = 2

        if artifacts_path is not None:
            pipeline_options.artifacts_path = artifacts_path

        backend: Type[PdfDocumentBackend]
        if pdf_backend == PdfBackend.DLPARSE_V1:
            backend = DoclingParseDocumentBackend
@@ -464,6 +480,33 @@ def convert(
            pipeline_options=pipeline_options,
            backend=backend,  # pdf_backend
        )
    elif pipeline == PdfPipeline.VLM:
        pipeline_options = VlmPipelineOptions()

        if vlm_model == VlmModelType.GRANITE_VISION:
            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
        elif vlm_model == VlmModelType.SMOLDOCLING:
            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
            if sys.platform == "darwin":
                try:
                    import mlx_vlm

                    pipeline_options.vlm_options = (
                        smoldocling_vlm_mlx_conversion_options
                    )
                except ImportError:
                    _log.warning(
                        "To run SmolDocling faster, please install mlx-vlm:\n"
                        "pip install mlx-vlm"
                    )

        pdf_format_option = PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )

    if artifacts_path is not None:
        pipeline_options.artifacts_path = artifacts_path

    format_options: Dict[InputFormat, FormatOption] = {
        InputFormat.PDF: pdf_format_option,
        InputFormat.IMAGE: pdf_format_option,
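Taken together, the new `--pipeline` and `--vlm-model` options map onto the `PdfPipeline` and `VlmModelType` enums from this diff. A usage sketch with their string values (`standard`/`vlm`, `smoldocling`/`granite_vision`):
```bash
# Default standard pipeline (unchanged behavior):
docling https://arxiv.org/pdf/2206.01062

# VLM pipeline with the default SmolDocling model:
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062

# VLM pipeline with Granite Vision instead:
docling --pipeline vlm --vlm-model granite_vision https://arxiv.org/pdf/2206.01062
```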

docling/datamodel/pipeline_options.py

@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
    MARKDOWN = "markdown"


class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"


class HuggingFaceVlmOptions(BaseVlmOptions):
    kind: Literal["hf_model_options"] = "hf_model_options"
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    response_format: ResponseFormat

    @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
        return self.repo_id.replace("/", "--")


smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
)

smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
)

granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
)


class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"


# Define an enum for the backend options
class PdfBackend(str, Enum):
    """Enum of valid PDF backends."""
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):

class PaginatedPipelineOptions(PipelineOptions):
    artifacts_path: Optional[Union[Path, str]] = None

    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False


class VlmPipelineOptions(PaginatedPipelineOptions):
    artifacts_path: Optional[Union[Path, str]] = None

    generate_page_images: bool = True
    force_backend_text: bool = (
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):

class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""

    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    do_code_enrichment: bool = False  # True: perform code OCR
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    )
    generate_parsed_pages: bool = False


class PdfPipeline(str, Enum):
    STANDARD = "standard"
    VLM = "vlm"
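The presets above can also be wired up programmatically, mirroring the example script later in this commit. A minimal sketch assembling a converter around the MLX preset:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Select the MLX-backed SmolDocling preset for the VLM pipeline.
pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
```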

docling/models/hf_mlx_model.py (new file)

@ -0,0 +1,137 @@
import logging
import time
from pathlib import Path
from typing import Iterable, List, Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
vlm_options: HuggingFaceVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
if self.enabled:
try:
from mlx_vlm import generate, load # type: ignore
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
from mlx_vlm.utils import load_config, stream_generate # type: ignore
except ImportError:
raise ImportError(
"mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
)
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
self.apply_chat_template = apply_chat_template
self.stream_generate = stream_generate
# PARAMETERS:
if artifacts_path is None:
artifacts_path = self.download_models(self.vlm_options.repo_id)
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
## Load the model
self.vlm_model, self.processor = load(artifacts_path)
self.config = load_config(artifacts_path)
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
# revision="v0.0.1",
)
return Path(download_path)
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=2.0) # 144dpi
# hi_res_image = page.get_image(scale=1.0) # 72dpi
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
# populate page_tags with predicted doc tags
page_tags = ""
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
prompt = self.apply_chat_template(
self.processor, self.config, self.param_question, num_images=1
)
start_time = time.time()
# Call model to generate:
output = ""
for token in self.stream_generate(
self.vlm_model,
self.processor,
prompt,
[hi_res_image],
max_tokens=4096,
verbose=False,
):
output += token.text
if "</doctag>" in token.text:
break
generation_time = time.time() - start_time
page_tags = output
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")
# print(f"Page Inference Time: {inference_time:.2f} seconds")
# print(f"Total tokens on page: {num_tokens:.2f}")
# print(f"Tokens/sec: {tokens_per_second:.2f}")
# print("")
page.predictions.vlm_response = VlmPrediction(text=page_tags)
yield page
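As an aside, the `download_models` helper above can also be invoked on its own to pre-fetch weights (standalone use is an assumption); a small sketch with the MLX repo id registered in pipeline_options:
```python
from docling.models.hf_mlx_model import HuggingFaceMlxModel

# Downloads (or reuses) the snapshot in the Hugging Face cache and returns its path.
weights_path = HuggingFaceMlxModel.download_models(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    progress=True,
)
print(weights_path)
```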

docling/pipeline/vlm_pipeline.py

@@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions
from docling.datamodel.pipeline_options import (
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.hf_mlx_model import HuggingFaceMlxModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -29,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
        super().__init__(pipeline_options)
        self.keep_backend = True

        warnings.warn(
            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
            category=UserWarning,
            stacklevel=2,
        )

        self.pipeline_options: VlmPipelineOptions

        artifacts_path: Optional[Path] = None
@@ -58,6 +57,19 @@ class VlmPipeline(PaginatedPipeline):
        self.keep_images = self.pipeline_options.generate_page_images

        if (
            self.pipeline_options.vlm_options.inference_framework
            == InferenceFramework.MLX
        ):
            self.build_pipe = [
                HuggingFaceMlxModel(
                    enabled=True,  # must be always enabled for this pipeline to make sense.
                    artifacts_path=artifacts_path,
                    accelerator_options=pipeline_options.accelerator_options,
                    vlm_options=self.pipeline_options.vlm_options,
                ),
            ]
        else:
            self.build_pipe = [
                HuggingFaceVlmModel(
                    enabled=True,  # must be always enabled for this pipeline to make sense.
@@ -79,7 +91,9 @@ class VlmPipeline(PaginatedPipeline):

        return page

    def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str:
    def extract_text_from_backend(
        self, page: Page, bbox: Union[BoundingBox, None]
    ) -> str:
        # Convert bounding box normalized to 0-100 into page coordinates for cropping
        text = ""
        if bbox:
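The MLX-vs-Transformers branch above keys off a single field on the options object; a quick sketch illustrating how the two SmolDocling presets route (assertions follow directly from the preset definitions in this commit):
```python
from docling.datamodel.pipeline_options import (
    InferenceFramework,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)

# The MLX preset selects HuggingFaceMlxModel; the default preset keeps Transformers.
assert (
    smoldocling_vlm_mlx_conversion_options.inference_framework
    == InferenceFramework.MLX
)
assert (
    smoldocling_vlm_conversion_options.inference_framework
    == InferenceFramework.TRANSFORMERS
)
```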

docs/examples/minimal_vlm_pipeline.py

@@ -10,13 +10,15 @@ from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

sources = [
    "tests/data/2305.03393v1-pg9-img.png",
    # "tests/data/2305.03393v1-pg9-img.png",
    "tests/data/pdf/2305.03393v1-pg9.pdf",
]

## Use experimental VlmPipeline
@@ -29,7 +31,10 @@ pipeline_options.force_backend_text = False
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True

## Pick a VLM model. We choose SmolDocling-256M by default
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
# pipeline_options.vlm_options = smoldocling_vlm_conversion_options

## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options

## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
@@ -63,9 +68,6 @@ for source in sources:
    res = converter.convert(source)

    print("------------------------------------------------")
    print("MD:")
    print("------------------------------------------------")
    print("")
    print(res.document.export_to_markdown())
@@ -83,8 +85,17 @@ for source in sources:
    with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))

    pg_num = res.document.num_pages()

    res.document.save_as_json(
        out_path / f"{res.input.file.stem}.json",
        image_mode=ImageRefMode.PLACEHOLDER,
    )
    res.document.save_as_markdown(
        out_path / f"{res.input.file.stem}.md",
        image_mode=ImageRefMode.PLACEHOLDER,
    )

    pg_num = res.document.num_pages()
    print("")
    inference_time = time.time() - start_time
    print(

docs/index.md

@@ -26,7 +26,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕🔥
* 💻 Simple and convenient CLI
### Coming soon

docs/usage/index.md

@@ -17,10 +17,15 @@ print(result.document.export_to_markdown()) # output: "### Docling Technical Re
You can also use Docling directly from your command line to convert individual files (local or by URL) or whole directories.
A simple example would look like this:
```console
docling https://arxiv.org/pdf/2206.01062
```
You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
```bash
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
```
This will use MLX acceleration on supported Apple Silicon hardware.
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](../reference/cli.md).

pyproject.toml

@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.27.0"  # DO NOT EDIT, updated automatically
version = "2.28.0"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
    "Christoph Auer <cau@zurich.ibm.com>",
@@ -192,6 +192,7 @@ module = [
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
    "mlx_vlm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",

tests/data/groundtruth/docling_v2/powerpoint_sample.json

@@ -4,7 +4,7 @@
"name": "powerpoint_sample",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
"binary_hash": 1640759611026400292,
"binary_hash": 15572290240354948364,
"filename": "powerpoint_sample.pptx"
},
"furniture": {
@@ -75,6 +75,9 @@
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
@@ -94,19 +97,22 @@
"$ref": "#/groups/4"
},
{
"$ref": "#/texts/15"
"$ref": "#/texts/16"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/18"
"$ref": "#/texts/19"
},
{
"$ref": "#/groups/6"
},
{
"$ref": "#/groups/7"
},
{
"$ref": "#/texts/26"
}
],
"content_layer": "body",
@@ -119,14 +125,14 @@
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
}
],
"content_layer": "body",
@@ -139,9 +145,6 @@
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
},
@@ -150,6 +153,9 @@
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
@@ -163,10 +169,10 @@
},
"children": [
{
"$ref": "#/texts/16"
"$ref": "#/texts/17"
},
{
"$ref": "#/texts/17"
"$ref": "#/texts/18"
}
],
"content_layer": "body",
@@ -179,14 +185,14 @@
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
@@ -199,14 +205,14 @@
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
},
{
"$ref": "#/texts/24"
},
{
"$ref": "#/texts/25"
}
],
"content_layer": "body",
@@ -433,6 +439,33 @@
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
31
]
}
],
"orig": "Some notes on the second slide.",
"text": "Some notes on the second slide."
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/3"
},
@@ -461,7 +494,7 @@
"marker": "1."
},
{
"self_ref": "#/texts/9",
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/3"
},
@@ -490,7 +523,7 @@
"marker": "2."
},
{
"self_ref": "#/texts/10",
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/3"
},
@@ -519,7 +552,7 @@
"marker": "3."
},
{
"self_ref": "#/texts/11",
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/4"
},
@@ -548,7 +581,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/12",
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/4"
},
@@ -577,7 +610,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/13",
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/4"
},
@@ -606,7 +639,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/14",
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/groups/4"
},
@@ -635,7 +668,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/15",
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/2"
},
@@ -662,7 +695,7 @@
"text": "Some info:"
},
{
"self_ref": "#/texts/16",
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/5"
},
@@ -691,7 +724,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/17",
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/5"
},
@@ -720,7 +753,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/18",
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
@@ -747,7 +780,7 @@
"text": "Maybe a list?"
},
{
"self_ref": "#/texts/19",
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/6"
},
@@ -776,7 +809,7 @@
"marker": "1."
},
{
"self_ref": "#/texts/20",
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/6"
},
@@ -805,7 +838,7 @@
"marker": "2."
},
{
"self_ref": "#/texts/21",
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/6"
},
@@ -834,7 +867,7 @@
"marker": "3."
},
{
"self_ref": "#/texts/22",
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/7"
},
@@ -863,7 +896,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/23",
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/7"
},
@@ -892,7 +925,7 @@
"marker": "-"
},
{
"self_ref": "#/texts/24",
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/groups/7"
},
@@ -919,6 +952,33 @@
"text": "l3",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 0.0,
"b": 0.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
53
]
}
],
"orig": "Final notes on the third slide.\nSecond line of notes.",
"text": "Final notes on the third slide.\nSecond line of notes."
}
],
"pictures": [],