Merge branch 'DS4SD:main' into main

Václav Vančura 2025-03-13 11:11:43 +01:00 committed by GitHub
commit 53837fe30e
333 changed files with 43379 additions and 37533 deletions

View File

@ -8,7 +8,7 @@ runs:
using: 'composite'
steps:
- name: Install poetry
run: pipx install poetry==1.8.3
run: pipx install poetry==1.8.5
shell: bash
- uses: actions/setup-python@v5
with:

View File

@ -1,19 +1,28 @@
on:
workflow_call:
env:
HF_HUB_DOWNLOAD_TIMEOUT: "60"
HF_HUB_ETAG_TIMEOUT: "60"
jobs:
run-checks:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.9', '3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
steps:
- uses: actions/checkout@v4
- name: Install tesseract
run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
- name: Set TESSDATA_PREFIX
run: |
echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
- name: Cache Hugging Face models
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: huggingface-cache-py${{ matrix.python-version }}
- uses: ./.github/actions/setup-poetry
with:
python-version: ${{ matrix.python-version }}
@ -28,7 +37,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip the examples listed below
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
echo "Skipping $file"
continue
fi

View File

@ -17,4 +17,3 @@ jobs:
- name: Build and push docs
if: inputs.deploy
run: poetry run mkdocs gh-deploy --force

View File

@ -16,8 +16,7 @@ ENV TORCH_HOME=/tmp/
COPY docs/examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
RUN docling-tools models download
# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
@ -25,3 +24,6 @@ ENV OMP_NUM_THREADS=4
# On container shell:
# > cd /root/
# > python minimal.py
# Running as `docker run -e DOCLING_ARTIFACTS_PATH=/root/.cache/docling/models` will use the
# model weights included in the container image.
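
For reference, a minimal Python sketch of how a script inside the container could use the pre-fetched weights; it assumes the artifacts_path pipeline option points at the same cache directory, and the input path is illustrative:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Use the weights baked into the image instead of downloading them at runtime.
pipeline_options = PdfPipelineOptions(artifacts_path="/root/.cache/docling/models")
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("/root/input.pdf")  # hypothetical input path
print(result.document.export_to_markdown())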

View File

@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
def supports_pagination(cls) -> bool:
pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()

View File

@ -24,7 +24,6 @@ _log = logging.getLogger(__name__)
class AsciiDocBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

View File

@ -0,0 +1,125 @@
import csv
import logging
import warnings
from io import BytesIO, StringIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class CsvDocumentBackend(DeclarativeDocumentBackend):
content: StringIO
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
# Load content
try:
if isinstance(self.path_or_stream, BytesIO):
self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
elif isinstance(self.path_or_stream, Path):
self.content = StringIO(self.path_or_stream.read_text("utf-8"))
self.valid = True
except Exception as e:
raise RuntimeError(
f"CsvDocumentBackend could not load document with hash {self.document_hash}"
) from e
return
def is_valid(self) -> bool:
return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.CSV}
def convert(self) -> DoclingDocument:
"""
Parses the CSV data into a structured document model.
"""
# Detect CSV dialect
head = self.content.readline()
dialect = csv.Sniffer().sniff(head, ",;\t|:")
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)
# Parse the CSV
self.content.seek(0)
result = csv.reader(self.content, dialect=dialect, strict=True)
self.csv_data = list(result)
_log.info(f"Detected {len(self.csv_data)} lines")
# Ensure uniform column length
expected_length = len(self.csv_data[0])
is_uniform = all(len(row) == expected_length for row in self.csv_data)
if not is_uniform:
warnings.warn(
f"Inconsistent column lengths detected in CSV data. "
f"Expected {expected_length} columns, but found rows with varying lengths. "
f"Ensure all rows have the same number of columns."
)
# Parse the CSV into a structured document model
origin = DocumentOrigin(
filename=self.file.name or "file.csv",
mimetype="text/csv",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
if self.is_valid():
# Convert CSV data to table
if self.csv_data:
num_rows = len(self.csv_data)
num_cols = max(len(row) for row in self.csv_data)
table_data = TableData(
num_rows=num_rows,
num_cols=num_cols,
table_cells=[],
)
# Convert each cell to TableCell
for row_idx, row in enumerate(self.csv_data):
for col_idx, cell_value in enumerate(row):
cell = TableCell(
text=str(cell_value),
row_span=1, # CSV doesn't support merged cells
col_span=1,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + 1,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + 1,
col_header=row_idx == 0, # First row as header
row_header=False,
)
table_data.table_cells.append(cell)
doc.add_table(data=table_data)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
return doc
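
For context, a minimal usage sketch for the new CSV backend; it assumes the default DocumentConverter routes InputFormat.CSV to this backend, and the file name is illustrative:

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("data.csv")  # hypothetical input file
print(result.document.export_to_markdown())  # the CSV content is exported as a single Markdown table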

View File

@ -1,17 +1,22 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -19,21 +24,38 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
# tags that generate NodeItem elements
TAGS_FOR_NODE_ITEMS: Final = [
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"p",
"pre",
"ul",
"ol",
"li",
"table",
"figure",
"img",
]
class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
self.soup = None
self.soup: Optional[Tag] = None
# HTML file:
self.path_or_stream = path_or_stream
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level = 0
self.parents = {} # type: ignore
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
for i in range(0, self.max_levels):
self.parents[i] = None
self.labels = {} # type: ignore
try:
if isinstance(self.path_or_stream, BytesIO):
@ -45,16 +67,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}."
"Could not initialize HTML backend for file with "
f"hash {self.document_hash}."
) from e
@override
def is_valid(self) -> bool:
return self.soup is not None
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
@ -62,9 +88,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML}
@override
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load the document content
origin = DocumentOrigin(
@ -78,108 +106,118 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if self.is_valid():
assert self.soup is not None
content = self.soup.body or self.soup
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
doc = self.walk(self.soup.body, doc)
# TODO: remove style to avoid losing text from tags like i, b, span, ...
for br in content("br"):
br.replace_with(NavigableString("\n"))
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
f"Cannot convert doc with {self.document_hash} because the backend "
"failed to init."
)
return doc
def walk(self, element, doc):
try:
# Iterate over elements in the body of the document
for idx, element in enumerate(element.children):
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
if isinstance(element, Tag):
try:
self.analyse_element(element, idx, doc)
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(" -> error treating child: ", exc_child)
_log.error(" => element: ", element, "\n")
_log.error(
f"Error processing child from tag{tag.name}: {exc_child}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
element, PreformattedString
):
# Floating text outside paragraphs or analyzed tags
text += element
siblings: list[Tag] = [
item for item in element.next_siblings if isinstance(item, Tag)
]
if element.next_sibling is None or any(
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
):
text = text.strip()
if text and tag.name in ["div"]:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
text = ""
except Exception as exc:
pass
return
return doc
def analyse_element(self, element, idx, doc):
"""
if element.name!=None:
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
"""
if element.name in self.labels:
self.labels[element.name] += 1
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc)
elif tag.name in ["p"]:
self.handle_paragraph(tag, doc)
elif tag.name in ["pre"]:
self.handle_code(tag, doc)
elif tag.name in ["ul", "ol"]:
self.handle_list(tag, doc)
elif tag.name in ["li"]:
self.handle_list_item(tag, doc)
elif tag.name == "table":
self.handle_table(tag, doc)
elif tag.name == "figure":
self.handle_figure(tag, doc)
elif tag.name == "img":
self.handle_image(tag, doc)
else:
self.labels[element.name] = 1
self.walk(tag, doc)
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
elif element.name in ["pre"]:
self.handle_code(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
self.handle_listitem(element, idx, doc)
elif element.name == "table":
self.handle_table(element, idx, doc)
elif element.name == "figure":
self.handle_figure(element, idx, doc)
elif element.name == "img":
self.handle_image(element, idx, doc)
else:
self.walk(element, doc)
def get_text(self, item: PageElement) -> str:
"""Get the text content of a tag."""
parts: list[str] = self.extract_text_recursively(item)
def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False)
if isinstance(text, str):
return text.strip()
return ""
return "".join(parts) + " "
# Function to recursively extract text from all child nodes
def extract_text_recursively(self, item):
result = []
def extract_text_recursively(self, item: PageElement) -> list[str]:
result: list[str] = []
if isinstance(item, str):
if isinstance(item, NavigableString):
return [item]
if item.name not in ["ul", "ol"]:
try:
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
tag = cast(Tag, item)
if tag.name not in ["ul", "ol"]:
for child in tag:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
return "".join(result) + " "
return ["".join(result) + " "]
def handle_header(self, element, idx, doc):
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
slevel = hlevel - 1
label = DocItemLabel.SECTION_HEADER
text = element.text.strip()
if hlevel == 1:
for key, val in self.parents.items():
self.content_layer = ContentLayer.BODY
for key in self.parents.keys():
self.parents[key] = None
self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
)
else:
if hlevel > self.level:
@ -190,13 +228,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
content_layer=self.content_layer,
)
self.level = hlevel
elif hlevel < self.level:
# remove the tail
for key, val in self.parents.items():
for key in self.parents.keys():
if key > hlevel:
self.parents[key] = None
self.level = hlevel
@ -205,42 +244,58 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
content_layer=self.content_layer,
)
def handle_code(self, element, idx, doc):
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles monospace code snippets (pre)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.CODE
if len(text) == 0:
return
doc.add_text(parent=self.parents[self.level], label=label, text=text)
if text:
doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)
def handle_paragraph(self, element, idx, doc):
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
if len(text) == 0:
return
doc.add_text(parent=self.parents[self.level], label=label, text=text)
if text:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
def handle_list(self, element, idx, doc):
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
)
elif element.name == "ol":
start_attr = element.get("start")
start: int = (
int(start_attr)
if isinstance(start_attr, str) and start_attr.isnumeric()
else 1
)
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer,
)
self.level += 1
@ -249,25 +304,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
def handle_listitem(self, element, idx, doc):
"""Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"])
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list item tags (li)."""
nested_list = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
parent = self.parents[self.level]
if parent is None:
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
parent_label: str = parent.label
index_in_list = len(parent.children) + 1
if (
parent_label == GroupLabel.ORDERED_LIST
and isinstance(parent, GroupItem)
and parent.name
):
start_in_list: str = parent.name.split(" ")[-1]
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
index_in_list += start - 1
if nested_lists:
name = element.name
if nested_list:
# Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text = self.extract_text_recursively(element)
text: str = self.get_text(element)
# Flatten text, remove line breaks:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
if parent_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
@ -277,7 +343,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
parent=parent,
content_layer=self.content_layer,
)
self.level += 1
@ -286,74 +353,95 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
elif isinstance(element.text, str):
elif element.text.strip():
text = element.text.strip()
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
parent=parent,
content_layer=self.content_layer,
)
else:
_log.warn("list-item has no text: ", element)
def handle_table(self, element, idx, doc):
"""Handles table tags."""
_log.debug(f"list-item has no text: {element}")
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table")
if nested_tables is not None:
_log.warn("detected nested tables: skipping for now")
return
_log.debug("Skipping nested table.")
return None
# Count the number of rows (number of <tr> elements)
num_rows = len(element.find_all("tr"))
num_rows = len(element("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
for row in element.find_all("tr"):
for row in element("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
if not isinstance(row, Tag):
continue
for cell in row(["td", "th"]):
if not isinstance(row, Tag):
continue
val = cast(Tag, cell).get("colspan", "1")
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
col_count += colspan
num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(element.find_all("tr")):
for row_idx, row in enumerate(element("tr")):
if not isinstance(row, Tag):
continue
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])
cells = row(["td", "th"])
# If every cell in the row is a header cell (<th>), treat the row as a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
for html_cell in cells:
if isinstance(html_cell, Tag) and html_cell.name == "td":
col_header = False
# Extract the text content of each cell
col_idx = 0
# Extract and print the text content of each cell
for _, html_cell in enumerate(cells):
for html_cell in cells:
if not isinstance(html_cell, Tag):
continue
# extract inline formulas
for formula in html_cell("inline-formula"):
math_parts = formula.text.split("$$")
if len(math_parts) == 3:
math_formula = f"$${math_parts[1]}$$"
formula.replace_with(NavigableString(math_formula))
# TODO: extract content correctly from table-cells with lists
text = html_cell.text
try:
text = self.extract_table_cell_text(html_cell)
except Exception as exc:
_log.warn("exception: ", exc)
exit(-1)
# label = html_cell.name
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
col_val = html_cell.get("colspan", "1")
col_span = (
int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
row_val = html_cell.get("rowspan", "1")
row_span = (
int(row_val)
if isinstance(row_val, str) and row_val.isnumeric()
else 1
)
while grid[row_idx][col_idx] is not None:
col_idx += 1
@ -361,7 +449,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
cell = TableCell(
table_cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
@ -372,70 +460,87 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
data.table_cells.append(table_cell)
doc.add_table(data=data, parent=self.parents[self.level])
return data
def get_list_text(self, list_element, level=0):
def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles table tags."""
table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None:
doc.add_table(
data=table_data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
if list_element.name == "ol": # For ordered lists, use numbers
for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
for i, li in enumerate(list_element("li", recursive=False), 1):
if not isinstance(li, Tag):
continue
# Add numbering for ordered lists
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False):
for li in list_element("li", recursive=False):
if not isinstance(li, Tag):
continue
# Add bullet points for unordered lists
result.append(
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
)
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
if isinstance(nested_list, Tag):
result.extend(self.get_list_text(nested_list, level + 1))
return result
def extract_table_cell_text(self, cell):
"""Extract text from a table cell, including lists with indents."""
contains_lists = cell.find(["ul", "ol"])
if contains_lists is None:
return cell.text
else:
_log.debug(
"should extract the content correctly for table-cells with lists ..."
)
return cell.text
def handle_figure(self, element, idx, doc):
def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
# Extract the image URI from the <img> tag
# image_uri = root.xpath('//figure//img/@src')[0]
contains_captions = element.find(["figcaption"])
if contains_captions is None:
doc.add_picture(parent=self.parents[self.level], caption=None)
if not isinstance(contains_captions, Tag):
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
else:
texts = []
for item in contains_captions:
texts.append(item.text)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
label=DocItemLabel.CAPTION,
text=("".join(texts)).strip(),
content_layer=self.content_layer,
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
content_layer=self.content_layer,
)
def handle_image(self, element, idx, doc):
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)
_log.debug(f"ignoring <img> tags at the moment: {element}")
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
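
Because parse_table_data is now a staticmethod returning Optional[TableData], it can be exercised on a standalone table without running a full conversion; a small sketch with illustrative input HTML:

from bs4 import BeautifulSoup
from docling.backend.html_backend import HTMLDocumentBackend

html = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
table_tag = BeautifulSoup(html, "html.parser").find("table")
table_data = HTMLDocumentBackend.parse_table_data(table_tag)
if table_data is not None:  # None is returned for nested tables
    print(table_data.num_rows, table_data.num_cols)  # 2 2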

View File

@ -0,0 +1,58 @@
from io import BytesIO
from pathlib import Path
from typing import Union
from docling_core.types.doc import DoclingDocument
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
class DoclingJSONBackend(DeclarativeDocumentBackend):
@override
def __init__(
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream)
# given we need to store any actual conversion exception for raising it from
# convert(), this captures the successful result or the actual error in a
# mutually exclusive way:
self._doc_or_err = self._get_doc_or_err()
@override
def is_valid(self) -> bool:
return isinstance(self._doc_or_err, DoclingDocument)
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.JSON_DOCLING}
def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
try:
json_data: Union[str, bytes]
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, encoding="utf-8") as f:
json_data = f.read()
elif isinstance(self.path_or_stream, BytesIO):
json_data = self.path_or_stream.getvalue()
else:
raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
return DoclingDocument.model_validate_json(json_data=json_data)
except Exception as e:
return e
@override
def convert(self) -> DoclingDocument:
if isinstance(self._doc_or_err, DoclingDocument):
return self._doc_or_err
else:
raise self._doc_or_err
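
A sketch of the intended round trip, assuming the converter's format detection maps the file to InputFormat.JSON_DOCLING and that the file was previously written with DoclingDocument.save_as_json (file name is illustrative):

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("report.json")  # hypothetical file produced by DoclingDocument.save_as_json()
doc = result.document  # reconstructed DoclingDocument, no re-parsing of the original source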

View File

@ -3,32 +3,40 @@ import re
import warnings
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import List, Optional, Set, Union
import marko
import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from marko import Markdown
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text, max_length=10):
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@ -63,7 +71,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_text_buffer = ""
self.inline_texts: list[str] = []
self._html_blocks: int = 0
try:
if isinstance(self.path_or_stream, BytesIO):
@ -72,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessarily long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(text_stream)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
@ -80,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessarily long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(md_content)
self.markdown = self._shorten_underscore_sequences(md_content)
self.valid = True
_log.debug(self.markdown)
@ -90,13 +99,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
def close_table(self, doc=None):
def _close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
tcells = []
tcells: List[TableCell] = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
@ -137,33 +146,47 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
table_data = TableData(
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
)
# Populate
for tcell in tcells:
data.table_cells.append(tcell)
table_data.table_cells.append(tcell)
if len(tcells) > 0:
doc.add_table(data=data)
doc.add_table(data=table_data)
return
def process_inline_text(self, parent_element, doc=None):
# self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip()
def _process_inline_text(
self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_element,
parent=parent_item,
text=txt,
)
self.inline_text_buffer = ""
self.inline_texts = []
def _iterate_elements(
self,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
):
if element in visited:
return
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}"
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
@ -172,10 +195,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings = []
strings: List[str] = []
# Define a recursive function to traverse the tree
def traverse(node):
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
@ -189,121 +212,137 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
)
elif isinstance(element, marko.block.List):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
list_label = GroupLabel.LIST
if element.ordered:
list_label = GroupLabel.ORDERED_LIST
parent_element = doc.add_group(
label=list_label, name=f"list", parent=parent_element
)
has_non_empty_list_items = False
for child in element.children:
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
has_non_empty_list_items = True
break
elif isinstance(element, marko.block.ListItem):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group(
label=label, name=f"list", parent=parent_item
)
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
snippet_text = str(element.children[0].children[0].children)
first_child = element.children[0]
snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST:
if (
parent_item is not None
and isinstance(parent_item, DocItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
enumerated=is_numbered, parent=parent_item, text=snippet_text
)
visited.add(first_child)
elif isinstance(element, marko.inline.Image):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)
elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc)
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
)
doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children).strip()
snippet_text = element.children.strip()
# Detect start of the table:
if "|" in snippet_text:
# most likely part of the markdown table
self.in_table = True
if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
snippet_text
)
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else:
self.md_table_buffer.append(snippet_text)
else:
self.close_table(doc)
self.in_table = False
self._close_table(doc)
# most likely just inline text
self.inline_text_buffer += str(
element.children
) # do not strip an inline text, as it may contain important spaces
self.inline_texts.append(str(element.children))
elif isinstance(element, marko.inline.CodeSpan):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
doc.add_code(parent=parent_item, text=snippet_text)
elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
doc.add_code(parent=parent_item, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc)
if self.in_table:
_log.debug("Line break in a table")
self.md_table_buffer.append("")
elif isinstance(element, marko.block.HTMLBlock):
self.process_inline_text(parent_element, doc)
self.close_table(doc)
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.children) > 0
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
html_block = element.body.strip()
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_item, text=text_to_add)
else:
if not isinstance(element, str):
self.close_table(doc)
self._close_table(doc)
_log.debug("Some other element: {}".format(element))
processed_block_types = (
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
marko.inline.RawText,
)
# Iterate through the element's children (if any)
if not isinstance(element, marko.block.ListItem):
if not isinstance(element, marko.block.Heading):
if not isinstance(element, marko.block.FencedCode):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
if hasattr(element, "children") and not isinstance(
element, processed_block_types
):
for child in element.children:
self._iterate_elements(
element=child,
depth=depth + 1,
doc=doc,
visited=visited,
parent_item=parent_item,
)
def is_valid(self) -> bool:
return self.valid
@ -337,8 +376,51 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
self._iterate_elements(
element=parsed_ast,
depth=0,
doc=doc,
parent_item=None,
visited=set(),
)
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
# export to HTML
html_backend_cls = HTMLDocumentBackend
html_str = doc.export_to_html()
def _restore_original_html(txt, regex):
_txt, count = re.subn(regex, "", txt)
if count != self._html_blocks:
raise RuntimeError(
"An internal error has occurred during Markdown conversion."
)
return _txt
# restore original HTML by removing previously added markers
for regex in [
rf"<pre>\s*<code>\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
]:
html_str = _restore_original_html(txt=html_str, regex=regex)
self._html_blocks = 0
# delegate to HTML backend
stream = BytesIO(bytes(html_str, encoding="utf-8"))
in_doc = InputDocument(
path_or_stream=stream,
format=InputFormat.HTML,
backend=html_backend_cls,
filename=self.file.name,
)
html_backend_obj = html_backend_cls(
in_doc=in_doc, path_or_stream=stream
)
doc = html_backend_obj.convert()
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."

View File

@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
from typing import Any, List
from PIL import Image as PILImage
from pydantic import BaseModel
@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:
# FIXME: mypy does not agree with _images ...
# Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore
try:
pil_image = PILImage.open(image.ref)
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except:
_log.error("could not extract the image from excel sheets")
"""
# Iterate over images in the sheet
for idx, image in enumerate(sheet._images): # Access embedded images
for idx, chart in enumerate(sheet._charts): # type: ignore
try:
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
image_bytes = BytesIO(image.ref.blob)
pil_image = Image.open(image_bytes)
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
"""
# FIXME: mypy does not agree with _charts ...
"""
for idx, chart in enumerate(sheet._charts): # Access embedded charts
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
_log.info(f"Chart {idx + 1}:")
# Chart type
_log.info(f"Type: {type(chart).__name__}")
# Title
if chart.title:
_log.info(f"Title: {chart.title}")
else:
_log.info("No title")
# Data series
for series in chart.series:
_log.info(" => series ...")
_log.info(f"Data Series: {series.title}")
_log.info(f"Values: {series.values}")
_log.info(f"Categories: {series.categories}")
_log.info(f"Chart {idx + 1}:")
# Position
# _log.info(f"Anchor Cell: {chart.anchor}")
# Chart type
# _log.info(f"Type: {type(chart).__name__}")
print(f"Type: {type(chart).__name__}")
# Extract series data
for series_idx, series in enumerate(chart.series):
#_log.info(f"Series {series_idx + 1}:")
print(f"Series {series_idx + 1} type: {type(series).__name__}")
#print(f"x-values: {series.xVal}")
#print(f"y-values: {series.yVal}")
print(f"xval type: {type(series.xVal).__name__}")
xvals = []
for _ in series.xVal.numLit.pt:
print(f"xval type: {type(_).__name__}")
if hasattr(_, 'v'):
xvals.append(_.v)
print(f"x-values: {xvals}")
yvals = []
for _ in series.yVal:
if hasattr(_, 'v'):
yvals.append(_.v)
print(f"y-values: {yvals}")
except Exception as exc:
print(exc)
continue
"""
return doc
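
A condensed, standalone sketch of the embedded-image extraction performed above; it assumes openpyxl keeps exposing the private Worksheet._images attribute, and the workbook name is illustrative:

from openpyxl import load_workbook
from PIL import Image as PILImage

wb = load_workbook("book.xlsx")  # hypothetical workbook with embedded pictures
for sheet in wb.worksheets:
    for image in sheet._images:  # private openpyxl attribute, as relied on by the backend
        try:
            pil_image = PILImage.open(image.ref)  # image.ref: path or file-like object
            print(sheet.title, pil_image.size)
        except Exception:
            print(f"could not extract an image from sheet {sheet.title}")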

View File

@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return doc
def generate_prov(self, shape, slide_ind, text=""):
left = shape.left
top = shape.top
width = shape.width
height = shape.height
def generate_prov(
self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
):
if shape.left:
left = shape.left
top = shape.top
width = shape.width
height = shape.height
else:
left = 0
top = 0
width = slide_size.width
height = slide_size.height
shape_bbox = [left, top, left + width, top + height]
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
prov = ProvenanceItem(
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
)
return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0
@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
list_text = ""
list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
# Identify if shape contains lists
for paragraph in shape.text_frame.paragraphs:
@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
)
return
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
# Get the image bytes
image = shape.image
image_bytes = image.blob
im_dpi, _ = image.dpi
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
# Open it with PIL
try:
# Get the image bytes
image = shape.image
image_bytes = image.blob
im_dpi, _ = image.dpi
pil_image = Image.open(BytesIO(image_bytes))
# shape has picture
prov = self.generate_prov(shape, slide_ind, "")
prov = self.generate_prov(shape, slide_ind, "", slide_size)
doc.add_picture(
parent=parent_slide,
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
return
def handle_tables(self, shape, parent_slide, slide_ind, doc):
def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
# Handling tables, images, charts
if shape.has_table:
table = shape.table
table_xml = shape._element
prov = self.generate_prov(shape, slide_ind, "")
prov = self.generate_prov(shape, slide_ind, "", slide_size)
num_cols = 0
num_rows = len(table.rows)
@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
)
size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
slide_size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
def handle_shapes(shape, parent_slide, slide_ind, doc):
handle_groups(shape, parent_slide, slide_ind, doc)
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
if shape.has_table:
# Handle Tables
self.handle_tables(shape, parent_slide, slide_ind, doc)
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Pictures
self.handle_pictures(shape, parent_slide, slide_ind, doc)
self.handle_pictures(
shape, parent_slide, slide_ind, doc, slide_size
)
# If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"):
return
@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
_log.warning("Warning: shape has text but not text_frame")
return
# Handle other text elements, including lists (bullet lists, numbered lists)
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
self.handle_text_elements(
shape, parent_slide, slide_ind, doc, slide_size
)
return
def handle_groups(shape, parent_slide, slide_ind, doc):
def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
for groupedshape in shape.shapes:
handle_shapes(groupedshape, parent_slide, slide_ind, doc)
handle_shapes(
groupedshape, parent_slide, slide_ind, doc, slide_size
)
# Loop through each shape in the slide
for shape in slide.shapes:
handle_shapes(shape, parent_slide, slide_ind, doc)
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
return doc
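
The provenance fallback can be sketched in isolation: when a shape reports no position, the whole slide becomes its bounding box. A minimal sketch using the docling-core types this backend already imports (slide dimensions are illustrative):

from docling_core.types.doc import BoundingBox, CoordOrigin, ProvenanceItem, Size

slide_size = Size(width=9144000, height=6858000)  # EMU size of a 16:9 slide
# shape.left is None, so fall back to the full slide:
left, top, width, height = 0, 0, slide_size.width, slide_size.height
shape_bbox = BoundingBox.from_tuple(
    (left, top, left + width, top + height), origin=CoordOrigin.BOTTOMLEFT
)
prov = ProvenanceItem(page_no=1, charspan=[0, 0], bbox=shape_bbox)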

View File

@ -2,21 +2,28 @@ import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import Any, Optional, Union
import docx
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ImageRef,
NodeItem,
TableCell,
TableData,
)
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -26,8 +33,10 @@ _log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
@override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream)
self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
@ -37,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
}
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream = path_or_stream
self.valid = False
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
self.valid: bool = False
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level_at_new_list = None
self.parents = {} # type: ignore
self.max_levels: int = 10
self.level_at_new_list: Optional[int] = None
self.parents: dict[int, Optional[NodeItem]] = {}
for i in range(-1, self.max_levels):
self.parents[i] = None
self.level = 0
self.listIter = 0
self.history = {
self.history: dict[str, Any] = {
"names": [None],
"levels": [None],
"numids": [None],
@ -59,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.docx_obj = None
try:
if isinstance(self.path_or_stream, BytesIO):
self.docx_obj = docx.Document(self.path_or_stream)
self.docx_obj = Document(self.path_or_stream)
elif isinstance(self.path_or_stream, Path):
self.docx_obj = docx.Document(str(self.path_or_stream))
self.docx_obj = Document(str(self.path_or_stream))
self.valid = True
except Exception as e:
@ -69,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
def is_valid(self) -> bool:
return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
@ -83,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.DOCX}
@override
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
"""Parses the DOCX into a structured document model.
Returns:
The parsed document.
"""
origin = DocumentOrigin(
filename=self.file.name or "file",
@ -105,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
def update_history(self, name, level, numid, ilevel):
def update_history(
self,
name: str,
level: Optional[int],
numid: Optional[int],
ilevel: Optional[int],
):
self.history["names"].append(name)
self.history["levels"].append(level)
self.history["numids"].append(numid)
self.history["indents"].append(ilevel)
def prev_name(self):
def prev_name(self) -> Optional[str]:
return self.history["names"][-1]
def prev_level(self):
def prev_level(self) -> Optional[int]:
return self.history["levels"][-1]
def prev_numid(self):
def prev_numid(self) -> Optional[int]:
return self.history["numids"][-1]
def prev_indent(self):
def prev_indent(self) -> Optional[int]:
return self.history["indents"][-1]
def get_level(self) -> int:
@ -131,13 +155,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return k
return 0
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
def walk_linear(
self,
body: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements)
namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
}
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element)
@ -150,7 +180,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug("could not parse a table, broken docx table")
elif drawing_blip:
self.handle_pictures(element, docx_obj, drawing_blip, doc)
self.handle_pictures(docx_obj, drawing_blip, doc)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
if sdt_content is not None:
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
for p in paragraphs:
self.handle_text_elements(p, docx_obj, doc)
# Check for Text
elif tag_name in ["p"]:
# "tcPr", "sectPr"
@ -159,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def str_to_int(self, s, default=0):
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
if s is None:
return None
try:
@ -167,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
except ValueError:
return default
def split_text_and_number(self, input_string):
def split_text_and_number(self, input_string: str) -> list[str]:
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
if match:
parts = list(filter(None, match.groups()))
@ -175,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
return [input_string]
def get_numId_and_ilvl(self, paragraph):
def get_numId_and_ilvl(
self, paragraph: Paragraph
) -> tuple[Optional[int], Optional[int]]:
# Access the XML element of the paragraph
numPr = paragraph._element.find(
".//w:numPr", namespaces=paragraph._element.nsmap
@ -188,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
return self.str_to_int(numId, default=None), self.str_to_int(
ilvl, default=None
)
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
return None, None # If the paragraph is not part of a list
def get_label_and_level(self, paragraph):
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
if paragraph.style is None:
return "Normal", None
label = paragraph.style.style_id
@ -204,26 +242,31 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parts = label.split(":")
if len(parts) == 2:
return parts[0], int(parts[1])
return parts[0], self.str_to_int(parts[1], None)
parts = self.split_text_and_number(label)
if "Heading" in label and len(parts) == 2:
parts.sort()
label_str = ""
label_level = 0
label_str: str = ""
label_level: Optional[int] = 0
if parts[0] == "Heading":
label_str = parts[0]
label_level = self.str_to_int(parts[1], default=None)
label_level = self.str_to_int(parts[1], None)
if parts[1] == "Heading":
label_str = parts[1]
label_level = self.str_to_int(parts[0], default=None)
label_level = self.str_to_int(parts[0], None)
return label_str, label_level
else:
return label, None
def handle_text_elements(self, element, docx_obj, doc):
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
def handle_text_elements(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
paragraph = Paragraph(element, docx_obj)
if paragraph.text is None:
return
@ -241,13 +284,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
numid = None
# Handle lists
if numid is not None and ilevel is not None:
if (
numid is not None
and ilevel is not None
and p_style_id not in ["Title", "Heading"]
):
self.add_listitem(
element,
docx_obj,
doc,
p_style_id,
p_level,
numid,
ilevel,
text,
@ -255,20 +298,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
self.update_history(p_style_id, p_level, numid, ilevel)
return
elif numid is None and self.prev_numid() is not None: # Close list
for key, val in self.parents.items():
if key >= self.level_at_new_list:
elif (
numid is None
and self.prev_numid() is not None
and p_style_id not in ["Title", "Heading"]
): # Close list
if self.level_at_new_list:
for key in range(len(self.parents)):
if key >= self.level_at_new_list:
self.parents[key] = None
self.level = self.level_at_new_list - 1
self.level_at_new_list = None
else:
for key in range(len(self.parents)):
self.parents[key] = None
self.level = self.level_at_new_list - 1
self.level_at_new_list = None
self.level = 0
if p_style_id in ["Title"]:
for key, val in self.parents.items():
for key in range(len(self.parents)):
self.parents[key] = None
self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text
)
elif "Heading" in p_style_id:
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
self.add_header(doc, p_level, text)
elif p_style_id in [
"Paragraph",
@ -296,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.update_history(p_style_id, p_level, numid, ilevel)
return
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
def add_header(
self, doc: DoclingDocument, curr_level: Optional[int], text: str
) -> None:
level = self.get_level()
if isinstance(curr_level, int):
if curr_level > level:
@ -309,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
elif curr_level < level:
# remove the tail
for key, val in self.parents.items():
for key in range(len(self.parents)):
if key >= curr_level:
self.parents[key] = None
@ -328,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def add_listitem(
self,
element,
docx_obj,
doc,
p_style_id,
p_level,
numid,
ilevel,
doc: DoclingDocument,
numid: int,
ilevel: int,
text: str,
is_numbered=False,
):
# is_numbered = is_numbered
is_numbered: bool = False,
) -> None:
enum_marker = ""
level = self.get_level()
prev_indent = self.prev_indent()
if self.prev_numid() is None: # Open new list
self.level_at_new_list = level # type: ignore
self.level_at_new_list = level
self.parents[level] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
@ -362,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
elif (
self.prev_numid() == numid and self.prev_indent() < ilevel
self.prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and prev_indent < ilevel
): # Open indented list
for i in range(
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1,
):
# Determine if this is an unordered list or an ordered list.
@ -394,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text=text,
)
elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list
elif (
self.prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
): # Close list
for k, v in self.parents.items():
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
@ -412,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
self.listIter = 0
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
elif self.prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
@ -426,31 +485,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def handle_tables(self, element, docx_obj, doc):
# Function to check if a cell has a colspan (gridSpan)
def get_colspan(cell):
grid_span = cell._element.xpath("@w:gridSpan")
if grid_span:
return int(grid_span[0]) # Return the number of columns spanned
return 1 # Default is 1 (no colspan)
# Function to check if a cell has a rowspan (vMerge)
def get_rowspan(cell):
v_merge = cell._element.xpath("@w:vMerge")
if v_merge:
return v_merge[
0
] # 'restart' indicates the beginning of a rowspan, others are continuation
return 1
table = docx.table.Table(element, docx_obj)
def handle_tables(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
table: Table = Table(element, docx_obj)
num_rows = len(table.rows)
num_cols = 0
for row in table.rows:
# Calculate the max number of columns
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
num_cols = len(table.columns)
_log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
if num_rows == 1 and num_cols == 1:
cell_element = table.rows[0].cells[0]
@ -459,59 +503,56 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.walk_linear(cell_element._element, docx_obj, doc)
return
# Initialize the table grid
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
data = TableData(num_rows=num_rows, num_cols=num_cols)
cell_set: set[CT_Tc] = set()
for row_idx, row in enumerate(table.rows):
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
col_idx = 0
for c, cell in enumerate(row.cells):
row_span = get_rowspan(cell)
col_span = get_colspan(cell)
while col_idx < num_cols:
cell: _Cell = row.cells[col_idx]
_log.debug(
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
)
if cell is None or cell._tc in cell_set:
_log.debug(" skipped since repeated content")
col_idx += cell.grid_span
continue
else:
cell_set.add(cell._tc)
cell_text = cell.text
# In case cell doesn't return text via docx library:
if len(cell_text) == 0:
cell_xml = cell._element
spanned_idx = row_idx
spanned_tc: Optional[CT_Tc] = cell._tc
while spanned_tc == cell._tc:
spanned_idx += 1
spanned_tc = (
table.rows[spanned_idx].cells[col_idx]._tc
if spanned_idx < num_rows
else None
)
_log.debug(f" spanned before row {spanned_idx}")
texts = [""]
for elem in cell_xml.iter():
if elem.tag.endswith("t"): # <w:t> tags that contain text
if elem.text:
texts.append(elem.text)
# Join the collected text
cell_text = " ".join(texts).strip()
# Find the next available column in the grid
while table_grid[row_idx][col_idx] is not None:
col_idx += 1
# Fill the grid with the cell value, considering rowspan and colspan
for i in range(row_span if row_span == "restart" else 1):
for j in range(col_span):
table_grid[row_idx + i][col_idx + j] = ""
cell = TableCell(
text=cell_text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
table_cell = TableCell(
text=cell.text,
row_span=spanned_idx - row_idx,
col_span=cell.grid_span,
start_row_offset_idx=row.grid_cols_before + row_idx,
end_row_offset_idx=row.grid_cols_before + spanned_idx,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
end_col_offset_idx=col_idx + cell.grid_span,
col_header=False,
row_header=False,
)
data.table_cells.append(cell)
data.table_cells.append(table_cell)
col_idx += cell.grid_span
level = self.get_level()
doc.add_table(data=data, parent=self.parents[level - 1])
return
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
def get_docx_image(element, drawing_blip):
def handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None:
def get_docx_image(drawing_blip):
rId = drawing_blip[0].get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
@ -521,11 +562,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data
return image_data
image_data = get_docx_image(element, drawing_blip)
image_bytes = BytesIO(image_data)
level = self.get_level()
# Open the BytesIO object with PIL to create an Image
try:
image_data = get_docx_image(drawing_blip)
image_bytes = BytesIO(image_data)
pil_image = Image.open(image_bytes)
doc.add_picture(
parent=self.parents[level - 1],
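The reworked DOCX table handling above de-duplicates merged cells by tracking the underlying <w:tc> elements and advancing by `grid_span`, instead of rebuilding a grid by hand. A minimal sketch of the same idea against a plain `python-docx` table; the helper name and file name are illustrative and not part of this change:

from docx import Document  # python-docx

def iter_unique_cells(table):
    """Yield (row_idx, col_idx, cell) once per merged region.

    Illustrative sketch of the cell_set/grid_span approach used in the diff
    above; rows starting late (grid_cols_before) are ignored for brevity.
    """
    seen = set()
    for row_idx, row in enumerate(table.rows):
        col_idx = 0
        while col_idx < len(row.cells):
            cell = row.cells[col_idx]
            if cell._tc in seen:  # same underlying <w:tc> => merged continuation
                col_idx += cell.grid_span
                continue
            seen.add(cell._tc)
            yield row_idx, col_idx, cell
            col_idx += cell.grid_span

# Usage sketch (file name is illustrative):
# for r, c, cell in iter_unique_cells(Document("report.docx").tables[0]):
#     print(r, c, repr(cell.text))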

View File

@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: BoundingBox) -> str:
pass
@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(PaginatedDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

View File

@ -0,0 +1,710 @@
import logging
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
NodeItem,
TextItem,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
DEFAULT_HEADER_REFERENCES: Final = "References"
DEFAULT_TEXT_ETAL: Final = "et al."
class Abstract(TypedDict):
label: str
content: str
class Author(TypedDict):
name: str
affiliation_names: list[str]
class Citation(TypedDict):
author_names: str
title: str
source: str
year: str
volume: str
page: str
pub_id: str
publisher_name: str
publisher_loc: str
class Table(TypedDict):
label: str
caption: str
content: str
class XMLComponents(TypedDict):
title: str
authors: list[Author]
abstract: list[Abstract]
class JatsDocumentBackend(DeclarativeDocumentBackend):
"""Backend to parse articles in XML format tagged according to JATS definition.
The Journal Article Tag Suite (JATS) is a definition standard for the
representation of journal articles in XML format. Several publishers and journal
archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
medRxiv, and Springer Nature.
Refer to https://jats.nlm.nih.gov for more details on JATS.
The code from this document backend has been developed by modifying parts of the
PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
# Initialize the root of the document hierarchy
self.root: Optional[NodeItem] = None
self.valid = False
try:
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.seek(0)
self.tree: etree._ElementTree = etree.parse(self.path_or_stream)
doc_info: etree.DocInfo = self.tree.docinfo
if doc_info.system_url and any(
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
):
self.valid = True
return
for ent in doc_info.internalDTD.iterentities():
if ent.system_url and any(
[kwd in ent.system_url for kwd in JATS_DTD_URL]
):
self.valid = True
return
except Exception as exc:
raise RuntimeError(
f"Could not initialize JATS backend for file with hash {self.document_hash}."
) from exc
@override
def is_valid(self) -> bool:
return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.XML_JATS}
@override
def convert(self) -> DoclingDocument:
try:
# Create empty document
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="application/xml",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
# Get metadata XML components
xml_components: XMLComponents = self._parse_metadata()
# Add metadata to the document
self._add_metadata(doc, xml_components)
# walk over the XML body
body = self.tree.xpath("//body")
if self.root and len(body) > 0:
self._walk_linear(doc, self.root, body[0])
# walk over the XML back matter
back = self.tree.xpath("//back")
if self.root and len(back) > 0:
self._walk_linear(doc, self.root, back[0])
except Exception:
_log.error(traceback.format_exc())
return doc
@staticmethod
def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
skip_tags = ["term", "disp-formula", "inline-formula"]
text: str = (
node.text.replace("\n", " ")
if (node.tag not in skip_tags and node.text)
else ""
)
for child in list(node):
if child.tag not in skip_tags:
# TODO: apply styling according to child.tag when supported by docling-core
text += JatsDocumentBackend._get_text(child, sep)
if sep:
text = text.rstrip(sep) + sep
text += child.tail.replace("\n", " ") if child.tail else ""
return text
def _find_metadata(self) -> Optional[etree._Element]:
meta_names: list[str] = ["article-meta", "book-part-meta"]
meta: Optional[etree._Element] = None
for name in meta_names:
node = self.tree.xpath(f".//{name}")
if len(node) > 0:
meta = node[0]
break
return meta
def _parse_abstract(self) -> list[Abstract]:
# TODO: address cases with multiple sections
abs_list: list[Abstract] = []
for abs_node in self.tree.xpath(".//abstract"):
abstract: Abstract = dict(label="", content="")
texts = []
for abs_par in abs_node.xpath("p"):
texts.append(JatsDocumentBackend._get_text(abs_par).strip())
abstract["content"] = " ".join(texts)
label_node = abs_node.xpath("title|label")
if len(label_node) > 0:
abstract["label"] = label_node[0].text.strip()
abs_list.append(abstract)
return abs_list
def _parse_authors(self) -> list[Author]:
# Get mapping between affiliation ids and names
authors: list[Author] = []
meta: Optional[etree._Element] = self._find_metadata()
if meta is None:
return authors
affiliation_names = []
for affiliation_node in meta.xpath(".//aff[@id]"):
aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
aff = aff.replace("\n", " ")
label = affiliation_node.xpath("label")
if label:
# TODO: once superscript is supported, add label with formatting
aff = aff.removeprefix(f"{label[0].text}, ")
affiliation_names.append(aff)
affiliation_ids_names = {
id: name
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
}
# Get author names and affiliation names
for author_node in meta.xpath(
'.//contrib-group/contrib[@contrib-type="author"]'
):
author: Author = {
"name": "",
"affiliation_names": [],
}
# Affiliation names
affiliation_ids = [
a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
]
for id in affiliation_ids:
if id in affiliation_ids_names:
author["affiliation_names"].append(affiliation_ids_names[id])
# Name
author["name"] = (
author_node.xpath("name/given-names")[0].text
+ " "
+ author_node.xpath("name/surname")[0].text
)
authors.append(author)
return authors
def _parse_title(self) -> str:
meta_names: list[str] = [
"article-meta",
"collection-meta",
"book-meta",
"book-part-meta",
]
title_names: list[str] = ["article-title", "subtitle", "title", "label"]
titles: list[str] = [
" ".join(
elem.text.replace("\n", " ").strip()
for elem in list(title_node)
if elem.tag in title_names
).strip()
for title_node in self.tree.xpath(
"|".join([f".//{item}/title-group" for item in meta_names])
)
]
text = " - ".join(titles)
return text
def _parse_metadata(self) -> XMLComponents:
"""Parsing JATS document metadata."""
xml_components: XMLComponents = {
"title": self._parse_title(),
"authors": self._parse_authors(),
"abstract": self._parse_abstract(),
}
return xml_components
def _add_abstract(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
for abstract in xml_components["abstract"]:
text: str = abstract["content"]
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
if not text:
continue
parent = doc.add_heading(parent=self.root, text=title)
doc.add_text(
parent=parent,
text=text,
label=DocItemLabel.TEXT,
)
return
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
# TODO: once docling supports text formatting, add affiliation reference to
# author names through superscripts
authors: list = [item["name"] for item in xml_components["authors"]]
authors_str = ", ".join(authors)
affiliations: list = [
item
for author in xml_components["authors"]
for item in author["affiliation_names"]
]
affiliations_str = "; ".join(list(dict.fromkeys(affiliations)))
if authors_str:
doc.add_text(
parent=self.root,
text=authors_str,
label=DocItemLabel.PARAGRAPH,
)
if affiliations_str:
doc.add_text(
parent=self.root,
text=affiliations_str,
label=DocItemLabel.PARAGRAPH,
)
return
def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
if isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST:
doc.add_list_item(text=text, enumerated=False, parent=parent)
else:
doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)
return
def _parse_element_citation(self, node: etree._Element) -> str:
citation: Citation = {
"author_names": "",
"title": "",
"source": "",
"year": "",
"volume": "",
"page": "",
"pub_id": "",
"publisher_name": "",
"publisher_loc": "",
}
_log.debug("Citation parsing started")
# Author names
names = []
for name_node in node.xpath(".//name"):
name_str = (
name_node.xpath("surname")[0].text.replace("\n", " ").strip()
+ " "
+ name_node.xpath("given-names")[0].text.replace("\n", " ").strip()
)
names.append(name_str)
etal_node = node.xpath(".//etal")
if len(etal_node) > 0:
etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
names.append(etal_text)
citation["author_names"] = ", ".join(names)
titles: list[str] = [
"article-title",
"chapter-title",
"data-title",
"issue-title",
"part-title",
"trans-title",
]
title_node: Optional[etree._Element] = None
for name in titles:
name_node = node.xpath(name)
if len(name_node) > 0:
title_node = name_node[0]
break
citation["title"] = (
JatsDocumentBackend._get_text(title_node)
if title_node is not None
else node.text.replace("\n", " ").strip()
)
# Journal, year, publisher name, publisher location, volume, elocation
fields: list[str] = [
"source",
"year",
"publisher-name",
"publisher-loc",
"volume",
]
for item in fields:
item_node = node.xpath(item)
if len(item_node) > 0:
citation[item.replace("-", "_")] = ( # type: ignore[literal-required]
item_node[0].text.replace("\n", " ").strip()
)
# Publication identifier
if len(node.xpath("pub-id")) > 0:
pub_id: list[str] = []
for id_node in node.xpath("pub-id"):
id_type = id_node.get("assigning-authority") or id_node.get(
"pub-id-type"
)
id_text = id_node.text
if id_type and id_text:
pub_id.append(
id_type.replace("\n", " ").strip().upper()
+ ": "
+ id_text.replace("\n", " ").strip()
)
if pub_id:
citation["pub_id"] = ", ".join(pub_id)
# Pages
if len(node.xpath("elocation-id")) > 0:
citation["page"] = (
node.xpath("elocation-id")[0].text.replace("\n", " ").strip()
)
elif len(node.xpath("fpage")) > 0:
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation["page"] += (
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
)
# Flatten the citation to string
text = ""
if citation["author_names"]:
text += citation["author_names"].rstrip(".") + ". "
if citation["title"]:
text += citation["title"] + ". "
if citation["source"]:
text += citation["source"] + ". "
if citation["publisher_name"]:
if citation["publisher_loc"]:
text += f"{citation['publisher_loc']}: "
text += citation["publisher_name"] + ". "
if citation["volume"]:
text = text.rstrip(". ")
text += f" {citation['volume']}. "
if citation["page"]:
text = text.rstrip(". ")
if citation["volume"]:
text += ":"
text += citation["page"] + ". "
if citation["year"]:
text = text.rstrip(". ")
text += f" ({citation['year']})."
if citation["pub_id"]:
text = text.rstrip(".") + ". "
text += citation["pub_id"]
_log.debug("Citation flattened")
return text
def _add_equation(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
math_text = node.text
math_parts = math_text.split("$$")
if len(math_parts) == 3:
math_formula = math_parts[1]
doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)
return
def _add_figure_captions(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
label_node = node.xpath("label")
label: Optional[str] = (
JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
)
caption_node = node.xpath("caption")
caption: Optional[str]
if len(caption_node) > 0:
caption = ""
for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"):
continue
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip()
else:
caption = None
# TODO: format label vs caption once styling is supported
fig_text: str = f"{label}{' ' if label and caption else ''}{caption or ''}"
fig_caption: Optional[TextItem] = (
doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
if fig_text
else None
)
doc.add_picture(parent=parent, caption=fig_caption)
return
# TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
# def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
# new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
# for child in node.iterchildren(tag="fn"):
# text = JatsDocumentBackend._get_text(child)
# doc.add_list_item(text=text, parent=new_parent)
def _add_metadata(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
self._add_title(doc, xml_components)
self._add_authors(doc, xml_components)
self._add_abstract(doc, xml_components)
return
def _add_table(
self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")
if not isinstance(table_tag, Tag):
return
data = HTMLDocumentBackend.parse_table_data(table_tag)
# TODO: format label vs caption once styling is supported
label = table_xml_component["label"]
caption = table_xml_component["caption"]
table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
table_caption: Optional[TextItem] = (
doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
if table_text
else None
)
if data is not None:
doc.add_table(data=data, parent=parent, caption=table_caption)
return
def _add_tables(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
table: Table = {"label": "", "caption": "", "content": ""}
# Content
if len(node.xpath("table")) > 0:
table_content_node = node.xpath("table")[0]
elif len(node.xpath("alternatives/table")) > 0:
table_content_node = node.xpath("alternatives/table")[0]
else:
table_content_node = None
if table_content_node is not None:
table["content"] = etree.tostring(table_content_node).decode("utf-8")
# Caption
caption_node = node.xpath("caption")
caption: Optional[str]
if caption_node:
caption = ""
for caption_par in list(caption_node[0]):
if caption_par.xpath(".//supplementary-material"):
continue
caption += JatsDocumentBackend._get_text(caption_par).strip() + " "
caption = caption.strip()
else:
caption = None
if caption is not None:
table["caption"] = caption
# Label
if len(node.xpath("label")) > 0:
table["label"] = node.xpath("label")[0].text
try:
self._add_table(doc, parent, table)
except Exception as e:
_log.warning(f"Skipping unsupported table in {str(self.file)}")
pass
return
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
self.root = doc.add_text(
parent=None,
text=xml_components["title"],
label=DocItemLabel.TITLE,
)
return
def _walk_linear(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
skip_tags = ["term"]
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
new_parent: NodeItem = parent
node_text: str = (
node.text.replace("\n", " ")
if (node.tag not in skip_tags and node.text)
else ""
)
for child in list(node):
stop_walk: bool = False
# flush text into TextItem for some tags in paragraph nodes
if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
doc.add_text(
label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
)
node_text = ""
# add elements and decide whether to stop walking
if child.tag in ("sec", "ack"):
header = child.xpath("title|label")
text: Optional[str] = None
if len(header) > 0:
text = JatsDocumentBackend._get_text(header[0])
elif child.tag == "ack":
text = DEFAULT_HEADER_ACKNOWLEDGMENTS
if text:
new_parent = doc.add_heading(text=text, parent=parent)
elif child.tag == "list":
new_parent = doc.add_group(
label=GroupLabel.LIST, name="list", parent=parent
)
elif child.tag == "list-item":
# TODO: address any type of content (another list, formula,...)
# TODO: address list type and item label
text = JatsDocumentBackend._get_text(child).strip()
new_parent = doc.add_list_item(text=text, parent=parent)
stop_walk = True
elif child.tag == "fig":
self._add_figure_captions(doc, parent, child)
stop_walk = True
elif child.tag == "table-wrap":
self._add_tables(doc, parent, child)
stop_walk = True
elif child.tag == "supplementary-material":
stop_walk = True
elif child.tag == "fn-group":
# header = child.xpath(".//title") or child.xpath(".//label")
# if header:
# text = JatsDocumentBackend._get_text(header[0])
# fn_parent = doc.add_heading(text=text, parent=new_parent)
# self._add_footnote_group(doc, fn_parent, child)
stop_walk = True
elif child.tag == "ref-list" and node.tag != "ref-list":
header = child.xpath("title|label")
text = (
JatsDocumentBackend._get_text(header[0])
if len(header) > 0
else DEFAULT_HEADER_REFERENCES
)
new_parent = doc.add_heading(text=text, parent=parent)
new_parent = doc.add_group(
parent=new_parent, label=GroupLabel.LIST, name="list"
)
elif child.tag == "element-citation":
text = self._parse_element_citation(child)
self._add_citation(doc, parent, text)
stop_walk = True
elif child.tag == "mixed-citation":
text = JatsDocumentBackend._get_text(child).strip()
self._add_citation(doc, parent, text)
stop_walk = True
elif child.tag == "tex-math":
self._add_equation(doc, parent, child)
stop_walk = True
elif child.tag == "inline-formula":
# TODO: address inline formulas when supported by docling-core
stop_walk = True
# step into child
if not stop_walk:
new_text = self._walk_linear(doc, new_parent, child)
if not (node.getparent().tag == "p" and node.tag in flush_tags):
node_text += new_text
# pick up the tail text
node_text += child.tail.replace("\n", " ") if child.tail else ""
# create paragraph
if node.tag == "p" and node_text.strip():
doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
return ""
else:
# backpropagate the text
return node_text
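With the datamodel changes further below registering `InputFormat.XML_JATS` for `.xml`/`.nxml` files, the new backend is reachable through the standard converter entry point. A minimal usage sketch; the file name is illustrative, and it assumes the default converter detects the JATS format automatically:

from docling.document_converter import DocumentConverter

# Convert a JATS-tagged article, e.g. an .nxml file from PubMed Central
# (file name is illustrative).
converter = DocumentConverter()
result = converter.convert("article.nxml")

# The resulting DoclingDocument carries the title, authors, abstract, body
# sections, tables, figure captions and references parsed by the backend above.
print(result.document.export_to_markdown())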

View File

@ -1,592 +0,0 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Any, Set, Union
import lxml
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class Paragraph(TypedDict):
text: str
headers: list[str]
class Author(TypedDict):
name: str
affiliation_names: list[str]
class Table(TypedDict):
label: str
caption: str
content: str
class FigureCaption(TypedDict):
label: str
caption: str
class Reference(TypedDict):
author_names: str
title: str
journal: str
year: str
class XMLComponents(TypedDict):
title: str
authors: list[Author]
abstract: str
paragraphs: list[Paragraph]
tables: list[Table]
figure_captions: list[FigureCaption]
references: list[Reference]
class PubMedDocumentBackend(DeclarativeDocumentBackend):
"""
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
# Initialize parents for the document hierarchy
self.parents: dict = {}
self.valid = False
try:
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.seek(0)
self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
self.valid = True
except Exception as exc:
raise RuntimeError(
f"Could not initialize PubMed backend for file with hash {self.document_hash}."
) from exc
@override
def is_valid(self) -> bool:
return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
return False
@override
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
@override
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.XML_PUBMED}
@override
def convert(self) -> DoclingDocument:
# Create empty document
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="application/xml",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Trying to convert PubMed XML document...")
# Get parsed XML components
xml_components: XMLComponents = self._parse()
# Add XML components to the document
doc = self._populate_document(doc, xml_components)
return doc
def _parse_title(self) -> str:
title: str = " ".join(
[
t.replace("\n", "")
for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
]
)
return title
def _parse_authors(self) -> list[Author]:
# Get mapping between affiliation ids and names
affiliation_names = []
for affiliation_node in self.tree.xpath(".//aff[@id]"):
affiliation_names.append(
": ".join([t for t in affiliation_node.itertext() if t != "\n"])
)
affiliation_ids_names = {
id: name
for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
}
# Get author names and affiliation names
authors: list[Author] = []
for author_node in self.tree.xpath(
'.//contrib-group/contrib[@contrib-type="author"]'
):
author: Author = {
"name": "",
"affiliation_names": [],
}
# Affiliation names
affiliation_ids = [
a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
]
for id in affiliation_ids:
if id in affiliation_ids_names:
author["affiliation_names"].append(affiliation_ids_names[id])
# Name
author["name"] = (
author_node.xpath("name/surname")[0].text
+ " "
+ author_node.xpath("name/given-names")[0].text
)
authors.append(author)
return authors
def _parse_abstract(self) -> str:
texts = []
for abstract_node in self.tree.xpath(".//abstract"):
for text in abstract_node.itertext():
texts.append(text.replace("\n", ""))
abstract: str = "".join(texts)
return abstract
def _parse_main_text(self) -> list[Paragraph]:
paragraphs: list[Paragraph] = []
for paragraph_node in self.tree.xpath("//body//p"):
# Skip captions
if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
continue
paragraph: Paragraph = {"text": "", "headers": []}
# Text
paragraph["text"] = "".join(
[t.replace("\n", "") for t in paragraph_node.itertext()]
)
# Header
path = "../title"
while len(paragraph_node.xpath(path)) > 0:
paragraph["headers"].append(
"".join(
[
t.replace("\n", "")
for t in paragraph_node.xpath(path)[0].itertext()
]
)
)
path = "../" + path
paragraphs.append(paragraph)
return paragraphs
def _parse_tables(self) -> list[Table]:
tables: list[Table] = []
for table_node in self.tree.xpath(".//body//table-wrap"):
table: Table = {"label": "", "caption": "", "content": ""}
# Content
if len(table_node.xpath("table")) > 0:
table_content_node = table_node.xpath("table")[0]
elif len(table_node.xpath("alternatives/table")) > 0:
table_content_node = table_node.xpath("alternatives/table")[0]
else:
table_content_node = None
if table_content_node != None:
table["content"] = etree.tostring(table_content_node).decode("utf-8")
# Caption
if len(table_node.xpath("caption/p")) > 0:
caption_node = table_node.xpath("caption/p")[0]
elif len(table_node.xpath("caption/title")) > 0:
caption_node = table_node.xpath("caption/title")[0]
else:
caption_node = None
if caption_node != None:
table["caption"] = "".join(
[t.replace("\n", "") for t in caption_node.itertext()]
)
# Label
if len(table_node.xpath("label")) > 0:
table["label"] = table_node.xpath("label")[0].text
tables.append(table)
return tables
def _parse_figure_captions(self) -> list[FigureCaption]:
figure_captions: list[FigureCaption] = []
if not (self.tree.xpath(".//fig")):
return figure_captions
for figure_node in self.tree.xpath(".//fig"):
figure_caption: FigureCaption = {
"caption": "",
"label": "",
}
# Label
if figure_node.xpath("label"):
figure_caption["label"] = "".join(
[
t.replace("\n", "")
for t in figure_node.xpath("label")[0].itertext()
]
)
# Caption
if figure_node.xpath("caption"):
caption = ""
for caption_node in figure_node.xpath("caption")[0].getchildren():
caption += (
"".join([t.replace("\n", "") for t in caption_node.itertext()])
+ "\n"
)
figure_caption["caption"] = caption
figure_captions.append(figure_caption)
return figure_captions
def _parse_references(self) -> list[Reference]:
references: list[Reference] = []
for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
reference: Reference = {
"author_names": "",
"title": "",
"journal": "",
"year": "",
}
reference_node: Any = None
for tag in ["mixed-citation", "element-citation", "citation"]:
if len(reference_node_abs.xpath(tag)) > 0:
reference_node = reference_node_abs.xpath(tag)[0]
break
if reference_node is None:
continue
if all(
not (ref_type in ["citation-type", "publication-type"])
for ref_type in reference_node.attrib.keys()
):
continue
# Author names
names = []
if len(reference_node.xpath("name")) > 0:
for name_node in reference_node.xpath("name"):
name_str = " ".join(
[t.text for t in name_node.getchildren() if (t.text != None)]
)
names.append(name_str)
elif len(reference_node.xpath("person-group")) > 0:
for name_node in reference_node.xpath("person-group")[0]:
name_str = (
name_node.xpath("given-names")[0].text
+ " "
+ name_node.xpath("surname")[0].text
)
names.append(name_str)
reference["author_names"] = "; ".join(names)
# Title
if len(reference_node.xpath("article-title")) > 0:
reference["title"] = " ".join(
[
t.replace("\n", " ")
for t in reference_node.xpath("article-title")[0].itertext()
]
)
# Journal
if len(reference_node.xpath("source")) > 0:
reference["journal"] = reference_node.xpath("source")[0].text
# Year
if len(reference_node.xpath("year")) > 0:
reference["year"] = reference_node.xpath("year")[0].text
if (
not (reference_node.xpath("article-title"))
and not (reference_node.xpath("journal"))
and not (reference_node.xpath("year"))
):
reference["title"] = reference_node.text
references.append(reference)
return references
def _parse(self) -> XMLComponents:
"""Parsing PubMed document."""
xml_components: XMLComponents = {
"title": self._parse_title(),
"authors": self._parse_authors(),
"abstract": self._parse_abstract(),
"paragraphs": self._parse_main_text(),
"tables": self._parse_tables(),
"figure_captions": self._parse_figure_captions(),
"references": self._parse_references(),
}
return xml_components
def _populate_document(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> DoclingDocument:
self._add_title(doc, xml_components)
self._add_authors(doc, xml_components)
self._add_abstract(doc, xml_components)
self._add_main_text(doc, xml_components)
if xml_components["tables"]:
self._add_tables(doc, xml_components)
if xml_components["figure_captions"]:
self._add_figure_captions(doc, xml_components)
self._add_references(doc, xml_components)
return doc
def _add_figure_captions(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
self.parents["Figures"] = doc.add_heading(
parent=self.parents["Title"], text="Figures"
)
for figure_caption_xml_component in xml_components["figure_captions"]:
figure_caption_text = (
figure_caption_xml_component["label"]
+ ": "
+ figure_caption_xml_component["caption"].strip()
)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=figure_caption_text
)
doc.add_picture(
parent=self.parents["Figures"],
caption=fig_caption,
)
return
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
self.parents["Title"] = doc.add_text(
parent=None,
text=xml_components["title"],
label=DocItemLabel.TITLE,
)
return
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
authors_affiliations: list = []
for author in xml_components["authors"]:
authors_affiliations.append(author["name"])
authors_affiliations.append(", ".join(author["affiliation_names"]))
authors_affiliations_str = "; ".join(authors_affiliations)
doc.add_text(
parent=self.parents["Title"],
text=authors_affiliations_str,
label=DocItemLabel.PARAGRAPH,
)
return
def _add_abstract(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
abstract_text: str = xml_components["abstract"]
self.parents["Abstract"] = doc.add_heading(
parent=self.parents["Title"], text="Abstract"
)
doc.add_text(
parent=self.parents["Abstract"],
text=abstract_text,
label=DocItemLabel.TEXT,
)
return
def _add_main_text(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
added_headers: list = []
for paragraph in xml_components["paragraphs"]:
if not (paragraph["headers"]):
continue
# Header
for i, header in enumerate(reversed(paragraph["headers"])):
if header in added_headers:
continue
added_headers.append(header)
if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
i - 1
] in self.parents:
parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
else:
parent = self.parents["Title"]
self.parents[header] = doc.add_heading(parent=parent, text=header)
# Paragraph text
if paragraph["headers"][0] in self.parents:
parent = self.parents[paragraph["headers"][0]]
else:
parent = self.parents["Title"]
doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
return
def _add_references(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
self.parents["References"] = doc.add_heading(
parent=self.parents["Title"], text="References"
)
current_list = doc.add_group(
parent=self.parents["References"], label=GroupLabel.LIST, name="list"
)
for reference in xml_components["references"]:
reference_text: str = ""
if reference["author_names"]:
reference_text += reference["author_names"] + ". "
if reference["title"]:
reference_text += reference["title"]
if reference["title"][-1] != ".":
reference_text += "."
reference_text += " "
if reference["journal"]:
reference_text += reference["journal"]
if reference["year"]:
reference_text += " (" + reference["year"] + ")"
if not (reference_text):
_log.debug(f"Skipping reference for: {str(self.file)}")
continue
doc.add_list_item(
text=reference_text, enumerated=False, parent=current_list
)
return
def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
self.parents["Tables"] = doc.add_heading(
parent=self.parents["Title"], text="Tables"
)
for table_xml_component in xml_components["tables"]:
try:
self._add_table(doc, table_xml_component)
except Exception as e:
_log.debug(f"Skipping unsupported table for: {str(self.file)}")
pass
return
def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")
nested_tables = table_tag.find("table")
if nested_tables:
_log.debug(f"Skipping nested table for: {str(self.file)}")
return
# Count the number of rows (number of <tr> elements)
num_rows = len(table_tag.find_all("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
for row in table_tag.find_all("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
col_count += colspan
num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(table_tag.find_all("tr")):
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
col_header = False
# Extract and print the text content of each cell
col_idx = 0
for _, html_cell in enumerate(cells):
text = html_cell.text
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
while grid[row_idx][col_idx] != None:
col_idx += 1
for r in range(row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
table_caption = doc.add_text(
label=DocItemLabel.CAPTION,
text=table_xml_component["label"] + ": " + table_xml_component["caption"],
)
doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
return

View File

@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
from enum import Enum, unique
from io import BytesIO
from pathlib import Path
from typing import Any, Final, Optional, Union
from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
if name == self.Element.TITLE.value:
if text:
self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
text=text,
)
self.level += 1
@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
abstract_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
claims_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
for text in self.claims:
self.doc.add_text(
@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=text,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.text = ""
@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading(
text=text,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
self.text = ""
@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table(
data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
def _apply_style(self, text: str, style_tag: str) -> str:
@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
if self.Element.TITLE.value in self.property and text.strip():
title = text.strip()
self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
text=title,
)
self.level += 1
@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading(
text=text.strip(),
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
claims_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
for text in self.claims:
self.doc.add_text(
@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
abstract_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
self.doc.add_text(
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=paragraph,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
elif self.Element.CLAIM.value in self.property:
# we may need a space after a paragraph in claim text
@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table(
data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
def _apply_style(self, text: str, style_tag: str) -> str:
@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading(
heading.value,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
if field == self.Field.TITLE.value:
self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], text=value # type: ignore[arg-type]
parent=self.parents[self.level], text=value
)
self.level += 1
@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=value,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text="",
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
elif (
@ -996,10 +996,10 @@ class PatentUsptoGrantAps(PatentUspto):
last_claim = self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text="",
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
last_claim.text += f" {value}" if last_claim.text else value
last_claim.text += f" {value.strip()}" if last_claim.text else value.strip()
elif field == self.Field.CAPTION.value and section in (
self.Section.SUMMARY.value,
@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading(
value,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=value,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
title = text.strip()
if title:
self.parents[self.level + 1] = self.doc.add_text(
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
label=DocItemLabel.TITLE,
text=title,
)
@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
abstract_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
claims_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
for text in self.claims:
self.doc.add_text(
@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
self.parents[self.level + 1] = self.doc.add_heading(
text=text,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
else:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=text,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.text = ""
@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table(
data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
def _apply_style(self, text: str, style_tag: str) -> str:
@ -1406,6 +1406,10 @@ class XmlTable:
http://oasis-open.org/specs/soextblx.dtd
"""
class ColInfo(TypedDict):
ncols: int
colinfo: list[dict]
class MinColInfoType(TypedDict):
offset: list[int]
colwidth: list[int]
@ -1425,7 +1429,7 @@ class XmlTable:
self.empty_text = ""
self._soup = BeautifulSoup(input, features="xml")
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
"""Create a unified range along the table groups.
Args:
@ -1532,19 +1536,26 @@ class XmlTable:
Returns:
A docling table object.
"""
tgs_align = []
tg_secs = table.find_all("tgroup")
tgs_align: list[XmlTable.ColInfo] = []
tg_secs = table("tgroup")
if tg_secs:
for tg_sec in tg_secs:
ncols = tg_sec.get("cols", None)
if ncols:
ncols = int(ncols)
tg_align = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec.find_all("colspec")
if not isinstance(tg_sec, Tag):
continue
col_val = tg_sec.get("cols")
ncols = (
int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec("colspec")
if cs_secs:
for cs_sec in cs_secs:
colname = cs_sec.get("colname", None)
colwidth = cs_sec.get("colwidth", None)
if not isinstance(cs_sec, Tag):
continue
colname = cs_sec.get("colname")
colwidth = cs_sec.get("colwidth")
tg_align["colinfo"].append(
{"colname": colname, "colwidth": colwidth}
)
@ -1565,16 +1576,23 @@ class XmlTable:
table_data: list[TableCell] = []
i_row_global = 0
is_row_empty: bool = True
tg_secs = table.find_all("tgroup")
tg_secs = table("tgroup")
if tg_secs:
for itg, tg_sec in enumerate(tg_secs):
if not isinstance(tg_sec, Tag):
continue
tg_range = tgs_range[itg]
row_secs = tg_sec.find_all(["row", "tr"])
row_secs = tg_sec(["row", "tr"])
if row_secs:
for row_sec in row_secs:
entry_secs = row_sec.find_all(["entry", "td"])
is_header: bool = row_sec.parent.name in ["thead"]
if not isinstance(row_sec, Tag):
continue
entry_secs = row_sec(["entry", "td"])
is_header: bool = (
row_sec.parent is not None
and row_sec.parent.name == "thead"
)
ncols = 0
local_row: list[TableCell] = []
@ -1582,23 +1600,26 @@ class XmlTable:
if entry_secs:
wrong_nbr_cols = False
for ientry, entry_sec in enumerate(entry_secs):
if not isinstance(entry_sec, Tag):
continue
text = entry_sec.get_text().strip()
# start-end
namest = entry_sec.attrs.get("namest", None)
nameend = entry_sec.attrs.get("nameend", None)
if isinstance(namest, str) and namest.isnumeric():
namest = int(namest)
else:
namest = ientry + 1
namest = entry_sec.get("namest")
nameend = entry_sec.get("nameend")
start = (
int(namest)
if isinstance(namest, str) and namest.isnumeric()
else ientry + 1
)
if isinstance(nameend, str) and nameend.isnumeric():
nameend = int(nameend)
end = int(nameend)
shift = 0
else:
nameend = ientry + 2
end = ientry + 2
shift = 1
if nameend > len(tg_range["cell_offst"]):
if end > len(tg_range["cell_offst"]):
wrong_nbr_cols = True
self.nbr_messages += 1
if self.nbr_messages <= self.max_nbr_messages:
@ -1608,8 +1629,8 @@ class XmlTable:
break
range_ = [
tg_range["cell_offst"][namest - 1],
tg_range["cell_offst"][nameend - 1] - shift,
tg_range["cell_offst"][start - 1],
tg_range["cell_offst"][end - 1] - shift,
]
# add row and replicate cell if needed
@ -1668,7 +1689,7 @@ class XmlTable:
A docling table data.
"""
section = self._soup.find("table")
if section is not None:
if isinstance(section, Tag):
table = self._parse_table(section)
if table.num_rows == 0 or table.num_cols == 0:
_log.warning("The parsed USPTO table is empty")
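The `namest`/`nameend` handling above maps OASIS exchange-table column attributes onto the unified column offsets. A small illustrative sketch of the same start/end/shift arithmetic; the helper name is not part of the change:

from typing import Optional, Tuple

def entry_col_range(
    ientry: int, namest: Optional[str], nameend: Optional[str]
) -> Tuple[int, int, int]:
    """Illustrative helper mirroring the start/end/shift computation above."""
    start = (
        int(namest) if isinstance(namest, str) and namest.isnumeric() else ientry + 1
    )
    if isinstance(nameend, str) and nameend.isnumeric():
        end, shift = int(nameend), 0
    else:
        end, shift = ientry + 2, 1
    return start, end, shift

# Example: an entry without explicit attributes spans a single column; the
# third entry (ientry=2) yields (3, 4, 1), which the caller above turns into
# [cell_offst[start - 1], cell_offst[end - 1] - shift].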

View File

@ -1,18 +1,18 @@
import importlib
import json
import logging
import platform
import re
import sys
import tempfile
import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer
from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter, ValidationError
from pydantic import TypeAdapter
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -65,10 +65,15 @@ def version_callback(value: bool):
docling_core_version = importlib.metadata.version("docling-core")
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
docling_parse_version = importlib.metadata.version("docling-parse")
platform_str = platform.platform()
py_impl_version = sys.implementation.cache_tag
py_lang_version = platform.python_version()
print(f"Docling version: {docling_version}")
print(f"Docling Core version: {docling_core_version}")
print(f"Docling IBM Models version: {docling_ibm_models_version}")
print(f"Docling Parse version: {docling_parse_version}")
print(f"Python: {py_impl_version} ({py_lang_version})")
print(f"Platform: {platform_str}")
raise typer.Exit()
@ -205,17 +210,42 @@ def convert(
table_mode: Annotated[
TableFormerMode,
typer.Option(..., help="The mode to use in the table structure model."),
] = TableFormerMode.FAST,
] = TableFormerMode.ACCURATE,
enrich_code: Annotated[
bool,
typer.Option(..., help="Enable the code enrichment model in the pipeline."),
] = False,
enrich_formula: Annotated[
bool,
typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
] = False,
enrich_picture_classes: Annotated[
bool,
typer.Option(
...,
help="Enable the picture classification enrichment model in the pipeline.",
),
] = False,
enrich_picture_description: Annotated[
bool,
typer.Option(..., help="Enable the picture description model in the pipeline."),
] = False,
artifacts_path: Annotated[
Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."),
] = None,
enable_remote_services: Annotated[
bool,
typer.Option(
..., help="Must be enabled when using models connecting to remote services."
),
] = False,
abort_on_error: Annotated[
bool,
typer.Option(
...,
"--abort-on-error/--no-abort-on-error",
help="If enabled, the bitmap content will be processed using OCR.",
help="If enabled, the processing will be aborted when the first error is encountered.",
),
] = False,
output: Annotated[
@ -356,10 +386,15 @@ def convert(
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options = PdfPipelineOptions(
enable_remote_services=enable_remote_services,
accelerator_options=accelerator_options,
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
do_code_enrichment=enrich_code,
do_formula_enrichment=enrich_formula,
do_picture_description=enrich_picture_description,
do_picture_classification=enrich_picture_classes,
document_timeout=document_timeout,
)
pipeline_options.table_structure_options.do_cell_matching = (

131
docling/cli/models.py Normal file
View File

@ -0,0 +1,131 @@
import logging
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Optional
import typer
from rich.console import Console
from rich.logging import RichHandler
from docling.datamodel.settings import settings
from docling.utils.model_downloader import download_models
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
console = Console()
err_console = Console(stderr=True)
app = typer.Typer(
name="Docling models helper",
no_args_is_help=True,
add_completion=False,
pretty_exceptions_enable=False,
)
class _AvailableModels(str, Enum):
LAYOUT = "layout"
TABLEFORMER = "tableformer"
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
SMOLVLM = "smolvlm"
GRANITE_VISION = "granite_vision"
EASYOCR = "easyocr"
_default_models = [
_AvailableModels.LAYOUT,
_AvailableModels.TABLEFORMER,
_AvailableModels.CODE_FORMULA,
_AvailableModels.PICTURE_CLASSIFIER,
_AvailableModels.EASYOCR,
]
@app.command("download")
def download(
output_dir: Annotated[
Path,
typer.Option(
...,
"-o",
"--output-dir",
help="The directory where to download the models.",
),
] = (settings.cache_dir / "models"),
force: Annotated[
bool, typer.Option(..., help="If true, the download will be forced.")
] = False,
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
bool,
typer.Option(
...,
"--all",
help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
show_default=True,
),
] = False,
quiet: Annotated[
bool,
typer.Option(
...,
"-q",
"--quiet",
help="No extra output is generated, the CLI prints only the directory with the cached models.",
),
] = False,
):
if models and all:
raise typer.BadParameter(
"Cannot simultaneously set 'all' parameter and specify models to download."
)
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
level=logging.INFO,
format="[blue]%(message)s[/blue]",
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
output_dir = download_models(
output_dir=output_dir,
force=force,
progress=(not quiet),
with_layout=_AvailableModels.LAYOUT in to_download,
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)
if quiet:
typer.echo(output_dir)
else:
typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
console.print(
"\n",
"Docling can now be configured for running offline using the local artifacts.\n\n",
"Using the CLI:",
f"`docling --artifacts-path={output_dir} FILE`",
"\n",
"Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
)
click_app = typer.main.get_command(app)
if __name__ == "__main__":
app()
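For reference, a minimal sketch of driving the same download from Python via `download_models` (the output directory is an arbitrary example; the flags mirror the CLI's default model set):

```python
from pathlib import Path

from docling.utils.model_downloader import download_models

# Download the default model set into a local folder (example location).
artifacts_path = download_models(
    output_dir=Path("./docling-artifacts"),
    force=False,
    progress=True,
    with_layout=True,
    with_tableformer=True,
    with_code_formula=True,
    with_picture_classifier=True,
    with_smolvlm=False,
    with_granite_vision=False,
    with_easyocr=True,
)
print(f"Models downloaded into: {artifacts_path}")
```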
docling/cli/tools.py (new file, 17 lines)
@ -0,0 +1,17 @@
import typer
from docling.cli.models import app as models_app
app = typer.Typer(
name="Docling helpers",
no_args_is_help=True,
add_completion=False,
pretty_exceptions_enable=False,
)
app.add_typer(models_app, name="models")
click_app = typer.main.get_command(app)
if __name__ == "__main__":
app()
@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.doc import (
BoundingBox,
DocItemLabel,
NodeItem,
PictureDataType,
Size,
TableCell,
@ -33,13 +34,15 @@ class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
MD = "md"
CSV = "csv"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
class OutputFormat(str, Enum):
@ -56,11 +59,13 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.XML_JATS: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
@ -74,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"],
InputFormat.XML_JATS: ["application/xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",
@ -85,10 +90,12 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
InputFormat.CSV: ["text/csv"],
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
@ -147,6 +154,10 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
class VlmPrediction(BaseModel):
text: str = ""
class ContainerElement(
BasePageElement
): # Used for Form and Key-Value-Regions, only for typing.
@ -190,6 +201,7 @@ class PagePredictions(BaseModel):
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
vlm_response: Optional[VlmPrediction] = None
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
@ -201,6 +213,13 @@ class AssembledUnit(BaseModel):
headers: List[PageElement] = []
class ItemAndImageEnrichmentElement(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
item: NodeItem
image: Image
class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
@ -219,12 +238,28 @@ class Page(BaseModel):
{}
) # Cache of images in different scales. By default it is cleared during assembling.
def get_image(self, scale: float = 1.0) -> Optional[Image]:
def get_image(
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
) -> Optional[Image]:
if self._backend is None:
return self._image_cache.get(scale, None)
if not scale in self._image_cache:
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
return self._image_cache[scale]
if cropbox is None:
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
else:
return self._backend.get_page_image(scale=scale, cropbox=cropbox)
if cropbox is None:
return self._image_cache[scale]
else:
page_im = self._image_cache[scale]
assert self.size is not None
return page_im.crop(
cropbox.to_top_left_origin(page_height=self.size.height)
.scaled(scale=scale)
.as_tuple()
)
@property
def image(self) -> Optional[Image]:
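A small sketch of the extended `get_image` signature with a crop box (the coordinates are illustrative and the `Page` is assumed to come from an in-progress conversion with a live backend):

```python
from typing import Optional

from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL import Image

from docling.datamodel.base_models import Page


def crop_region(page: Page, scale: float = 2.0) -> Optional[Image.Image]:
    """Crop an illustrative region of the rendered page at the given scale."""
    region = BoundingBox(l=50, t=100, r=300, b=200, coord_origin=CoordOrigin.TOPLEFT)
    return page.get_image(scale=scale, cropbox=region)
```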
@ -1,3 +1,4 @@
import csv
import logging
import re
from enum import Enum
@ -157,6 +158,8 @@ class InputDocument(BaseModel):
self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages:
self.valid = False
elif self.page_count < self.limits.page_range[0]:
self.valid = False
except (FileNotFoundError, OSError) as e:
self.valid = False
@ -294,6 +297,7 @@ class _DocumentConversionInput(BaseModel):
mime = _DocumentConversionInput._mime_from_extension(ext)
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
@ -329,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
):
input_format = InputFormat.XML_USPTO
if (
InputFormat.XML_PUBMED in formats
and "/NLM//DTD JATS" in xml_doctype
if InputFormat.XML_JATS in formats and (
"JATS-journalpublishing" in xml_doctype
or "JATS-archive" in xml_doctype
):
input_format = InputFormat.XML_PUBMED
input_format = InputFormat.XML_JATS
elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
@ -350,6 +354,12 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.CSV]:
mime = FormatToMimeType[InputFormat.CSV][0]
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
elif ext in FormatToExtensions[InputFormat.PDF]:
mime = FormatToMimeType[InputFormat.PDF][0]
return mime
@staticmethod
@ -386,3 +396,32 @@ class _DocumentConversionInput(BaseModel):
return "application/xml"
return None
@staticmethod
def _detect_csv(
content: bytes,
) -> Optional[Literal["text/csv"]]:
"""Guess the mime type of a CSV file from its content.
Args:
content: A short piece of a document from its beginning.
Returns:
The mime type of a CSV file, or None if the content does
not match any of the format.
"""
content_str = content.decode("ascii", errors="ignore").strip()
# Ensure there's at least one newline (CSV is usually multi-line)
if "\n" not in content_str:
return None
# Use csv.Sniffer to detect CSV characteristics
try:
dialect = csv.Sniffer().sniff(content_str)
if dialect.delimiter in {",", ";", "\t", "|"}: # Common delimiters
return "text/csv"
except csv.Error:
return None
return None
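A small illustration of the `csv.Sniffer` heuristic this detector relies on (the sample string is made up):

```python
import csv

sample = "name;age;city\nAlice;30;Zurich\nBob;25;Geneva\n"
try:
    dialect = csv.Sniffer().sniff(sample)
    print(f"delimiter detected: {dialect.delimiter!r}")  # typically ';'
except csv.Error:
    print("content does not look like CSV")
```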
@ -1,13 +1,28 @@
import sys
from pathlib import Path
from typing import Annotated, Optional, Tuple
from pydantic import BaseModel
from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
if v[0] < 1 or v[1] < v[0]:
raise ValueError(
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
)
return v
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
class DocumentLimits(BaseModel):
max_num_pages: int = sys.maxsize
max_file_size: int = sys.maxsize
page_range: PageRange = DEFAULT_PAGE_RANGE
class BatchConcurrencySettings(BaseModel):
@ -46,5 +61,8 @@ class AppSettings(BaseSettings):
perf: BatchConcurrencySettings
debug: DebugSettings
cache_dir: Path = Path.home() / ".cache" / "docling"
artifacts_path: Optional[Path] = None
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@ -1,21 +1,24 @@
import logging
import math
import sys
import time
from functools import partial
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
@ -30,7 +33,12 @@ from docling.datamodel.document import (
_DocumentConversionInput,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings
from docling.datamodel.settings import (
DEFAULT_PAGE_RANGE,
DocumentLimits,
PageRange,
settings,
)
from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
@ -54,6 +62,11 @@ class FormatOption(BaseModel):
return self
class CsvFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
@ -89,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class XMLPubMedFormatOption(FormatOption):
class XMLJatsFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
class ImageFormatOption(FormatOption):
@ -106,6 +119,9 @@ class PdfFormatOption(FormatOption):
def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = {
InputFormat.CSV: FormatOption(
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
),
InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
),
@ -127,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
InputFormat.XML_JATS: FormatOption(
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
@ -136,6 +152,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options
@ -180,6 +199,7 @@ class DocumentConverter:
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> ConversionResult:
all_res = self.convert_all(
source=[source],
@ -187,6 +207,7 @@ class DocumentConverter:
max_num_pages=max_num_pages,
max_file_size=max_file_size,
headers=headers,
page_range=page_range,
)
return next(all_res)
@ -198,10 +219,12 @@ class DocumentConverter:
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> Iterator[ConversionResult]:
limits = DocumentLimits(
max_num_pages=max_num_pages,
max_file_size=max_file_size,
page_range=page_range,
)
conv_input = _DocumentConversionInput(
path_or_stream_iterator=source, limits=limits, headers=headers
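A minimal usage sketch of the new `page_range` argument (the input path is a placeholder):

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
# Convert only pages 1-4 of the input document (the path is a placeholder).
result = converter.convert("report.pdf", page_range=(1, 4))
print(result.document.export_to_markdown())
```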
@ -4,3 +4,7 @@ class BaseError(RuntimeError):
class ConversionError(BaseError):
pass
class OperationNotAllowed(BaseError):
pass
@ -1,10 +1,12 @@
from abc import ABC, abstractmethod
from typing import Any, Iterable
from typing import Any, Generic, Iterable, Optional
from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar
from docling.datamodel.base_models import Page
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
class BasePageModel(ABC):
@ -15,14 +17,71 @@ class BasePageModel(ABC):
pass
class BaseEnrichmentModel(ABC):
EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
elements_batch_size: int = settings.perf.elements_batch_size
@abstractmethod
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
pass
@abstractmethod
def __call__(
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]:
def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
) -> Optional[EnrichElementT]:
pass
@abstractmethod
def __call__(
self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
) -> Iterable[NodeItem]:
pass
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
) -> Optional[NodeItem]:
if self.is_processable(doc=conv_res.document, element=element):
return element
return None
class BaseItemAndImageEnrichmentModel(
GenericEnrichmentModel[ItemAndImageEnrichmentElement]
):
images_scale: float
expansion_factor: float = 0.0
def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
) -> Optional[ItemAndImageEnrichmentElement]:
if not self.is_processable(doc=conv_res.document, element=element):
return None
assert isinstance(element, DocItem)
element_prov = element.prov[0]
bbox = element_prov.bbox
width = bbox.r - bbox.l
height = bbox.t - bbox.b
# TODO: move to a utility in the BoundingBox class
expanded_bbox = BoundingBox(
l=bbox.l - width * self.expansion_factor,
t=bbox.t + height * self.expansion_factor,
r=bbox.r + width * self.expansion_factor,
b=bbox.b - height * self.expansion_factor,
coord_origin=bbox.coord_origin,
)
page_ix = element_prov.page_no - 1
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=expanded_bbox
)
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
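A minimal sketch of what a custom enrichment model built on these new base classes could look like (the class below is illustrative, not part of this change):

```python
from typing import Iterable

from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem

from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.models.base_model import BaseItemAndImageEnrichmentModel


class PictureSizeLogger(BaseItemAndImageEnrichmentModel):
    """Toy enrichment model: reports the pixel size of each cropped picture."""

    images_scale = 2.0
    expansion_factor = 0.05

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        return isinstance(element, PictureItem)

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        for el in element_batch:
            print(el.item.self_ref, el.image.size)
            yield el.item
```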
@ -0,0 +1,330 @@
import re
from collections import Counter
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union
import numpy as np
from docling_core.types.doc import (
CodeItem,
DocItemLabel,
DoclingDocument,
NodeItem,
TextItem,
)
from docling_core.types.doc.labels import CodeLanguageLabel
from PIL import Image, ImageOps
from pydantic import BaseModel
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.utils.accelerator_utils import decide_device
class CodeFormulaModelOptions(BaseModel):
"""
Configuration options for the CodeFormulaModel.
Attributes
----------
kind : str
Type of the model. Fixed value "code_formula".
do_code_enrichment : bool
True if code enrichment is enabled, False otherwise.
do_formula_enrichment : bool
True if formula enrichment is enabled, False otherwise.
"""
kind: Literal["code_formula"] = "code_formula"
do_code_enrichment: bool = True
do_formula_enrichment: bool = True
class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
"""
Model for processing and enriching documents with code and formula predictions.
Attributes
----------
enabled : bool
True if the model is enabled, False otherwise.
options : CodeFormulaModelOptions
Configuration options for the CodeFormulaModel.
code_formula_model : CodeFormulaPredictor
The predictor model for code and formula processing.
Methods
-------
__init__(self, enabled, artifacts_path, accelerator_options, code_formula_options)
Initializes the CodeFormulaModel with the given configuration options.
is_processable(self, doc, element)
Determines if a given element in a document can be processed by the model.
__call__(self, doc, element_batch)
Processes the given batch of elements and enriches them with predictions.
"""
_model_repo_folder = "ds4sd--CodeFormula"
elements_batch_size = 5
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
expansion_factor = 0.18
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: CodeFormulaModelOptions,
accelerator_options: AcceleratorOptions,
):
"""
Initializes the CodeFormulaModel with the given configuration.
Parameters
----------
enabled : bool
True if the model is enabled, False otherwise.
artifacts_path : Path
Path to the directory containing the model artifacts.
options : CodeFormulaModelOptions
Configuration options for the model.
accelerator_options : AcceleratorOptions
Options specifying the device and number of threads for acceleration.
"""
self.enabled = enabled
self.options = options
if self.enabled:
device = decide_device(accelerator_options.device)
from docling_ibm_models.code_formula_model.code_formula_predictor import (
CodeFormulaPredictor,
)
if artifacts_path is None:
artifacts_path = self.download_models()
else:
artifacts_path = artifacts_path / self._model_repo_folder
self.code_formula_model = CodeFormulaPredictor(
artifacts_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
@staticmethod
def download_models(
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/CodeFormula",
force_download=force,
local_dir=local_dir,
revision="v1.0.2",
)
return Path(download_path)
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
"""
Determines if a given element in a document can be processed by the model.
Parameters
----------
doc : DoclingDocument
The document being processed.
element : NodeItem
The element within the document to check.
Returns
-------
bool
True if the element can be processed, False otherwise.
"""
return self.enabled and (
(isinstance(element, CodeItem) and self.options.do_code_enrichment)
or (
isinstance(element, TextItem)
and element.label == DocItemLabel.FORMULA
and self.options.do_formula_enrichment
)
)
def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
"""Extracts a programming language from the beginning of a string.
This function checks if the input string starts with a pattern of the form
``<_some_language_>``. If it does, it extracts the language string and returns
a tuple of (remainder, language). Otherwise, it returns the original string
and `None`.
Args:
input_string (str): The input string, which may start with ``<_language_>``.
Returns:
Tuple[str, Optional[str]]:
A tuple where:
- The first element is either:
- The remainder of the string (everything after ``<_language_>``),
if a match is found; or
- The original string, if no match is found.
- The second element is the extracted language if a match is found;
otherwise, `None`.
"""
pattern = r"^<_([^_>]+)_>\s(.*)"
match = re.match(pattern, input_string, flags=re.DOTALL)
if match:
language = str(match.group(1)) # the captured programming language
remainder = str(match.group(2)) # everything after the <_language_>
return remainder, language
else:
return input_string, None
def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
"""
Converts a string to a corresponding `CodeLanguageLabel` enum member.
If the provided string does not match any value in `CodeLanguageLabel`,
it defaults to `CodeLanguageLabel.UNKNOWN`.
Args:
value (Optional[str]): The string representation of the code language or None.
Returns:
CodeLanguageLabel: The corresponding enum member if the value is valid,
otherwise `CodeLanguageLabel.UNKNOWN`.
"""
if not isinstance(value, str):
return CodeLanguageLabel.UNKNOWN
try:
return CodeLanguageLabel(value)
except ValueError:
return CodeLanguageLabel.UNKNOWN
def _get_most_frequent_edge_color(self, pil_img: Image.Image):
"""
Compute the most frequent color along the outer edges of a PIL image.
Parameters
----------
pil_img : Image.Image
A PIL Image in any mode (L, RGB, RGBA, etc.).
Returns
-------
(int) or (tuple): The most common edge color as a scalar (for grayscale) or
tuple (for RGB/RGBA).
"""
# Convert to NumPy array for easy pixel access
img_np = np.array(pil_img)
if img_np.ndim == 2:
# Grayscale-like image: shape (H, W)
# Extract edges: top row, bottom row, left col, right col
top = img_np[0, :] # shape (W,)
bottom = img_np[-1, :] # shape (W,)
left = img_np[:, 0] # shape (H,)
right = img_np[:, -1] # shape (H,)
# Concatenate all edges
edges = np.concatenate([top, bottom, left, right])
# Count frequencies
freq = Counter(edges.tolist())
most_common_value, _ = freq.most_common(1)[0]
return int(most_common_value) # single channel color
else:
# Color image: shape (H, W, C)
top = img_np[0, :, :] # shape (W, C)
bottom = img_np[-1, :, :] # shape (W, C)
left = img_np[:, 0, :] # shape (H, C)
right = img_np[:, -1, :] # shape (H, C)
# Concatenate edges along first axis
edges = np.concatenate([top, bottom, left, right], axis=0)
# Convert each color to a tuple for counting
edges_as_tuples = [tuple(pixel) for pixel in edges]
freq = Counter(edges_as_tuples)
most_common_value, _ = freq.most_common(1)[0]
return most_common_value # e.g. (R, G, B) or (R, G, B, A)
def _pad_with_most_frequent_edge_color(
self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
):
"""
Pads an image (PIL or NumPy array) using the most frequent edge color.
Parameters
----------
img : Union[Image.Image, np.ndarray]
The original image.
padding : tuple
Padding (left, top, right, bottom) in pixels.
Returns
-------
Image.Image: A new PIL image with the specified padding.
"""
if isinstance(img, np.ndarray):
pil_img = Image.fromarray(img)
else:
pil_img = img
most_freq_color = self._get_most_frequent_edge_color(pil_img)
padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color)
return padded_img
def __call__(
self,
doc: DoclingDocument,
element_batch: Iterable[ItemAndImageEnrichmentElement],
) -> Iterable[NodeItem]:
"""
Processes the given batch of elements and enriches them with predictions.
Parameters
----------
doc : DoclingDocument
The document being processed.
element_batch : Iterable[ItemAndImageEnrichmentElement]
A batch of elements to be processed.
Returns
-------
Iterable[Any]
An iterable of enriched elements.
"""
if not self.enabled:
for element in element_batch:
yield element.item
return
labels: List[str] = []
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[TextItem] = []
for el in element_batch:
assert isinstance(el.item, TextItem)
elements.append(el.item)
labels.append(el.item.label)
images.append(
self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
)
outputs = self.code_formula_model.predict(images, labels)
for item, output in zip(elements, outputs):
if isinstance(item, CodeItem):
output, code_language = self._extract_code_language(output)
item.code_language = self._get_code_language_enum(code_language)
item.text = output
yield item
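A sketch of enabling the new code and formula enrichment from Python, mirroring the CLI flags above (assuming the usual `PdfFormatOption`/`format_options` wiring; the input path is a placeholder):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(
    do_code_enrichment=True,
    do_formula_enrichment=True,
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("paper.pdf")  # placeholder input
```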
@ -0,0 +1,190 @@
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union
import numpy as np
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureClassificationClass,
PictureClassificationData,
PictureItem,
)
from PIL import Image
from pydantic import BaseModel
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.accelerator_utils import decide_device
class DocumentPictureClassifierOptions(BaseModel):
"""
Options for configuring the DocumentPictureClassifier.
Attributes
----------
kind : Literal["document_picture_classifier"]
Identifier for the type of classifier.
"""
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
class DocumentPictureClassifier(BaseEnrichmentModel):
"""
A model for classifying pictures in documents.
This class enriches document pictures with predicted classifications
based on a predefined set of classes.
Attributes
----------
enabled : bool
Whether the classifier is enabled for use.
options : DocumentPictureClassifierOptions
Configuration options for the classifier.
document_picture_classifier : DocumentPictureClassifierPredictor
The underlying prediction model, loaded if the classifier is enabled.
Methods
-------
__init__(enabled, artifacts_path, options, accelerator_options)
Initializes the classifier with specified configurations.
is_processable(doc, element)
Checks if the given element can be processed by the classifier.
__call__(doc, element_batch)
Processes a batch of elements and adds classification annotations.
"""
_model_repo_folder = "ds4sd--DocumentFigureClassifier"
images_scale = 2
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: DocumentPictureClassifierOptions,
accelerator_options: AcceleratorOptions,
):
"""
Initializes the DocumentPictureClassifier.
Parameters
----------
enabled : bool
Indicates whether the classifier is enabled.
artifacts_path : Optional[Union[Path, str]],
Path to the directory containing model artifacts.
options : DocumentPictureClassifierOptions
Configuration options for the classifier.
accelerator_options : AcceleratorOptions
Options for configuring the device and parallelism.
"""
self.enabled = enabled
self.options = options
if self.enabled:
device = decide_device(accelerator_options.device)
from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
DocumentFigureClassifierPredictor,
)
if artifacts_path is None:
artifacts_path = self.download_models()
else:
artifacts_path = artifacts_path / self._model_repo_folder
self.document_picture_classifier = DocumentFigureClassifierPredictor(
artifacts_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
@staticmethod
def download_models(
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/DocumentFigureClassifier",
force_download=force,
local_dir=local_dir,
revision="v1.0.1",
)
return Path(download_path)
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
"""
Determines if the given element can be processed by the classifier.
Parameters
----------
doc : DoclingDocument
The document containing the element.
element : NodeItem
The element to be checked.
Returns
-------
bool
True if the element is a PictureItem and processing is enabled; False otherwise.
"""
return self.enabled and isinstance(element, PictureItem)
def __call__(
self,
doc: DoclingDocument,
element_batch: Iterable[NodeItem],
) -> Iterable[NodeItem]:
"""
Processes a batch of elements and enriches them with classification predictions.
Parameters
----------
doc : DoclingDocument
The document containing the elements to be processed.
element_batch : Iterable[NodeItem]
A batch of pictures to classify.
Returns
-------
Iterable[NodeItem]
An iterable of NodeItem objects after processing. The field
'data.classification' is added containing the classification for each picture.
"""
if not self.enabled:
for element in element_batch:
yield element
return
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el, PictureItem)
elements.append(el)
img = el.get_image(doc)
assert img is not None
images.append(img)
outputs = self.document_picture_classifier.predict(images)
for element, output in zip(elements, outputs):
element.annotations.append(
PictureClassificationData(
provenance="DocumentPictureClassifier",
predicted_classes=[
PictureClassificationClass(
class_name=pred[0],
confidence=pred[1],
)
for pred in output
],
)
)
yield element
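A short sketch of reading back the classifications this model attaches (assumes a `DoclingDocument` produced with picture classification enabled):

```python
from docling_core.types.doc import DoclingDocument, PictureClassificationData


def print_picture_classes(doc: DoclingDocument) -> None:
    """Print the first predicted class for every classified picture in the document."""
    for picture in doc.pictures:
        for annotation in picture.annotations:
            if isinstance(annotation, PictureClassificationData) and annotation.predicted_classes:
                top = annotation.predicted_classes[0]
                print(picture.self_ref, top.class_name, f"{top.confidence:.2f}")
```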
@ -1,328 +0,0 @@
import copy
import random
from pathlib import Path
from typing import List, Union
from deepsearch_glm.andromeda_nlp import nlp_model
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import (
Figure,
PageDimensions,
PageReference,
Prov,
Ref,
)
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
from docling_core.types.legacy_doc.base import TableCell
from docling_core.types.legacy_doc.document import BaseText
from docling_core.types.legacy_doc.document import (
CCSDocumentDescription as DsDocumentDescription,
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict, TypeAdapter
from docling.datamodel.base_models import (
Cluster,
ContainerElement,
FigureElement,
Table,
TextElement,
)
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash
class GlmOptions(BaseModel):
model_config = ConfigDict(protected_namespaces=())
model_names: str = "" # e.g. "language;term;reference"
class GlmModel:
def __init__(self, options: GlmOptions):
self.options = options
self.model = nlp_model(loglevel="error", text_ordering=True)
def _to_legacy_document(self, conv_res) -> DsDocument:
title = ""
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
page=p.page_no + 1,
model="default",
)
for p in conv_res.pages
]
file_info = DsFileInfoObject(
filename=conv_res.input.file.name,
document_hash=conv_res.input.document_hash,
num_pages=conv_res.input.page_count,
page_hashes=page_hashes,
)
main_text: List[Union[Ref, BaseText]] = []
tables: List[DsSchemaTable] = []
figures: List[Figure] = []
page_no_to_page = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, Table):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(element.num_cols)
]
for i in range(element.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
min(
cell.start_col_offset_idx, element.num_cols
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
if cell.bbox is not None:
bbox = cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
else:
bbox = None
table_data[i][j] = TableCell(
text=cell.text,
bbox=bbox,
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
)
)
elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
Figure(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
payload={
"children": TypeAdapter(List[Cluster]).dump_python(
element.cluster.children
)
}, # hack to channel child clusters through GLM
)
)
elif isinstance(element, ContainerElement):
main_text.append(
BaseText(
text="",
payload={
"children": TypeAdapter(List[Cluster]).dump_python(
element.cluster.children
)
}, # hack to channel child clusters through GLM
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
)
)
page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages
if p.size is not None
]
ds_doc: DsDocument = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
clusters_to_draw = []
image = copy.deepcopy(conv_res.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0] # type: ignore
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index) # type: ignore
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
prov = ds_document.pictures[index].prov[0]
else:
prov = None
if prov and prov.page == page_no:
clusters_to_draw.append(
Cluster(
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox, # type: ignore
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(conv_res.pages[page_no].size.height),
)
)
draw = ImageDraw.Draw(image)
for c in clusters_to_draw:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"doc_page_{page_no:05}.png"
image.save(str(out_file), format="png")
# for item in ds_doc.page_dimensions:
# page_no = item.page
# draw_clusters_and_cells(ds_doc, page_no)
return docling_doc
@ -1,9 +1,10 @@
import logging
import warnings
from typing import Iterable
import zipfile
from pathlib import Path
from typing import Iterable, List, Optional
import numpy
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
@ -17,14 +18,18 @@ from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
from docling.utils.utils import download_url_with_progress
_log = logging.getLogger(__name__)
class EasyOcrModel(BaseOcrModel):
_model_repo_folder = "EasyOcr"
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
options: EasyOcrOptions,
accelerator_options: AcceleratorOptions,
):
@ -62,15 +67,55 @@ class EasyOcrModel(BaseOcrModel):
)
use_gpu = self.options.use_gpu
download_enabled = self.options.download_enabled
model_storage_directory = self.options.model_storage_directory
if artifacts_path is not None and model_storage_directory is None:
download_enabled = False
model_storage_directory = str(artifacts_path / self._model_repo_folder)
self.reader = easyocr.Reader(
lang_list=self.options.lang,
gpu=use_gpu,
model_storage_directory=self.options.model_storage_directory,
model_storage_directory=model_storage_directory,
recog_network=self.options.recog_network,
download_enabled=self.options.download_enabled,
download_enabled=download_enabled,
verbose=False,
)
@staticmethod
def download_models(
detection_models: List[str] = ["craft"],
recognition_models: List[str] = ["english_g2", "latin_g2"],
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
from easyocr.config import detection_models as det_models_dict
from easyocr.config import recognition_models as rec_models_dict
if local_dir is None:
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
local_dir.mkdir(parents=True, exist_ok=True)
# Collect models to download
download_list = []
for model_name in detection_models:
if model_name in det_models_dict:
download_list.append(det_models_dict[model_name])
for model_name in recognition_models:
if model_name in rec_models_dict["gen2"]:
download_list.append(rec_models_dict["gen2"][model_name])
# Download models
for model_details in download_list:
buf = download_url_with_progress(model_details["url"], progress=progress)
with zipfile.ZipFile(buf, "r") as zip_ref:
zip_ref.extractall(local_dir)
return local_dir
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
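A minimal sketch of pre-fetching the EasyOCR weights with the new static helper (assuming the module path `docling.models.easyocr_model`; the target directory is an example):

```python
from pathlib import Path

from docling.models.easyocr_model import EasyOcrModel

# Fetch the default detector ("craft") and recognizers into a local folder (example path).
weights_dir = EasyOcrModel.download_models(
    local_dir=Path("./easyocr-weights"),
    progress=True,
)
print(f"EasyOCR models stored in: {weights_dir}")
```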
@ -0,0 +1,180 @@
import logging
import time
from pathlib import Path
from typing import Iterable, List, Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceVlmModel(BasePageModel):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
vlm_options: HuggingFaceVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
if self.enabled:
import torch
from transformers import ( # type: ignore
AutoModelForVision2Seq,
AutoProcessor,
BitsAndBytesConfig,
)
device = decide_device(accelerator_options.device)
self.device = device
_log.debug("Available device for HuggingFace VLM: {}".format(device))
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
# PARAMETERS:
if artifacts_path is None:
artifacts_path = self.download_models(self.vlm_options.repo_id)
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
self.param_quantization_config = BitsAndBytesConfig(
load_in_8bit=vlm_options.load_in_8bit, # True,
llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0
)
self.param_quantized = vlm_options.quantized # False
self.processor = AutoProcessor.from_pretrained(artifacts_path)
if not self.param_quantized:
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
device_map=device,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
) # .to(self.device)
else:
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
device_map=device,
torch_dtype="auto",
quantization_config=self.param_quantization_config,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
) # .to(self.device)
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
# revision="v0.0.1",
)
return Path(download_path)
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=2.0) # 144dpi
# hi_res_image = page.get_image(scale=1.0) # 72dpi
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
# populate page_tags with predicted doc tags
page_tags = ""
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "This is a page from a document.",
},
{"type": "image"},
{"type": "text", "text": self.param_question},
],
}
]
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False
)
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
start_time = time.time()
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs, max_new_tokens=4096, use_cache=True
)
generation_time = time.time() - start_time
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=False,
)[0]
num_tokens = len(generated_ids[0])
page_tags = generated_texts
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")
# print(f"Page Inference Time: {inference_time:.2f} seconds")
# print(f"Total tokens on page: {num_tokens:.2f}")
# print(f"Tokens/sec: {tokens_per_second:.2f}")
# print("")
page.predictions.vlm_response = VlmPrediction(text=page_tags)
yield page
@ -1,33 +1,29 @@
import copy
import logging
import random
import time
import warnings
from pathlib import Path
from typing import Iterable, List
from typing import Iterable, Optional, Union
from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image, ImageDraw, ImageFont
from PIL import Image
from docling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
LayoutPrediction,
Page,
)
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder
from docling.utils.visualization import draw_clusters
_log = logging.getLogger(__name__)
class LayoutModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/layout"
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,
@ -40,7 +36,7 @@ class LayoutModel(BasePageModel):
DocItemLabel.PAGE_FOOTER,
DocItemLabel.CODE,
DocItemLabel.LIST_ITEM,
# "Formula",
DocItemLabel.FORMULA,
]
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
@ -49,15 +45,56 @@ class LayoutModel(BasePageModel):
FORMULA_LABEL = DocItemLabel.FORMULA
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
def __init__(
self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
):
device = decide_device(accelerator_options.device)
if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path
else:
# will become the default in the future
if (artifacts_path / self._model_repo_folder).exists():
artifacts_path = (
artifacts_path / self._model_repo_folder / self._model_path
)
elif (artifacts_path / self._model_path).exists():
warnings.warn(
"The usage of artifacts_path containing directly "
f"{self._model_path} is deprecated. Please point "
"the artifacts_path to the parent containing "
f"the {self._model_repo_folder} folder.",
DeprecationWarning,
stacklevel=3,
)
artifacts_path = artifacts_path / self._model_path
self.layout_predictor = LayoutPredictor(
artifact_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
@staticmethod
def download_models(
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.1.0",
)
return Path(download_path)
def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
):
@ -82,78 +119,9 @@ class LayoutModel(BasePageModel):
left_image = copy.deepcopy(page.image)
right_image = copy.deepcopy(page.image)
# Function to draw clusters on an image
def draw_clusters(image, clusters):
draw = ImageDraw.Draw(image, "RGBA")
# Create a smaller font for the labels
try:
font = ImageFont.truetype("arial.ttf", 12)
except OSError:
# Fallback to default font if arial is not available
font = ImageFont.load_default()
for c_tl in clusters:
all_clusters = [c_tl, *c_tl.children]
for c in all_clusters:
# Draw cells first (underneath)
cell_color = (0, 0, 0, 40) # Transparent black for cells
for tc in c.cells:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
cx0 *= scale_x
cx1 *= scale_x
cy0 *= scale_x
cy1 *= scale_y
draw.rectangle(
[(cx0, cy0), (cx1, cy1)],
outline=None,
fill=cell_color,
)
# Draw cluster rectangle
x0, y0, x1, y1 = c.bbox.as_tuple()
x0 *= scale_x
x1 *= scale_x
y0 *= scale_x
y1 *= scale_y
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
cluster_outline_color = (
*list(DocItemLabel.get_color(c.label)),
255,
)
draw.rectangle(
[(x0, y0), (x1, y1)],
outline=cluster_outline_color,
fill=cluster_fill_color,
)
# Add label name and confidence
label_text = f"{c.label.name} ({c.confidence:.2f})"
# Create semi-transparent background for text
text_bbox = draw.textbbox((x0, y0), label_text, font=font)
text_bg_padding = 2
draw.rectangle(
[
(
text_bbox[0] - text_bg_padding,
text_bbox[1] - text_bg_padding,
),
(
text_bbox[2] + text_bg_padding,
text_bbox[3] + text_bg_padding,
),
],
fill=(255, 255, 255, 180), # Semi-transparent white
)
# Draw text
draw.text(
(x0, y0),
label_text,
fill=(0, 0, 0, 255), # Solid black
font=font,
)
# Draw clusters on both images
draw_clusters(left_image, left_clusters)
draw_clusters(right_image, right_clusters)
draw_clusters(left_image, left_clusters, scale_x, scale_y)
draw_clusters(right_image, right_clusters, scale_x, scale_y)
# Combine the images side by side
combined_width = left_image.width * 2
combined_height = left_image.height
@ -182,10 +150,12 @@ class LayoutModel(BasePageModel):
else:
with TimeRecorder(conv_res, "layout"):
assert page.size is not None
page_image = page.get_image(scale=1.0)
assert page_image is not None
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
self.layout_predictor.predict(page_image)
):
label = DocItemLabel(
pred_item["label"]
@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)
class PageAssembleOptions(BaseModel):
keep_images: bool = False
pass
class PageAssembleModel(BasePageModel):
@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines)
# Text normalization
sanitized_text = sanitized_text.replace("⁄", "/")
sanitized_text = sanitized_text.replace("'", "'")
sanitized_text = sanitized_text.replace("'", "'")
sanitized_text = sanitized_text.replace("“", '"')
sanitized_text = sanitized_text.replace("”", '"')
sanitized_text = sanitized_text.replace("•", "·")
return sanitized_text.strip() # Strip any leading or trailing whitespace
def __call__(
@ -135,31 +143,6 @@ class PageAssembleModel(BasePageModel):
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
if (
not equation
): # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
elif cluster.label in LayoutModel.CONTAINER_LABELS:
container_el = ContainerElement(
label=cluster.label,
@ -174,11 +157,4 @@ class PageAssembleModel(BasePageModel):
elements=elements, headers=headers, body=body
)
# Remove page images (can be disabled)
if not self.options.keep_images:
page._image_cache = {}
# Unload backend
page._backend.unload()
yield page
@ -0,0 +1,108 @@
import base64
import io
import logging
from typing import Iterable, List, Optional
import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
_log = logging.getLogger(__name__)
class ChatMessage(BaseModel):
role: str
content: str
class ResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: str
class ResponseUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ApiResponse(BaseModel):
model_config = ConfigDict(
protected_namespaces=(),
)
id: str
model: Optional[str] = None # returned by openai
choices: List[ResponseChoice]
created: int
usage: ResponseUsage
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# elements_batch_size = 4
def __init__(
self,
enabled: bool,
enable_remote_services: bool,
options: PictureDescriptionApiOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: PictureDescriptionApiOptions
if self.enabled:
if not enable_remote_services:
raise OperationNotAllowed(
"Connections to remote services is only allowed when set explicitly. "
"pipeline_options.enable_remote_services=True."
)
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
# Note: technically we could make a batch request here,
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
for image in images:
img_io = io.BytesIO()
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": self.options.prompt,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}"
},
},
],
}
]
payload = {
"messages": messages,
**self.options.params,
}
r = requests.post(
str(self.options.url),
headers=self.options.headers,
json=payload,
timeout=self.options.timeout,
)
if not r.ok:
_log.error(f"Error calling the API. Reponse was {r.text}")
r.raise_for_status()
api_resp = ApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
yield generated_text
@ -0,0 +1,64 @@
import logging
from pathlib import Path
from typing import Any, Iterable, List, Optional, Union
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureClassificationClass,
PictureItem,
)
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
PictureDescriptionData,
)
from PIL import Image
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.base_model import (
BaseItemAndImageEnrichmentModel,
ItemAndImageEnrichmentElement,
)
class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
images_scale: float = 2.0
def __init__(
self,
enabled: bool,
options: PictureDescriptionBaseOptions,
):
self.enabled = enabled
self.options = options
self.provenance = "not-implemented"
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
return self.enabled and isinstance(element, PictureItem)
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
raise NotImplementedError
def __call__(
self,
doc: DoclingDocument,
element_batch: Iterable[ItemAndImageEnrichmentElement],
) -> Iterable[NodeItem]:
if not self.enabled:
for element in element_batch:
yield element.item
return
images: List[Image.Image] = []
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el.item, PictureItem)
elements.append(el.item)
images.append(el.image)
outputs = self._annotate_images(images)
for item, output in zip(elements, outputs):
item.annotations.append(
PictureDescriptionData(text=output, provenance=self.provenance)
)
yield item
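
A hedged sketch of a custom backend: concrete implementations only need to override _annotate_images (the class name and the dummy behaviour below are made up for illustration):

from typing import Iterable

from PIL import Image

from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel


class ToyPictureDescriber(PictureDescriptionBaseModel):
    """Toy backend: a real subclass would replace _annotate_images with a model call."""

    def __init__(self, enabled: bool, options: PictureDescriptionBaseOptions):
        super().__init__(enabled=enabled, options=options)
        self.provenance = "toy-describer"

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        for image in images:
            yield f"Picture of {image.width}x{image.height} pixels"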

View File

@ -0,0 +1,109 @@
from pathlib import Path
from typing import Iterable, Optional, Union
from PIL import Image
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
PictureDescriptionVlmOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.utils.accelerator_utils import decide_device
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Union[Path, str]],
options: PictureDescriptionVlmOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: PictureDescriptionVlmOptions
if self.enabled:
if artifacts_path is None:
artifacts_path = self.download_models(repo_id=self.options.repo_id)
else:
artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder
self.device = decide_device(accelerator_options.device)
try:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
except ImportError:
raise ImportError(
"transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
)
# Initialize processor and model
self.processor = AutoProcessor.from_pretrained(artifacts_path)
self.model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2" if self.device.startswith("cuda") else "eager"
),
).to(self.device)
self.provenance = f"{self.options.repo_id}"
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
)
return Path(download_path)
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig
# Create input messages
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": self.options.prompt},
],
},
]
# TODO: do batch generation
for image in images:
# Prepare inputs
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(self.device)
# Generate outputs
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
yield generated_texts[0].strip()
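
As a rough usage sketch, the local VLM describer can be enabled through the bundled SmolVLM preset (any PictureDescriptionVlmOptions with a compatible repo_id should work the same way):

from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    smolvlm_picture_description,
)

pipeline_options = PdfPipelineOptions(
    do_picture_description=True,
    picture_description_options=smolvlm_picture_description,
)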

View File

@ -59,6 +59,7 @@ class RapidOcrModel(BaseOcrModel):
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,
rec_model_path=self.options.rec_model_path,
rec_keys_path=self.options.rec_keys_path,
)
def __call__(
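
For context, a hedged sketch of pointing RapidOCR at explicitly provided model files, including the newly added rec_keys_path (all paths below are placeholders for locally downloaded files):

from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions

pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=RapidOcrOptions(
        det_model_path="models/det.onnx",
        cls_model_path="models/cls.onnx",
        rec_model_path="models/rec.onnx",
        rec_keys_path="models/rec_keys.txt",
    ),
)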

View File

@ -0,0 +1,389 @@
import copy
import random
from pathlib import Path
from typing import Dict, List
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
NodeItem,
ProvenanceItem,
RefItem,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from docling_core.types.legacy_doc.base import Ref
from docling_core.types.legacy_doc.document import BaseText
from docling_ibm_models.reading_order.reading_order_rb import (
PageElement as ReadingOrderPageElement,
)
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import (
BasePageElement,
Cluster,
ContainerElement,
FigureElement,
Table,
TextElement,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
class ReadingOrderOptions(BaseModel):
model_config = ConfigDict(protected_namespaces=())
model_names: str = "" # e.g. "language;term;reference"
class ReadingOrderModel:
def __init__(self, options: ReadingOrderOptions):
self.options = options
self.ro_model = ReadingOrderPredictor()
def _assembled_to_readingorder_elements(
self, conv_res: ConversionResult
) -> List[ReadingOrderPageElement]:
elements: List[ReadingOrderPageElement] = []
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
text = element.text or ""
elements.append(
ReadingOrderPageElement(
cid=len(elements),
ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
text=text,
page_no=element.page_no,
page_size=page_no_to_pages[element.page_no].size,
label=element.label,
l=bbox.l,
r=bbox.r,
b=bbox.b,
t=bbox.t,
coord_origin=bbox.coord_origin,
)
)
return elements
def _add_child_elements(
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
):
child: Cluster
for child in element.cluster.children:
c_label = child.label
c_bbox = child.bbox.to_bottom_left_origin(
doc.pages[element.page_no + 1].size.height
)
c_text = " ".join(
[
cell.text.replace("\x02", "-").strip()
for cell in child.cells
if len(cell.text.strip()) > 0
]
)
c_prov = ProvenanceItem(
page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
)
if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
else:
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
def _readingorder_elements_to_docling_doc(
self,
conv_res: ConversionResult,
ro_elements: List[ReadingOrderPageElement],
el_to_captions_mapping: Dict[int, List[int]],
el_to_footnotes_mapping: Dict[int, List[int]],
el_merges_mapping: Dict[int, List[int]],
) -> DoclingDocument:
id_to_elem = {
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
for elem in conv_res.assembled.elements
}
cid_to_rels = {rel.cid: rel for rel in ro_elements}
origin = DocumentOrigin(
mimetype="application/pdf",
filename=conv_res.input.file.name,
binary_hash=conv_res.input.document_hash,
)
doc_name = Path(origin.filename).stem
out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
for page in conv_res.pages:
page_no = page.page_no + 1
size = page.size
assert size is not None
out_doc.add_page(page_no=page_no, size=size)
current_list = None
skippable_cids = {
cid
for mapping in (
el_to_captions_mapping,
el_to_footnotes_mapping,
el_merges_mapping,
)
for lst in mapping.values()
for cid in lst
}
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
for rel in ro_elements:
if rel.cid in skippable_cids:
continue
element = id_to_elem[rel.ref.cref]
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
if isinstance(element, TextElement):
if element.label == DocItemLabel.CODE:
cap_text = element.text
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(0, len(cap_text)),
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
)
code_item = out_doc.add_code(text=cap_text, prov=prov)
if rel.cid in el_to_captions_mapping.keys():
for caption_cid in el_to_captions_mapping[rel.cid]:
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
new_cap_item = self._add_caption_or_footnote(
caption_elem, out_doc, code_item, page_height
)
code_item.captions.append(new_cap_item.get_ref())
if rel.cid in el_to_footnotes_mapping.keys():
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
footnote_elem = id_to_elem[
cid_to_rels[footnote_cid].ref.cref
]
new_footnote_item = self._add_caption_or_footnote(
footnote_elem, out_doc, code_item, page_height
)
code_item.footnotes.append(new_footnote_item.get_ref())
else:
new_item, current_list = self._handle_text_element(
element, out_doc, current_list, page_height
)
if rel.cid in el_merges_mapping.keys():
for merged_cid in el_merges_mapping[rel.cid]:
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
self._merge_elements(
element, merged_elem, new_item, page_height
)
elif isinstance(element, Table):
tbl_data = TableData(
num_rows=element.num_rows,
num_cols=element.num_cols,
table_cells=element.table_cells,
)
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(0, 0),
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
)
tbl = out_doc.add_table(
data=tbl_data, prov=prov, label=element.cluster.label
)
if rel.cid in el_to_captions_mapping.keys():
for caption_cid in el_to_captions_mapping[rel.cid]:
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
new_cap_item = self._add_caption_or_footnote(
caption_elem, out_doc, tbl, page_height
)
tbl.captions.append(new_cap_item.get_ref())
if rel.cid in el_to_footnotes_mapping.keys():
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
new_footnote_item = self._add_caption_or_footnote(
footnote_elem, out_doc, tbl, page_height
)
tbl.footnotes.append(new_footnote_item.get_ref())
# TODO: Consider adding children of Table.
elif isinstance(element, FigureElement):
cap_text = ""
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(0, len(cap_text)),
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
)
pic = out_doc.add_picture(prov=prov)
if rel.cid in el_to_captions_mapping.keys():
for caption_cid in el_to_captions_mapping[rel.cid]:
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
new_cap_item = self._add_caption_or_footnote(
caption_elem, out_doc, pic, page_height
)
pic.captions.append(new_cap_item.get_ref())
if rel.cid in el_to_footnotes_mapping.keys():
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
new_footnote_item = self._add_caption_or_footnote(
footnote_elem, out_doc, pic, page_height
)
pic.footnotes.append(new_footnote_item.get_ref())
self._add_child_elements(element, pic, out_doc)
elif isinstance(element, ContainerElement): # Form, KV region
label = element.label
group_label = GroupLabel.UNSPECIFIED
if label == DocItemLabel.FORM:
group_label = GroupLabel.FORM_AREA
elif label == DocItemLabel.KEY_VALUE_REGION:
group_label = GroupLabel.KEY_VALUE_AREA
container_el = out_doc.add_group(label=group_label)
self._add_child_elements(element, container_el, out_doc)
return out_doc
def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
assert isinstance(elem, TextElement)
text = elem.text
prov = ProvenanceItem(
page_no=elem.page_no + 1,
charspan=(0, len(text)),
bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
)
new_item = out_doc.add_text(
label=elem.label, text=text, prov=prov, parent=parent
)
return new_item
def _handle_text_element(self, element, out_doc, current_list, page_height):
cap_text = element.text
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(0, len(cap_text)),
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
)
label = element.label
if label == DocItemLabel.LIST_ITEM:
if current_list is None:
current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
# TODO: Infer if this is a numbered or a bullet list item
new_item = out_doc.add_list_item(
text=cap_text, enumerated=False, prov=prov, parent=current_list
)
elif label == DocItemLabel.SECTION_HEADER:
current_list = None
new_item = out_doc.add_heading(text=cap_text, prov=prov)
elif label == DocItemLabel.FORMULA:
current_list = None
new_item = out_doc.add_text(
label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
)
else:
current_list = None
content_layer = ContentLayer.BODY
if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
content_layer = ContentLayer.FURNITURE
new_item = out_doc.add_text(
label=element.label,
text=cap_text,
prov=prov,
content_layer=content_layer,
)
return new_item, current_list
def _merge_elements(self, element, merged_elem, new_item, page_height):
assert isinstance(
merged_elem, type(element)
), "Merged element must be of same type as element."
assert (
merged_elem.label == new_item.label
), "Labels of merged elements must match."
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(
len(new_item.text) + 1,
len(new_item.text) + 1 + len(merged_elem.text),
),
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
)
new_item.text += f" {merged_elem.text}"
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
new_item.prov.append(prov)
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
page_elements = self._assembled_to_readingorder_elements(conv_res)
# Apply reading order
sorted_elements = self.ro_model.predict_reading_order(
page_elements=page_elements
)
el_to_captions_mapping = self.ro_model.predict_to_captions(
sorted_elements=sorted_elements
)
el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
sorted_elements=sorted_elements
)
el_merges_mapping = self.ro_model.predict_merges(
sorted_elements=sorted_elements
)
docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
conv_res,
sorted_elements,
el_to_captions_mapping,
el_to_footnotes_mapping,
el_merges_mapping,
)
return docling_doc
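
For orientation, the pipeline drives this stage roughly as in the sketch below (conv_res is assumed to be a ConversionResult whose pages have already been assembled):

# Sketch of how the pipeline invokes the reading-order stage.
ro_model = ReadingOrderModel(options=ReadingOrderOptions())
docling_doc = ro_model(conv_res)  # returns the final DoclingDocument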

View File

@ -1,6 +1,7 @@
import copy
import warnings
from pathlib import Path
from typing import Iterable
from typing import Iterable, Optional, Union
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"
def __init__(
self,
enabled: bool,
artifacts_path: Path,
artifacts_path: Optional[Path],
options: TableStructureOptions,
accelerator_options: AcceleratorOptions,
):
@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):
self.enabled = enabled
if self.enabled:
if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path
else:
# will become the default in the future
if (artifacts_path / self._model_repo_folder).exists():
artifacts_path = (
artifacts_path / self._model_repo_folder / self._model_path
)
elif (artifacts_path / self._model_path).exists():
warnings.warn(
"The usage of artifacts_path containing directly "
f"{self._model_path} is deprecated. Please point "
"the artifacts_path to the parent containing "
f"the {self._model_repo_folder} folder.",
DeprecationWarning,
stacklevel=3,
)
artifacts_path = artifacts_path / self._model_path
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "accurate"
else:
@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
)
self.scale = 2.0 # Scale up table input images to 144 dpi
@staticmethod
def download_models(
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.2.0",
)
return Path(download_path)
def draw_table_and_cells(
self,
conv_res: ConversionResult,
@ -209,12 +251,16 @@ class TableStructureModel(BasePageModel):
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
assert "predict_details" in table_out
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"][
"rs_seq"
]
num_rows = table_out["predict_details"].get("num_rows", 0)
num_cols = table_out["predict_details"].get("num_cols", 0)
otsl_seq = (
table_out["predict_details"]
.get("prediction", {})
.get("rs_seq", [])
)
tbl = Table(
otsl_seq=otsl_seq,
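
A hedged sketch of prefetching the tableformer weights once, so that artifacts_path can later point at the parent folder and the deprecation warning above is avoided (the target directory is a placeholder):

from pathlib import Path

from docling.models.table_structure_model import TableStructureModel

weights_dir = TableStructureModel.download_models(
    local_dir=Path("./models/ds4sd--docling-models"), progress=True
)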

View File

@ -4,7 +4,7 @@ import logging
import os
import tempfile
from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Optional, Tuple
from typing import Iterable, List, Optional, Tuple
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -14,13 +14,13 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class TesseractOcrCliModel(BaseOcrModel):
def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
super().__init__(enabled=enabled, options=options)
self.options: TesseractCliOcrOptions
@ -29,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
self._name: Optional[str] = None
self._version: Optional[str] = None
self._tesseract_languages: Optional[List[str]] = None
self._script_prefix: Optional[str] = None
if self.enabled:
try:
self._get_name_and_version()
self._set_languages_and_prefix()
except Exception as exc:
raise RuntimeError(
@ -74,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
return name, version
def _run_tesseract(self, ifilename: str):
r"""
Run tesseract CLI
"""
cmd = [self.options.tesseract_cmd]
if self.options.lang is not None and len(self.options.lang) > 0:
if "auto" in self.options.lang:
lang = self._detect_language(ifilename)
if lang is not None:
cmd.append("-l")
cmd.append(lang)
elif self.options.lang is not None and len(self.options.lang) > 0:
cmd.append("-l")
cmd.append("+".join(self.options.lang))
if self.options.path is not None:
cmd.append("--tessdata-dir")
cmd.append(self.options.path)
@ -103,10 +114,69 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info("df: ", df.head())
# Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
df_filtered = df[
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
]
return df_filtered
def _detect_language(self, ifilename: str):
r"""
Run tesseract in PSM 0 mode to detect the language
"""
assert self._tesseract_languages is not None
cmd = [self.options.tesseract_cmd]
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df.loc[df["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
script = map_tesseract_script(scripts[0].strip())
lang = f"{self._script_prefix}{script}"
# Check if the detected language has been installed
if lang not in self._tesseract_languages:
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
msg += " However this language is not installed in your system and will be ignored."
_log.warning(msg)
return None
_log.debug(
f"Using tesseract model for the detected script '{script}' and language '{lang}'"
)
return lang
def _set_languages_and_prefix(self):
r"""
Read and set the languages installed in tesseract and decide the script prefix
"""
# Get all languages
cmd = [self.options.tesseract_cmd]
cmd.append("--list-langs")
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df[0].tolist()[1:]
# Decide the script prefix
if any([l.startswith("script/") for l in self._tesseract_languages]):
script_prefix = "script/"
else:
script_prefix = ""
self._script_prefix = script_prefix
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -121,7 +191,6 @@ class TesseractOcrCliModel(BaseOcrModel):
yield page
else:
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
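
A hedged usage sketch of the new auto-detection path: passing "auto" as the language lets the CLI engine pick the script per page, provided the "osd" traineddata and the matching script models are installed.

from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)

pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=TesseractCliOcrOptions(lang=["auto"]),
)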

View File

@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -20,6 +21,8 @@ class TesseractOcrModel(BaseOcrModel):
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.enabled:
install_errmsg = (
@ -47,27 +50,36 @@ class TesseractOcrModel(BaseOcrModel):
except:
raise ImportError(install_errmsg)
_, tesserocr_languages = tesserocr.get_languages()
if not tesserocr_languages:
_, self._tesserocr_languages = tesserocr.get_languages()
if not self._tesserocr_languages:
raise ImportError(missing_langs_errmsg)
# Initialize the tesseractAPI
_log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang)
if any([l.startswith("script/") for l in self._tesserocr_languages]):
self.script_prefix = "script/"
else:
self.script_prefix = ""
tesserocr_kwargs = {
"psm": tesserocr.PSM.AUTO,
"init": True,
"oem": tesserocr.OEM.DEFAULT,
}
if self.options.path is not None:
self.reader = tesserocr.PyTessBaseAPI(
path=self.options.path,
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
tesserocr_kwargs["path"] = self.options.path
if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
else:
self.reader = tesserocr.PyTessBaseAPI(
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
**{"lang": lang} | tesserocr_kwargs,
)
self.reader_RIL = tesserocr.RIL
@ -75,11 +87,12 @@ class TesseractOcrModel(BaseOcrModel):
if self.reader is not None:
# Finalize the tesseractAPI
self.reader.End()
for script in self.script_readers:
self.script_readers[script].End()
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
@ -90,8 +103,8 @@ class TesseractOcrModel(BaseOcrModel):
yield page
else:
with TimeRecorder(conv_res, "ocr"):
assert self.reader is not None
assert self._tesserocr_languages is not None
ocr_rects = self.get_ocr_rects(page)
@ -104,22 +117,56 @@ class TesseractOcrModel(BaseOcrModel):
scale=self.scale, cropbox=ocr_rect
)
# Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(
local_reader = self.reader
if "auto" in self.options.lang:
assert self.osd_reader is not None
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()
# No script detected (most likely no text on the page), skip this region
if osd is None:
continue
script = osd["script_name"]
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
# Check if the detected language is present in the system
if lang not in self._tesserocr_languages:
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
msg += " However this language is not installed in your system and will be ignored."
_log.warning(msg)
else:
if script not in self.script_readers:
import tesserocr
self.script_readers[script] = (
tesserocr.PyTessBaseAPI(
path=self.reader.GetDatapath(),
lang=lang,
psm=tesserocr.PSM.AUTO,
init=True,
oem=tesserocr.OEM.DEFAULT,
)
)
local_reader = self.script_readers[script]
local_reader.SetImage(high_res_image)
boxes = local_reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True
)
cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(
local_reader.SetRectangle(
box["x"], box["y"], box["w"], box["h"]
)
# Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale

View File

@ -3,7 +3,7 @@ import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from typing import Any, Callable, Iterable, List
from docling_core.types.doc import DoclingDocument, NodeItem
@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel
from docling.models.base_model import GenericEnrichmentModel
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify
@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
class BasePipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
self.keep_images = False
self.build_pipe: List[Callable] = []
self.enrichment_pipe: List[BaseEnrichmentModel] = []
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
@ -40,7 +41,7 @@ class BasePipeline(ABC):
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
):
# These steps are building and assembling the structure of the
# output DoclingDocument
# output DoclingDocument.
conv_res = self._build_document(conv_res)
conv_res = self._assemble_document(conv_res)
# From this stage, all operations should rely only on conv_res.output
@ -50,6 +51,8 @@ class BasePipeline(ABC):
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:
raise e
finally:
self._unload(conv_res)
return conv_res
@ -62,21 +65,22 @@ class BasePipeline(ABC):
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
def _filter_elements(
doc: DoclingDocument, model: BaseEnrichmentModel
def _prepare_elements(
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
) -> Iterable[NodeItem]:
for element, _level in doc.iterate_items():
if model.is_processable(doc=doc, element=element):
yield element
for doc_element, _level in conv_res.document.iterate_items():
prepared_element = model.prepare_element(
conv_res=conv_res, element=doc_element
)
if prepared_element is not None:
yield prepared_element
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
for model in self.enrichment_pipe:
for element_batch in chunkify(
_filter_elements(conv_res.document, model),
settings.perf.elements_batch_size,
_prepare_elements(conv_res, model),
model.elements_batch_size,
):
# TODO: currently we assume the element itself is modified, because
# we don't have an interface to save the element back to the document
for element in model(
doc=conv_res.document, element_batch=element_batch
): # Must exhaust!
@ -88,6 +92,9 @@ class BasePipeline(ABC):
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
pass
def _unload(self, conv_res: ConversionResult):
pass
@classmethod
@abstractmethod
def get_default_options(cls) -> PipelineOptions:
@ -107,6 +114,10 @@ class BasePipeline(ABC):
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = False
def _apply_on_pages(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -130,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count):
conv_res.pages.append(Page(page_no=i))
start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))
try:
# Iterate batches of pages (page_batch_size) in the doc
@ -148,7 +161,14 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
for p in pipeline_pages: # Must exhaust!
pass
# Cleanup cached images
if not self.keep_images:
p._image_cache = {}
# Cleanup page backends
if not self.keep_backend and p._backend is not None:
p._backend.unload()
end_batch_time = time.monotonic()
total_elapsed_time += end_batch_time - start_batch_time
@ -177,10 +197,15 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
)
raise e
finally:
# Always unload the PDF backend, even in case of failure
if conv_res.input._backend:
conv_res.input._backend.unload()
return conv_res
def _unload(self, conv_res: ConversionResult) -> ConversionResult:
for page in conv_res.pages:
if page._backend is not None:
page._backend.unload()
if conv_res.input._backend:
conv_res.input._backend.unload()
return conv_res
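
A hedged sketch of the new page-range limit, assuming the converter exposes it as a page_range argument that ends up in conv_res.input.limits.page_range (per the check above, the bounds are 1-based and inclusive):

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("report.pdf", page_range=(1, 3))  # path is a placeholder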

View File

@ -1,5 +1,6 @@
import logging
import sys
import warnings
from pathlib import Path
from typing import Optional
@ -13,12 +14,19 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
PictureDescriptionApiOptions,
PictureDescriptionVlmOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import (
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.ocr_mac_model import OcrMacModel
@ -27,38 +35,50 @@ from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.picture_description_api_model import PictureDescriptionApiModel
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.model_downloader import download_models
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
class StandardPdfPipeline(PaginatedPipeline):
_layout_model_path = "model_artifacts/layout"
_table_model_path = "model_artifacts/tableformer"
_layout_model_path = LayoutModel._model_path
_table_model_path = TableStructureModel._model_path
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions
if pipeline_options.artifacts_path is None:
self.artifacts_path = self.download_models_hf()
else:
self.artifacts_path = Path(pipeline_options.artifacts_path)
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
keep_images = (
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
self.keep_images = (
self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
)
self.glm_model = GlmModel(options=GlmOptions())
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
if (ocr_model := self.get_ocr_model()) is None:
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
raise RuntimeError(
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
)
@ -74,47 +94,82 @@ class StandardPdfPipeline(PaginatedPipeline):
ocr_model,
# Layout model
LayoutModel(
artifacts_path=self.artifacts_path
/ StandardPdfPipeline._layout_model_path,
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
),
# Table structure model
TableStructureModel(
enabled=pipeline_options.do_table_structure,
artifacts_path=self.artifacts_path
/ StandardPdfPipeline._table_model_path,
artifacts_path=artifacts_path,
options=pipeline_options.table_structure_options,
accelerator_options=pipeline_options.accelerator_options,
),
# Page assemble
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
PageAssembleModel(options=PageAssembleOptions()),
]
# Picture description model
if (
picture_description_model := self.get_picture_description_model(
artifacts_path=artifacts_path
)
) is None:
raise RuntimeError(
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
)
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
# Code Formula Enrichment Model
CodeFormulaModel(
enabled=pipeline_options.do_code_enrichment
or pipeline_options.do_formula_enrichment,
artifacts_path=artifacts_path,
options=CodeFormulaModelOptions(
do_code_enrichment=pipeline_options.do_code_enrichment,
do_formula_enrichment=pipeline_options.do_formula_enrichment,
),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture Classifier
DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification,
artifacts_path=artifacts_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture description
picture_description_model,
]
if (
self.pipeline_options.do_formula_enrichment
or self.pipeline_options.do_code_enrichment
or self.pipeline_options.do_picture_description
):
self.keep_backend = True
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.1.0",
warnings.warn(
"The usage of StandardPdfPipeline.download_models_hf() is deprecated "
"use instead the utility `docling-tools models download`, or "
"the upstream method docling.utils.models_downloader.download_all()",
DeprecationWarning,
stacklevel=3,
)
return Path(download_path)
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
return output_dir
def get_ocr_model(self) -> Optional[BaseOcrModel]:
def get_ocr_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[BaseOcrModel]:
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
return EasyOcrModel(
enabled=self.pipeline_options.do_ocr,
artifacts_path=artifacts_path,
options=self.pipeline_options.ocr_options,
accelerator_options=self.pipeline_options.accelerator_options,
)
@ -145,6 +200,30 @@ class StandardPdfPipeline(PaginatedPipeline):
)
return None
def get_picture_description_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
if isinstance(
self.pipeline_options.picture_description_options,
PictureDescriptionApiOptions,
):
return PictureDescriptionApiModel(
enabled=self.pipeline_options.do_picture_description,
enable_remote_services=self.pipeline_options.enable_remote_services,
options=self.pipeline_options.picture_description_options,
)
elif isinstance(
self.pipeline_options.picture_description_options,
PictureDescriptionVlmOptions,
):
return PictureDescriptionVlmModel(
enabled=self.pipeline_options.do_picture_description,
artifacts_path=artifacts_path,
options=self.pipeline_options.picture_description_options,
accelerator_options=self.pipeline_options.accelerator_options,
)
return None
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
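
A hedged sketch of running the standard pipeline fully offline: prefetch all weights once and point artifacts_path at the resulting folder (same effect as `docling-tools models download`).

from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.utils.model_downloader import download_models

artifacts_path = download_models()  # defaults to <cache_dir>/models
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)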

View File

@ -0,0 +1,534 @@
import itertools
import logging
import re
import warnings
from io import BytesIO
from pathlib import Path
from typing import Optional
from docling_core.types import DoclingDocument
from docling_core.types.doc import (
BoundingBox,
DocItem,
DocItemLabel,
DoclingDocument,
GroupLabel,
ImageRef,
ImageRefMode,
PictureItem,
ProvenanceItem,
Size,
TableCell,
TableData,
TableItem,
)
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
class VlmPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: VlmPipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = True
warnings.warn(
"The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
category=UserWarning,
stacklevel=2,
)
self.pipeline_options: VlmPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# force_backend_text = False - use the text coming from the VLM response
# force_backend_text = True - extract the text from the backend, using the bounding boxes predicted by the SmolDocling doctags
self.force_backend_text = (
pipeline_options.force_backend_text
and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
)
self.keep_images = self.pipeline_options.generate_page_images
self.build_pipe = [
HuggingFaceVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
]
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
return page
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
if (
self.pipeline_options.vlm_options.response_format
== ResponseFormat.DOCTAGS
):
conv_res.document = self._turn_tags_into_doc(conv_res.pages)
elif (
self.pipeline_options.vlm_options.response_format
== ResponseFormat.MARKDOWN
):
conv_res.document = self._turn_md_into_doc(conv_res)
else:
raise RuntimeError(
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
)
# Generate images of the requested element types
if self.pipeline_options.generate_picture_images:
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
):
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
assert page.size is not None
assert page.image is not None
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
cropped_im, dpi=int(72 * scale)
)
return conv_res
def _turn_md_into_doc(self, conv_res):
predicted_text = ""
for pg_idx, page in enumerate(conv_res.pages):
if page.predictions.vlm_response:
predicted_text += page.predictions.vlm_response.text + "\n\n"
response_bytes = BytesIO(predicted_text.encode("utf8"))
out_doc = InputDocument(
path_or_stream=response_bytes,
filename=conv_res.input.file.name,
format=InputFormat.MD,
backend=MarkdownDocumentBackend,
)
backend = MarkdownDocumentBackend(
in_doc=out_doc,
path_or_stream=response_bytes,
)
return backend.convert()
def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
###############################################
# Tag definitions and color mappings
###############################################
# Maps the recognized tag to a Docling label.
# Code items will be given DocItemLabel.CODE
tag_to_doclabel = {
"title": DocItemLabel.TITLE,
"document_index": DocItemLabel.DOCUMENT_INDEX,
"otsl": DocItemLabel.TABLE,
"section_header_level_1": DocItemLabel.SECTION_HEADER,
"checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
"checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
"text": DocItemLabel.TEXT,
"page_header": DocItemLabel.PAGE_HEADER,
"page_footer": DocItemLabel.PAGE_FOOTER,
"formula": DocItemLabel.FORMULA,
"caption": DocItemLabel.CAPTION,
"picture": DocItemLabel.PICTURE,
"list_item": DocItemLabel.LIST_ITEM,
"footnote": DocItemLabel.FOOTNOTE,
"code": DocItemLabel.CODE,
}
# Maps each tag to an associated bounding box color.
tag_to_color = {
"title": "blue",
"document_index": "darkblue",
"otsl": "green",
"section_header_level_1": "purple",
"checkbox_selected": "black",
"checkbox_unselected": "gray",
"text": "red",
"page_header": "orange",
"page_footer": "cyan",
"formula": "pink",
"caption": "magenta",
"picture": "yellow",
"list_item": "brown",
"footnote": "darkred",
"code": "lightblue",
}
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
"""Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
coords = re.findall(r"<loc_(\d+)>", text_chunk)
if len(coords) == 4:
l, t, r, b = map(float, coords)
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
return None
def extract_inner_text(text_chunk: str) -> str:
"""Strips all <...> tags inside the chunk to get the raw text content."""
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
# Convert the bounding box (normalized to 0-1) into page coordinates for cropping
text = ""
if bbox:
if page.size:
bbox.l = bbox.l * page.size.width
bbox.t = bbox.t * page.size.height
bbox.r = bbox.r * page.size.width
bbox.b = bbox.b * page.size.height
if page._backend:
text = page._backend.get_text_in_rect(bbox)
return text
def otsl_parse_texts(texts, tokens):
split_word = TableToken.OTSL_NL.value
split_row_tokens = [
list(y)
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
if not x
]
table_cells = []
r_idx = 0
c_idx = 0
def count_right(tokens, c_idx, r_idx, which_tokens):
span = 0
c_idx_iter = c_idx
while tokens[r_idx][c_idx_iter] in which_tokens:
c_idx_iter += 1
span += 1
if c_idx_iter >= len(tokens[r_idx]):
return span
return span
def count_down(tokens, c_idx, r_idx, which_tokens):
span = 0
r_idx_iter = r_idx
while tokens[r_idx_iter][c_idx] in which_tokens:
r_idx_iter += 1
span += 1
if r_idx_iter >= len(tokens):
return span
return span
for i, text in enumerate(texts):
cell_text = ""
if text in [
TableToken.OTSL_FCEL.value,
TableToken.OTSL_ECEL.value,
TableToken.OTSL_CHED.value,
TableToken.OTSL_RHED.value,
TableToken.OTSL_SROW.value,
]:
row_span = 1
col_span = 1
right_offset = 1
if text != TableToken.OTSL_ECEL.value:
cell_text = texts[i + 1]
right_offset = 2
# Check the next element(s) for lcel / ucel / xcel and set row_span / col_span accordingly
next_right_cell = ""
if i + right_offset < len(texts):
next_right_cell = texts[i + right_offset]
next_bottom_cell = ""
if r_idx + 1 < len(split_row_tokens):
if c_idx < len(split_row_tokens[r_idx + 1]):
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
if next_right_cell in [
TableToken.OTSL_LCEL.value,
TableToken.OTSL_XCEL.value,
]:
# we have a horizontally spanning cell or a 2D spanning cell
col_span += count_right(
split_row_tokens,
c_idx + 1,
r_idx,
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
)
if next_bottom_cell in [
TableToken.OTSL_UCEL.value,
TableToken.OTSL_XCEL.value,
]:
# we have a vertically spanning cell or a 2D spanning cell
row_span += count_down(
split_row_tokens,
c_idx,
r_idx + 1,
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
)
table_cells.append(
TableCell(
text=cell_text.strip(),
row_span=row_span,
col_span=col_span,
start_row_offset_idx=r_idx,
end_row_offset_idx=r_idx + row_span,
start_col_offset_idx=c_idx,
end_col_offset_idx=c_idx + col_span,
)
)
if text in [
TableToken.OTSL_FCEL.value,
TableToken.OTSL_ECEL.value,
TableToken.OTSL_CHED.value,
TableToken.OTSL_RHED.value,
TableToken.OTSL_SROW.value,
TableToken.OTSL_LCEL.value,
TableToken.OTSL_UCEL.value,
TableToken.OTSL_XCEL.value,
]:
c_idx += 1
if text == TableToken.OTSL_NL.value:
r_idx += 1
c_idx = 0
return table_cells, split_row_tokens
def otsl_extract_tokens_and_text(s: str):
# Pattern to match anything enclosed by < > (including the angle brackets themselves)
pattern = r"(<[^>]+>)"
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
tokens = re.findall(pattern, s)
# Remove any tokens that start with "<loc_"
tokens = [
token
for token in tokens
if not (
token.startswith(rf"<{DocumentToken.LOC.value}")
or token
in [
rf"<{DocumentToken.OTSL.value}>",
rf"</{DocumentToken.OTSL.value}>",
]
)
]
# Split the string by those tokens to get the in-between text
text_parts = re.split(pattern, s)
text_parts = [
token
for token in text_parts
if not (
token.startswith(rf"<{DocumentToken.LOC.value}")
or token
in [
rf"<{DocumentToken.OTSL.value}>",
rf"</{DocumentToken.OTSL.value}>",
]
)
]
# Remove any empty or purely whitespace strings from text_parts
text_parts = [part for part in text_parts if part.strip()]
return tokens, text_parts
def parse_table_content(otsl_content: str) -> TableData:
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
return TableData(
num_rows=len(split_row_tokens),
num_cols=(
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
),
table_cells=table_cells,
)
doc = DoclingDocument(name="Document")
for pg_idx, page in enumerate(pages):
xml_content = ""
predicted_text = ""
if page.predictions.vlm_response:
predicted_text = page.predictions.vlm_response.text
image = page.image
page_no = pg_idx + 1
bounding_boxes = []
if page.size:
pg_width = page.size.width
pg_height = page.size.height
size = Size(width=pg_width, height=pg_height)
parent_page = doc.add_page(page_no=page_no, size=size)
"""
1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
2. For each chunk, extracts bounding box (if any) and inner text.
3. Adds the item to a DoclingDocument structure with the right label.
4. Tracks bounding boxes + color in a separate list for later visualization.
"""
# Regex for all recognized tags
tag_pattern = (
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
)
# DocumentToken.OTSL
pattern = re.compile(tag_pattern, re.DOTALL)
# Go through each match in order
for match in pattern.finditer(predicted_text):
full_chunk = match.group(0)
tag_name = match.group("tag")
bbox = extract_bounding_box(full_chunk)
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
color = tag_to_color.get(tag_name, "white")
# Store bounding box + color
if bbox:
bounding_boxes.append((bbox, color))
if tag_name == DocumentToken.OTSL.value:
table_data = parse_table_content(full_chunk)
bbox = extract_bounding_box(full_chunk)
if bbox:
prov = ProvenanceItem(
bbox=bbox.resize_by_scale(pg_width, pg_height),
charspan=(0, 0),
page_no=page_no,
)
doc.add_table(data=table_data, prov=prov)
else:
doc.add_table(data=table_data)
elif tag_name == DocItemLabel.PICTURE:
text_caption_content = extract_inner_text(full_chunk)
if image:
if bbox:
im_width, im_height = image.size
crop_box = (
int(bbox.l * im_width),
int(bbox.t * im_height),
int(bbox.r * im_width),
int(bbox.b * im_height),
)
cropped_image = image.crop(crop_box)
pic = doc.add_picture(
parent=None,
image=ImageRef.from_pil(image=cropped_image, dpi=72),
prov=(
ProvenanceItem(
bbox=bbox.resize_by_scale(pg_width, pg_height),
charspan=(0, 0),
page_no=page_no,
)
),
)
# If there is a caption to an image, add it as well
if len(text_caption_content) > 0:
caption_item = doc.add_text(
label=DocItemLabel.CAPTION,
text=text_caption_content,
parent=None,
)
pic.captions.append(caption_item.get_ref())
else:
if bbox:
# In case we don't have access to the image binary
pic = doc.add_picture(
parent=None,
prov=ProvenanceItem(
bbox=bbox, charspan=(0, 0), page_no=page_no
),
)
# If there is a caption to an image, add it as well
if len(text_caption_content) > 0:
caption_item = doc.add_text(
label=DocItemLabel.CAPTION,
text=text_caption_content,
parent=None,
)
pic.captions.append(caption_item.get_ref())
else:
# For everything else, treat as text
if self.force_backend_text:
text_content = extract_text_from_backend(page, bbox)
else:
text_content = extract_inner_text(full_chunk)
doc.add_text(
label=doc_label,
text=text_content,
prov=(
ProvenanceItem(
bbox=bbox.resize_by_scale(pg_width, pg_height),
charspan=(0, len(text_content)),
page_no=page_no,
)
if bbox
else None
),
)
return doc
@classmethod
def get_default_options(cls) -> VlmPipelineOptions:
return VlmPipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, PdfDocumentBackend)
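
A hedged sketch of plugging the experimental VlmPipeline into a converter, assuming the usual PdfFormatOption(pipeline_cls=...) hook; the default options produce DOCTAGS output:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=VlmPipelineOptions(),
        )
    }
)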

View File

@ -7,36 +7,62 @@ from docling.datamodel.pipeline_options import AcceleratorDevice
_log = logging.getLogger(__name__)
def decide_device(accelerator_device: AcceleratorDevice) -> str:
def decide_device(accelerator_device: str) -> str:
r"""
Resolve the device based on the acceleration options and the available devices in the system
Resolve the device based on the acceleration options and the available devices in the system.
Rules:
1. AUTO: Check for the best available device on the system.
2. User-defined: Check if the device actually exists, otherwise fall back to CPU.
"""
cuda_index = 0
device = "cpu"
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
if accelerator_device == AcceleratorDevice.AUTO:
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
if has_cuda:
device = f"cuda:{cuda_index}"
device = "cuda:0"
elif has_mps:
device = "mps"
elif accelerator_device.startswith("cuda"):
if has_cuda:
# if cuda device index specified extract device id
parts = accelerator_device.split(":")
if len(parts) == 2 and parts[1].isdigit():
# select cuda device's id
cuda_index = int(parts[1])
if cuda_index < torch.cuda.device_count():
device = f"cuda:{cuda_index}"
else:
_log.warning(
"CUDA device 'cuda:%d' is not available. Fall back to 'CPU'.",
cuda_index,
)
elif len(parts) == 1: # just "cuda"
device = "cuda:0"
else:
_log.warning(
"Invalid CUDA device format '%s'. Fall back to 'CPU'",
accelerator_device,
)
else:
_log.warning("CUDA is not available in the system. Fall back to 'CPU'")
elif accelerator_device == AcceleratorDevice.MPS.value:
if has_mps:
device = "mps"
else:
_log.warning("MPS is not available in the system. Fall back to 'CPU'")
elif accelerator_device == AcceleratorDevice.CPU.value:
device = "cpu"
else:
if accelerator_device == AcceleratorDevice.CUDA:
if has_cuda:
device = f"cuda:{cuda_index}"
else:
_log.warning("CUDA is not available in the system. Fall back to 'CPU'")
elif accelerator_device == AcceleratorDevice.MPS:
if has_mps:
device = "mps"
else:
_log.warning("MPS is not available in the system. Fall back to 'CPU'")
_log.warning(
"Unknown device option '%s'. Fall back to 'CPU'", accelerator_device
)
_log.info("Accelerator device: '%s'", device)
return device
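
A small usage sketch of the string-based resolver above:

from docling.utils.accelerator_utils import decide_device

decide_device("auto")    # best available device: e.g. "cuda:0", "mps" or "cpu"
decide_device("cuda:1")  # "cuda:1" if that index exists, otherwise "cpu"
decide_device("mps")     # "mps" on Apple silicon, otherwise "cpu"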

View File

@ -15,6 +15,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
def resolve_item(paths, obj):
@ -270,7 +271,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
container_el = doc.add_group(label=group_label)
_add_child_elements(container_el, doc, obj, pelem)
elif "text" in obj:
text = obj["text"][span_i:span_j]
@ -304,6 +304,23 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
current_list = None
doc.add_heading(text=text, prov=prov)
elif label == DocItemLabel.CODE:
current_list = None
doc.add_code(text=text, prov=prov)
elif label == DocItemLabel.FORMULA:
current_list = None
doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
current_list = None
doc.add_text(
label=DocItemLabel(name_label),
text=text,
prov=prov,
content_layer=ContentLayer.FURNITURE,
)
else:
current_list = None

View File

@ -203,6 +203,7 @@ class LayoutPostprocessor:
"""Initialize processor with cells and spatial indices."""
self.cells = cells
self.page_size = page_size
self.all_clusters = clusters
self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES
]
@ -267,7 +268,7 @@ class LayoutPostprocessor:
# Handle orphaned cells
unassigned = self._find_unassigned_cells(clusters)
if unassigned:
next_id = max((c.id for c in clusters), default=0) + 1
next_id = max((c.id for c in self.all_clusters), default=0) + 1
orphan_clusters = []
for i, cell in enumerate(unassigned):
conf = 1.0

docling/utils/locks.py Normal file
View File

@ -0,0 +1,3 @@
import threading
pypdfium2_lock = threading.Lock()

View File

@ -0,0 +1,97 @@
import logging
from pathlib import Path
from typing import Optional
from docling.datamodel.pipeline_options import (
granite_picture_description,
smolvlm_picture_description,
)
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.table_structure_model import TableStructureModel
_log = logging.getLogger(__name__)
def download_models(
output_dir: Optional[Path] = None,
*,
force: bool = False,
progress: bool = False,
with_layout: bool = True,
with_tableformer: bool = True,
with_code_formula: bool = True,
with_picture_classifier: bool = True,
with_smolvlm: bool = False,
with_granite_vision: bool = False,
with_easyocr: bool = True,
):
if output_dir is None:
output_dir = settings.cache_dir / "models"
# Make sure the folder exists
output_dir.mkdir(exist_ok=True, parents=True)
if with_layout:
_log.info(f"Downloading layout model...")
LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder,
force=force,
progress=progress,
)
if with_tableformer:
_log.info(f"Downloading tableformer model...")
TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder,
force=force,
progress=progress,
)
if with_picture_classifier:
_log.info(f"Downloading picture classifier model...")
DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
force=force,
progress=progress,
)
if with_code_formula:
_log.info(f"Downloading code formula model...")
CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
force=force,
progress=progress,
)
if with_smolvlm:
_log.info(f"Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models(
repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
force=force,
progress=progress,
)
if with_granite_vision:
_log.info(f"Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder,
force=force,
progress=progress,
)
if with_easyocr:
_log.info(f"Downloading easyocr models...")
EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder,
force=force,
progress=progress,
)
return output_dir
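
A usage sketch for this downloader utility; the module path `docling.utils.model_downloader` is an assumption inferred from this changeset, and the target directory is a placeholder:

from pathlib import Path

from docling.utils.model_downloader import download_models  # assumed module path

# Prefetch the default model weights into a custom folder (e.g. for offline use),
# skipping the EasyOCR weights in this sketch.
artifacts_path = download_models(
    output_dir=Path("/opt/docling-models"),  # hypothetical target directory
    progress=True,
    with_easyocr=False,
)
print(f"Models stored in: {artifacts_path}")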

View File

@ -0,0 +1,9 @@
def map_tesseract_script(script: str) -> str:
r""" """
if script == "Katakana" or script == "Hiragana":
script = "Japanese"
elif script == "Han":
script = "HanS"
elif script == "Korean":
script = "Hangul"
return script
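
A quick illustration of how such a mapping could be applied when turning a script name reported by Tesseract OSD into a `script/...` language-data name (the surrounding OCR flow is illustrative, not part of this changeset):

# Illustrative only: pick the matching Tesseract script model for an OSD-detected script,
# e.g. tessdata/script/Japanese.traineddata.
detected_script = "Katakana"  # hypothetical value, e.g. parsed from `tesseract img.png - --psm 0`
tess_lang = f"script/{map_tesseract_script(detected_script)}"
print(tess_lang)  # -> script/Japanese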

View File

@ -4,6 +4,9 @@ from itertools import islice
from pathlib import Path
from typing import List, Union
import requests
from tqdm import tqdm
def chunkify(iterator, chunk_size):
"""Yield successive chunks of chunk_size from the iterable."""
@ -39,3 +42,24 @@ def create_hash(string: str):
hasher.update(string.encode("utf-8"))
return hasher.hexdigest()
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
buf = BytesIO()
with requests.get(url, stream=True, allow_redirects=True) as response:
total_size = int(response.headers.get("content-length", 0))
progress_bar = tqdm(
total=total_size,
unit="B",
unit_scale=True,
unit_divisor=1024,
disable=(not progress),
)
for chunk in response.iter_content(10 * 1024):
buf.write(chunk)
progress_bar.update(len(chunk))
progress_bar.close()
buf.seek(0)
return buf
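
A small usage sketch for the download helper (the URL is a placeholder):

# Fetch a remote file into memory, optionally showing a progress bar.
buf = download_url_with_progress("https://example.com/weights.bin", progress=True)
print(f"Downloaded {buf.getbuffer().nbytes} bytes")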

View File

@ -0,0 +1,85 @@
from docling_core.types.doc import DocItemLabel
from PIL import Image, ImageDraw, ImageFont
from PIL.ImageFont import FreeTypeFont
from docling.datamodel.base_models import Cluster
def draw_clusters(
image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
"""
Draw clusters on an image
"""
draw = ImageDraw.Draw(image, "RGBA")
# Create a smaller font for the labels
font: ImageFont.ImageFont | FreeTypeFont
try:
font = ImageFont.truetype("arial.ttf", 12)
except OSError:
# Fallback to default font if arial is not available
font = ImageFont.load_default()
for c_tl in clusters:
all_clusters = [c_tl, *c_tl.children]
for c in all_clusters:
# Draw cells first (underneath)
cell_color = (0, 0, 0, 40) # Transparent black for cells
for tc in c.cells:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
cx0 *= scale_x
cx1 *= scale_x
cy0 *= scale_y
cy1 *= scale_y
draw.rectangle(
[(cx0, cy0), (cx1, cy1)],
outline=None,
fill=cell_color,
)
# Draw cluster rectangle
x0, y0, x1, y1 = c.bbox.as_tuple()
x0 *= scale_x
x1 *= scale_x
y0 *= scale_y
y1 *= scale_y
if y1 <= y0:
y1, y0 = y0, y1
if x1 <= x0:
x1, x0 = x0, x1
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
cluster_outline_color = (
*list(DocItemLabel.get_color(c.label)),
255,
)
draw.rectangle(
[(x0, y0), (x1, y1)],
outline=cluster_outline_color,
fill=cluster_fill_color,
)
# Add label name and confidence
label_text = f"{c.label.name} ({c.confidence:.2f})"
# Create semi-transparent background for text
text_bbox = draw.textbbox((x0, y0), label_text, font=font)
text_bg_padding = 2
draw.rectangle(
[
(
text_bbox[0] - text_bg_padding,
text_bbox[1] - text_bg_padding,
),
(
text_bbox[2] + text_bg_padding,
text_bbox[3] + text_bg_padding,
),
],
fill=(255, 255, 255, 180), # Semi-transparent white
)
# Draw text
draw.text(
(x0, y0),
label_text,
fill=(0, 0, 0, 255), # Solid black
font=font,
)

View File

@ -1,5 +1,18 @@
## Introduction
!!! note "Chunking approaches"
Starting from a `DoclingDocument`, there are in principle two possible chunking
approaches:
1. exporting the `DoclingDocument` to Markdown (or similar format) and then
performing user-defined chunking as a post-processing step, or
2. using native Docling chunkers, i.e. operating directly on the `DoclingDocument`
This page is about the latter, i.e. using native Docling chunkers.
For an example of using approach (1) check out e.g.
[this recipe](../examples/rag_langchain.ipynb) looking at the Markdown export mode.
A *chunker* is a Docling abstraction that, given a
[`DoclingDocument`](./docling_document.md), returns a stream of chunks, each of which
captures some part of the document as a string accompanied by respective metadata.
@ -54,12 +67,12 @@ tokens), &
chunks with same headings & captions) — users can opt out of this step via param
`merge_peers` (by default `True`)
👉 Example: see [here](../../examples/hybrid_chunking).
👉 Example: see [here](../examples/hybrid_chunking.ipynb).
## Hierarchical Chunker
The `HierarchicalChunker` implementation uses the document structure information from
the [`DoclingDocument`](../docling_document) to create one chunk for each individual
the [`DoclingDocument`](./docling_document.md) to create one chunk for each individual
detected document element, by default only merging together list items (can be opted out
via param `merge_list_items`). It also takes care of attaching all relevant document
metadata, including headers and captions.
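
A minimal sketch of the native-chunker route described above, assuming a document already converted with Docling (the input URL is a placeholder):

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

# Convert any supported input, then chunk the resulting DoclingDocument directly.
doc = DocumentConverter().convert("https://arxiv.org/pdf/2408.09869").document
chunker = HybridChunker(merge_peers=True)  # peer merging is enabled by default

for chunk in chunker.chunk(dl_doc=doc):
    # serialize() renders the chunk as it would be embedded, incl. headings and captions
    print(chunker.serialize(chunk=chunk)[:120])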

View File

@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion of CSV files\n",
"\n",
"This example shows how to convert CSV files to a structured Docling Document.\n",
"\n",
"* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example Code"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Convert CSV to Docling document\n",
"converter = DocumentConverter()\n",
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
"output = result.document.export_to_markdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This code generates the following output:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n",
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |\n",
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |\n",
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |\n",
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |\n",
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "docling-TtEIaPrw-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
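
As a complement to the comma-separated example above, a sketch of the same conversion for other delimiters; the additional file names are assumptions about the test-data layout, and dialect detection remains automatic:

from pathlib import Path

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
# Hypothetical sibling test files using ';', '|' and tab delimiters.
for name in ["csv-semicolon.csv", "csv-pipe.csv", "csv-tab.csv"]:
    result = converter.convert(Path("../../tests/data/csv") / name)
    print(name, "->", result.document.export_to_markdown().splitlines()[0])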

View File

@ -0,0 +1,931 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/backend_xml_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion of custom XML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Step | Tech | Execution | \n",
"| --- | --- | --- |\n",
"| Embedding | Hugging Face / Sentence Transformers | 💻 Local |\n",
"| Vector store | Milvus | 💻 Local |\n",
"| Gen AI | Hugging Face Inference API | 🌐 Remote | "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is an example of using [Docling](https://ds4sd.github.io/docling/) for converting structured data (XML) into a unified document\n",
"representation format, `DoclingDocument`, and leverage its riched structured content for RAG applications.\n",
"\n",
"Data used in this example consist of patents from the [United States Patent and Trademark Office (USPTO)](https://www.uspto.gov/) and medical\n",
"articles from [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/).\n",
"\n",
"In this notebook, we accomplish the following:\n",
"- [Simple conversion](#simple-conversion) of supported XML files in a nutshell\n",
"- An [end-to-end application](#end-to-end-application) using public collections of XML files supported by Docling\n",
" - [Setup](#setup) the API access for generative AI\n",
" - [Fetch the data](#fetch-the-data) from USPTO and PubMed Central® sites, using Docling custom backends\n",
" - [Parse, chunk, and index](#parse-chunk-and-index) the documents in a vector database\n",
" - [Perform RAG](#question-answering-with-rag) using [LlamaIndex Docling extension](../../integrations/llamaindex/)\n",
"\n",
"For more details on document chunking with Docling, refer to the [Chunking](../../concepts/chunking/) documentation. For RAG with Docling and LlamaIndex, also check the example [RAG with LlamaIndex](../rag_llamaindex/)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple conversion\n",
"\n",
"The XML file format defines and stores data in a format that is both human-readable and machine-readable.\n",
"Because of this flexibility, Docling requires custom backend processors to interpret XML definitions and convert them into `DoclingDocument` objects.\n",
"\n",
"Some public data collections in XML format are already supported by Docling (USTPO patents and PMC articles). In these cases, the document conversion is straightforward and the same as with any other supported format, such as PDF or HTML. The execution example in [Simple Conversion](../minimal/) is the recommended usage of Docling for a single file:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ConversionStatus.SUCCESS\n"
]
}
],
"source": [
"from docling.document_converter import DocumentConverter\n",
"\n",
"# a sample PMC article:\n",
"source = \"../../tests/data/jats/elife-56337.nxml\"\n",
"converter = DocumentConverter()\n",
"result = converter.convert(source)\n",
"print(result.status)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the document is converted, it can be exported to any format supported by Docling. For instance, to markdown (showing here the first lines only):"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
"\n",
"Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
"\n",
"The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
"\n",
"## Abstract\n",
"\n"
]
}
],
"source": [
"md_doc = result.document.export_to_markdown()\n",
"\n",
"delim = \"\\n\"\n",
"print(delim.join(md_doc.split(delim)[:8]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the XML file is not supported, a `ConversionError` message will be raised."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Input document docling_test.xml does not match any allowed format.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File format not allowed: docling_test.xml\n"
]
}
],
"source": [
"from io import BytesIO\n",
"\n",
"from docling.datamodel.base_models import DocumentStream\n",
"from docling.exceptions import ConversionError\n",
"\n",
"xml_content = (\n",
" b'<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE docling_test SYSTEM '\n",
" b'\"test.dtd\"><docling>Random content</docling>'\n",
")\n",
"stream = DocumentStream(name=\"docling_test.xml\", stream=BytesIO(xml_content))\n",
"try:\n",
" result = converter.convert(stream)\n",
"except ConversionError as ce:\n",
" print(ce)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can always refer to the [Usage](../../usage/#supported-formats) documentation page for a list of supported formats."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## End-to-end application\n",
"\n",
"This section describes a step-by-step application for processing XML files from supported public collections and use them for question-answering."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Requirements can be installed as shown below. The `--no-warn-conflicts` argument is meant for Colab's pre-populated Python environment, feel free to remove for stricter usage."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook uses HuggingFace's Inference API. For an increased LLM quota, a token can be provided via the environment variable `HF_TOKEN`.\n",
"\n",
"If you're running this notebook in Google Colab, make sure you [add](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75) your API key as a secret."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from warnings import filterwarnings\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"\n",
"def _get_env_from_colab_or_os(key):\n",
" try:\n",
" from google.colab import userdata\n",
"\n",
" try:\n",
" return userdata.get(key)\n",
" except userdata.SecretNotFoundError:\n",
" pass\n",
" except ImportError:\n",
" pass\n",
" return os.getenv(key)\n",
"\n",
"\n",
"load_dotenv()\n",
"\n",
"filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now define the main parameters:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from tempfile import mkdtemp\n",
"\n",
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
"\n",
"EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
"EMBED_MODEL = HuggingFaceEmbedding(model_name=EMBED_MODEL_ID)\n",
"TEMP_DIR = Path(mkdtemp())\n",
"MILVUS_URI = str(TEMP_DIR / \"docling.db\")\n",
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
")\n",
"embed_dim = len(EMBED_MODEL.get_text_embedding(\"hi\"))\n",
"# https://github.com/huggingface/transformers/issues/5486:\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fetch the data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we will use XML data from collections supported by Docling:\n",
"- Medical articles from the [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/). They are available in an [FTP server](https://ftp.ncbi.nlm.nih.gov/pub/pmc/) as `.tar.gz` files. Each file contains the full article data in XML format, among other supplementary files like images or spreadsheets.\n",
"- Patents from the [United States Patent and Trademark Office](https://www.uspto.gov/). They are available in the [Bulk Data Storage System (BDSS)](https://bulkdata.uspto.gov/) as zip files. Each zip file may contain several patents in XML format.\n",
"\n",
"The raw files will be downloaded form the source and saved in a temporary directory."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### PMC articles\n",
"\n",
"The [OA file](https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv) is a manifest file of all the PMC articles, including the URL path to download the source files. In this notebook we will use as example the article [Pathogens spread by high-altitude windborne mosquitoes](https://pmc.ncbi.nlm.nih.gov/articles/PMC11703268/), which is available in the archive file [PMC11703268.tar.gz](https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e3/6b/PMC11703268.tar.gz)."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e3/6b/PMC11703268.tar.gz...\n",
"Extracting and storing the XML file containing the article text...\n",
"Stored XML file nihpp-2024.12.26.630351v1.nxml\n"
]
}
],
"source": [
"import tarfile\n",
"from io import BytesIO\n",
"\n",
"import requests\n",
"\n",
"# PMC article PMC11703268\n",
"url: str = \"https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e3/6b/PMC11703268.tar.gz\"\n",
"\n",
"print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n",
"print(\"Extracting and storing the XML file containing the article text...\")\n",
"with tarfile.open(fileobj=buf, mode=\"r:gz\") as tar_file:\n",
" for tarinfo in tar_file:\n",
" if tarinfo.isreg():\n",
" file_path = Path(tarinfo.name)\n",
" if file_path.suffix == \".nxml\":\n",
" with open(TEMP_DIR / file_path.name, \"wb\") as file_obj:\n",
" file_obj.write(tar_file.extractfile(tarinfo).read())\n",
" print(f\"Stored XML file {file_path.name}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### USPTO patents\n",
"\n",
"Since each USPTO file is a concatenation of several patents, we need to split its content into valid XML pieces. The following code downloads a sample zip file, split its content in sections, and dumps each section as an XML file. For simplicity, this pipeline is shown here in a sequential manner, but it could be parallelized."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
"Parsing zip file, splitting into XML sections, and exporting to files...\n"
]
}
],
"source": [
"import zipfile\n",
"\n",
"# Patent grants from December 17-23, 2024\n",
"url: str = (\n",
" \"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip\"\n",
")\n",
"XML_SPLITTER: str = '<?xml version=\"1.0\"'\n",
"doc_num: int = 0\n",
"\n",
"print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n",
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n",
" if res:\n",
" print(\"Error validating zip file\")\n",
" else:\n",
" with zf.open(zf.namelist()[0]) as xf:\n",
" is_patent = False\n",
" patent_buffer = BytesIO()\n",
" for xf_line in xf:\n",
" decoded_line = xf_line.decode(errors=\"ignore\").rstrip()\n",
" xml_index = decoded_line.find(XML_SPLITTER)\n",
" if xml_index != -1:\n",
" if (\n",
" xml_index > 0\n",
" ): # cases like </sequence-cwu><?xml version=\"1.0\"...\n",
" patent_buffer.write(xf_line[:xml_index])\n",
" patent_buffer.write(b\"\\r\\n\")\n",
" xf_line = xf_line[xml_index:]\n",
" if patent_buffer.getbuffer().nbytes > 0 and is_patent:\n",
" doc_num += 1\n",
" patent_id = f\"ipg241217-{doc_num}\"\n",
" with open(TEMP_DIR / f\"{patent_id}.xml\", \"wb\") as file_obj:\n",
" file_obj.write(patent_buffer.getbuffer())\n",
" is_patent = False\n",
" patent_buffer = BytesIO()\n",
" elif decoded_line.startswith(\"<!DOCTYPE\"):\n",
" is_patent = True\n",
" patent_buffer.write(xf_line)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetched and exported 4014 documents.\n"
]
}
],
"source": [
"print(f\"Fetched and exported {doc_num} documents.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using the backend converter (optional)\n",
"\n",
"- The custom backend converters `PubMedDocumentBackend` and `PatentUsptoDocumentBackend` aim at handling the parsing of PMC articles and USPTO patents, respectively.\n",
"- As any other backends, you can leverage the function `is_valid()` to check if the input document is supported by the this backend.\n",
"- Note that some XML sections in the original USPTO zip file may not represent patents, like sequence listings, and therefore they will show as invalid by the backend."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document nihpp-2024.12.26.630351v1.nxml is a valid PMC article? True\n",
"Document ipg241217-1.xml is a valid patent? True\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "316241ca89a843bda3170f2a5c76c639",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/4014 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3928 patents out of 4014 XML files.\n"
]
}
],
"source": [
"from tqdm.notebook import tqdm\n",
"\n",
"from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
"from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
"from docling.datamodel.base_models import InputFormat\n",
"from docling.datamodel.document import InputDocument\n",
"\n",
"# check PMC\n",
"in_doc = InputDocument(\n",
" path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
" format=InputFormat.XML_JATS,\n",
" backend=JatsDocumentBackend,\n",
")\n",
"backend = JatsDocumentBackend(\n",
" in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
")\n",
"print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
"\n",
"# check USPTO\n",
"in_doc = InputDocument(\n",
" path_or_stream=TEMP_DIR / \"ipg241217-1.xml\",\n",
" format=InputFormat.XML_USPTO,\n",
" backend=PatentUsptoDocumentBackend,\n",
")\n",
"backend = PatentUsptoDocumentBackend(\n",
" in_doc=in_doc, path_or_stream=TEMP_DIR / \"ipg241217-1.xml\"\n",
")\n",
"print(f\"Document {in_doc.file.name} is a valid patent? {backend.is_valid()}\")\n",
"\n",
"patent_valid = 0\n",
"pbar = tqdm(TEMP_DIR.glob(\"*.xml\"), total=doc_num)\n",
"for in_path in pbar:\n",
" in_doc = InputDocument(\n",
" path_or_stream=in_path,\n",
" format=InputFormat.XML_USPTO,\n",
" backend=PatentUsptoDocumentBackend,\n",
" )\n",
" backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)\n",
" patent_valid += int(backend.is_valid())\n",
"\n",
"print(f\"Found {patent_valid} patents out of {doc_num} XML files.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Calling the function `convert()` will convert the input document into a `DoclingDocument`"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Patent \"Semiconductor package\" has 19 claims\n"
]
}
],
"source": [
"doc = backend.convert()\n",
"\n",
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parse, chunk, and index"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
"In this notebook, we will leverage:\n",
"- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
"- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
"- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
"\n",
"Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Set the Docling reader and the directory reader\n",
"\n",
"Note that `DoclingReader` uses Docling's `DocumentConverter` by default and therefore it will recognize the format of the XML files and leverage the `PatentUsptoDocumentBackend` automatically.\n",
"\n",
"For demonstration purposes, we limit the scope of the analysis to the first 100 patents."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"from llama_index.readers.docling import DoclingReader\n",
"\n",
"reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)\n",
"dir_reader = SimpleDirectoryReader(\n",
" input_dir=TEMP_DIR,\n",
" exclude=[\"docling.db\", \"*.nxml\"],\n",
" file_extractor={\".xml\": reader},\n",
" filename_as_id=True,\n",
" num_files_limit=100,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Set the node parser\n",
"\n",
"Note that the `HierarchicalChunker` is the default chunking implementation of the `DoclingNodeParser`."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.node_parser.docling import DoclingNodeParser\n",
"\n",
"node_parser = DoclingNodeParser()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Set a local Milvus database and run the ingestion"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import StorageContext, VectorStoreIndex\n",
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=MILVUS_URI,\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
"\n",
"index = VectorStoreIndex.from_documents(\n",
" documents=dir_reader.load_data(show_progress=True),\n",
" transformations=[node_parser],\n",
" storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
" embed_model=EMBED_MODEL,\n",
" show_progress=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, add the PMC article to the vector store directly from the reader."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x373a7f7d0>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"index.from_documents(\n",
" documents=reader.load_data(TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"),\n",
" transformations=[node_parser],\n",
" storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
" embed_model=EMBED_MODEL,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Question-answering with RAG"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The retriever can be used to identify highly relevant documents:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node ID: 5afd36c0-a739-4a88-a51c-6d0f75358db5\n",
"Text: The portable fitness monitoring device 102 may be a device such\n",
"as, for example, a mobile phone, a personal digital assistant, a music\n",
"file player (e.g. and MP3 player), an intelligent article for wearing\n",
"(e.g. a fitness monitoring garment, wrist band, or watch), a dongle\n",
"(e.g. a small hardware device that protects software) that includes a\n",
"fitn...\n",
"Score: 0.772\n",
"\n",
"Node ID: f294b5fd-9089-43cb-8c4e-d1095a634ff1\n",
"Text: US Patent Application US 20120071306 entitled “Portable\n",
"Multipurpose Whole Body Exercise Device” discloses a portable\n",
"multipurpose whole body exercise device which can be used for general\n",
"fitness, Pilates-type, core strengthening, therapeutic, and\n",
"rehabilitative exercises as well as stretching and physical therapy\n",
"and which includes storable acc...\n",
"Score: 0.749\n",
"\n",
"Node ID: 8251c7ef-1165-42e1-8c91-c99c8a711bf7\n",
"Text: Program products, methods, and systems for providing fitness\n",
"monitoring services of the present invention can include any software\n",
"application executed by one or more computing devices. A computing\n",
"device can be any type of computing device having one or more\n",
"processors. For example, a computing device can be a workstation,\n",
"mobile device (e.g., ...\n",
"Score: 0.744\n",
"\n"
]
}
],
"source": [
"retriever = index.as_retriever(similarity_top_k=3)\n",
"results = retriever.retrieve(\"What patents are related to fitness devices?\")\n",
"\n",
"for item in results:\n",
" print(item)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the query engine, we can run the question-answering with the RAG pattern on the set of indexed documents.\n",
"\n",
"First, we can prompt the LLM directly:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╭──────────────────────────────────────────────────── Prompt ─────────────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span> Do mosquitoes in high altitude expand viruses over large distances? <span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;31m╭─\u001b[0m\u001b[1;31m───────────────────────────────────────────────────\u001b[0m\u001b[1;31m Prompt \u001b[0m\u001b[1;31m────────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
"\u001b[1;31m│\u001b[0m Do mosquitoes in high altitude expand viruses over large distances? \u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭─────────────────────────────────────────────── Generated Content ───────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> Mosquitoes can be found at high altitudes, but their ability to transmit viruses over long distances is not <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> primarily dependent on altitude. Mosquitoes are vectors for various diseases, such as malaria, dengue fever, <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> and Zika virus, and their transmission range is more closely related to their movement, the presence of a host, <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> and environmental conditions that support their survival and reproduction. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> At high altitudes, the environment can be less suitable for mosquitoes due to factors such as colder <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> temperatures, lower humidity, and stronger winds, which can limit their population size and distribution. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> However, some species of mosquitoes have adapted to high-altitude environments and can still transmit diseases <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> in these areas. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> It is possible for mosquitoes to be transported by wind or human activities to higher altitudes, but this is <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> not a significant factor in their ability to transmit viruses over long distances. Instead, long-distance <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> transmission of viruses is more often associated with human travel and transportation, which can rapidly spread <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> infected mosquitoes or humans to new areas, leading to the spread of disease. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;32m╭─\u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m Generated Content \u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
"\u001b[1;32m│\u001b[0m Mosquitoes can be found at high altitudes, but their ability to transmit viruses over long distances is not \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m primarily dependent on altitude. Mosquitoes are vectors for various diseases, such as malaria, dengue fever, \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m and Zika virus, and their transmission range is more closely related to their movement, the presence of a host, \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m and environmental conditions that support their survival and reproduction. \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m At high altitudes, the environment can be less suitable for mosquitoes due to factors such as colder \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m temperatures, lower humidity, and stronger winds, which can limit their population size and distribution. \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m However, some species of mosquitoes have adapted to high-altitude environments and can still transmit diseases \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m in these areas. \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m It is possible for mosquitoes to be transported by wind or human activities to higher altitudes, but this is \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m not a significant factor in their ability to transmit viruses over long distances. Instead, long-distance \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m transmission of viruses is more often associated with human travel and transportation, which can rapidly spread \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m infected mosquitoes or humans to new areas, leading to the spread of disease. \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from llama_index.core.base.llms.types import ChatMessage, MessageRole\n",
"from rich.console import Console\n",
"from rich.panel import Panel\n",
"\n",
"console = Console()\n",
"query = \"Do mosquitoes in high altitude expand viruses over large distances?\"\n",
"\n",
"usr_msg = ChatMessage(role=MessageRole.USER, content=query)\n",
"response = GEN_MODEL.chat(messages=[usr_msg])\n",
"\n",
"console.print(Panel(query, title=\"Prompt\", border_style=\"bold red\"))\n",
"console.print(\n",
" Panel(\n",
" response.message.content.strip(),\n",
" title=\"Generated Content\",\n",
" border_style=\"bold green\",\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we can compare the response when the model is prompted with the indexed PMC article as supporting context:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭────────────────────────────────────────── Generated Content with RAG ───────────────────────────────────────────╮</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> Yes, mosquitoes in high altitude can expand viruses over large distances. A study intercepted 1,017 female <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> mosquitoes at altitudes of 120-290 m above ground over Mali and Ghana and screened them for infection with <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> arboviruses, plasmodia, and filariae. The study found that 3.5% of the mosquitoes were infected with <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> flaviviruses, and 1.1% were infectious. Additionally, the study identified 19 mosquito-borne pathogens, <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> including three arboviruses that affect humans (dengue, West Nile, and MPoko viruses). The study provides <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> compelling evidence that mosquito-borne pathogens are often spread by windborne mosquitoes at altitude. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;32m╭─\u001b[0m\u001b[1;32m─────────────────────────────────────────\u001b[0m\u001b[1;32m Generated Content with RAG \u001b[0m\u001b[1;32m──────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
"\u001b[1;32m│\u001b[0m Yes, mosquitoes in high altitude can expand viruses over large distances. A study intercepted 1,017 female \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m mosquitoes at altitudes of 120-290 m above ground over Mali and Ghana and screened them for infection with \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m arboviruses, plasmodia, and filariae. The study found that 3.5% of the mosquitoes were infected with \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m flaviviruses, and 1.1% were infectious. Additionally, the study identified 19 mosquito-borne pathogens, \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m including three arboviruses that affect humans (dengue, West Nile, and MPoko viruses). The study provides \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m compelling evidence that mosquito-borne pathogens are often spread by windborne mosquitoes at altitude. \u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters\n",
"\n",
"filters = MetadataFilters(\n",
" filters=[\n",
" ExactMatchFilter(key=\"filename\", value=\"nihpp-2024.12.26.630351v1.nxml\"),\n",
" ]\n",
")\n",
"\n",
"query_engine = index.as_query_engine(llm=GEN_MODEL, filter=filters, similarity_top_k=3)\n",
"result = query_engine.query(query)\n",
"\n",
"console.print(\n",
" Panel(\n",
" result.response.strip(),\n",
" title=\"Generated Content with RAG\",\n",
" border_style=\"bold green\",\n",
" )\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -5,16 +5,18 @@ from pathlib import Path
from typing import Iterable
import yaml
from docling_core.types.doc import ImageRefMode
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
USE_V2 = True
USE_LEGACY = True
USE_LEGACY = False
def export_documents(
@ -33,26 +35,31 @@ def export_documents(
doc_filename = conv_res.input.file.stem
if USE_V2:
# Export Docling document format to JSON:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.document.export_to_dict()))
conv_res.document.save_as_json(
output_dir / f"{doc_filename}.json",
image_mode=ImageRefMode.PLACEHOLDER,
)
conv_res.document.save_as_html(
output_dir / f"{doc_filename}.html",
image_mode=ImageRefMode.EMBEDDED,
)
conv_res.document.save_as_document_tokens(
output_dir / f"{doc_filename}.doctags.txt"
)
conv_res.document.save_as_markdown(
output_dir / f"{doc_filename}.md",
image_mode=ImageRefMode.PLACEHOLDER,
)
conv_res.document.save_as_markdown(
output_dir / f"{doc_filename}.txt",
image_mode=ImageRefMode.PLACEHOLDER,
strict_text=True,
)
# Export Docling document format to YAML:
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))
# Export Docling document format to doctags:
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
fp.write(conv_res.document.export_to_document_tokens())
# Export Docling document format to markdown:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.document.export_to_markdown())
# Export Docling document format to text:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.document.export_to_markdown(strict_text=True))
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
@ -103,10 +110,10 @@ def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110_sampled.pdf"),
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@ -119,13 +126,20 @@ def main():
# settings.debug.visualize_tables = True
# settings.debug.visualize_cells = True
doc_converter = DocumentConverter()
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
start_time = time.time()
conv_results = doc_converter.convert_all(
input_doc_paths,
raises_on_error=False, # to let conversion run through all and examine results at the end
raises_on_error=True,  # set to False to let conversion run through all inputs and examine the results at the end
)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("scratch")

View File

@ -5,7 +5,11 @@ from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
@ -17,7 +21,7 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
###########################################################################
@ -76,7 +80,7 @@ def main():
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = ["es"]
pipeline_options.accelerator_options = AcceleratorOptions(
num_threads=4, device=Device.AUTO
num_threads=4, device=AcceleratorDevice.AUTO
)
doc_converter = DocumentConverter(

View File

@ -0,0 +1,92 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual formula understanding model.
import logging
from pathlib import Path
from typing import Iterable
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
from docling.datamodel.base_models import InputFormat, ItemAndImageEnrichmentElement
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
class ExampleFormulaUnderstandingPipelineOptions(PdfPipelineOptions):
do_formula_understanding: bool = True
# A new enrichment model using both the document element and its image as input
class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel):
images_scale = 2.6
def __init__(self, enabled: bool):
self.enabled = enabled
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
return (
self.enabled
and isinstance(element, TextItem)
and element.label == DocItemLabel.FORMULA
)
def __call__(
self,
doc: DoclingDocument,
element_batch: Iterable[ItemAndImageEnrichmentElement],
) -> Iterable[NodeItem]:
if not self.enabled:
return
for enrich_element in element_batch:
enrich_element.image.show()
yield enrich_element.item
# How the pipeline can be extended.
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
self.enrichment_pipe = [
ExampleFormulaUnderstandingEnrichmentModel(
enabled=self.pipeline_options.do_formula_understanding
)
]
if self.pipeline_options.do_formula_understanding:
self.keep_backend = True
@classmethod
def get_default_options(cls) -> ExampleFormulaUnderstandingPipelineOptions:
return ExampleFormulaUnderstandingPipelineOptions()
# Example main. In the final version, we simply have to set do_formula_understanding to true.
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
pipeline_options.do_formula_understanding = True
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=ExampleFormulaUnderstandingPipeline,
pipeline_options=pipeline_options,
)
}
)
result = doc_converter.convert(input_doc_path)
if __name__ == "__main__":
main()

View File

@ -1,3 +1,7 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual picture classifier model.
import logging
from pathlib import Path
from typing import Any, Iterable
@ -22,7 +26,6 @@ class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
def __init__(self, enabled: bool):
self.enabled = enabled
@ -54,7 +57,6 @@ class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: ExamplePictureClassifierPipelineOptions
@ -73,7 +75,7 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0

View File

@ -16,7 +16,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -19,7 +19,7 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -12,7 +12,7 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
doc_converter = DocumentConverter()

View File

@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True

View File

@ -83,7 +83,15 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
]
}
],
"source": [
"from docling.chunking import HybridChunker\n",
"\n",
@ -91,6 +99,13 @@
"chunk_iter = chunker.chunk(dl_doc=doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -337,11 +352,11 @@
"source": [
"for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
"\n",
" ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
"\n",
" print()"

View File

@ -0,0 +1,29 @@
from docling_core.types.doc import TextItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
source = "tests/data/pdf/amt_handbook_sample.pdf"
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
doc_converter = DocumentConverter(
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = doc_converter.convert(source)
doc = result.document
for picture in doc.pictures:
# picture.get_image(doc).show() # display the picture
print(picture.caption_text(doc), " contains these elements:")
for item, level in doc.iterate_items(root=picture, traverse_pictures=True):
if isinstance(item, TextItem):
print(item.text)
print("\n")

View File

@ -0,0 +1,96 @@
import json
import time
from pathlib import Path
import yaml
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
sources = [
"tests/data/2305.03393v1-pg9-img.png",
]
## Use experimental VlmPipeline
pipeline_options = VlmPipelineOptions()
# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = False
## On GPU systems, enable flash_attention_2 with CUDA:
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
## Pick a VLM model. We choose SmolDocling-256M by default
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
InputFormat.IMAGE: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
}
)
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)
for source in sources:
start_time = time.time()
print("================================================")
print("Processing... {}".format(source))
print("================================================")
print("")
res = converter.convert(source)
print("------------------------------------------------")
print("MD:")
print("------------------------------------------------")
print("")
print(res.document.export_to_markdown())
for page in res.pages:
print("")
print("Predicted page in DOCTAGS:")
print(page.predictions.vlm_response.text)
res.document.save_as_html(
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))
pg_num = res.document.num_pages()
print("")
inference_time = time.time() - start_time
print(
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
)
print("================================================")
print("done!")
print("================================================")

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,118 @@
import logging
import os
from pathlib import Path
import requests
from docling_core.types.doc import PictureItem
from dotenv import load_dotenv
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
PictureDescriptionApiOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
def vllm_local_options(model: str):
options = PictureDescriptionApiOptions(
url="http://localhost:8000/v1/chat/completions",
params=dict(
model=model,
seed=42,
max_completion_tokens=200,
),
prompt="Describe the image in three sentences. Be consise and accurate.",
timeout=90,
)
return options
def watsonx_vlm_options():
load_dotenv()
api_key = os.environ.get("WX_API_KEY")
project_id = os.environ.get("WX_PROJECT_ID")
def _get_iam_access_token(api_key: str) -> str:
res = requests.post(
url="https://iam.cloud.ibm.com/identity/token",
headers={
"Content-Type": "application/x-www-form-urlencoded",
},
data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
)
res.raise_for_status()
api_out = res.json()
print(f"{api_out=}")
return api_out["access_token"]
options = PictureDescriptionApiOptions(
url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
params=dict(
model_id="meta-llama/llama-3-2-11b-vision-instruct",
project_id=project_id,
parameters=dict(
max_new_tokens=400,
),
),
headers={
"Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
},
prompt="Describe the image in three sentences. Be consise and accurate.",
timeout=60,
)
return options
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
pipeline_options = PdfPipelineOptions(
enable_remote_services=True # <-- this is required!
)
pipeline_options.do_picture_description = True
# The PictureDescriptionApiOptions() allows interfacing with APIs supporting
# the multi-modal chat interface. Here follow a few examples of how to configure them.
#
# One possibility is self-hosting a model, e.g. via vLLM.
# $ vllm serve MODEL_NAME
# Then PictureDescriptionApiOptions can point to the localhost endpoint.
#
# Example for the Granite Vision model: (uncomment the following lines)
# pipeline_options.picture_description_options = vllm_local_options(
# model="ibm-granite/granite-vision-3.1-2b-preview"
# )
#
# Example for the SmolVLM model: (uncomment the following lines)
pipeline_options.picture_description_options = vllm_local_options(
model="HuggingFaceTB/SmolVLM-256M-Instruct"
)
#
# Another possibility is using online services, e.g. watsonx.ai.
# Using it requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
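# For example, in your shell (these names match the os.environ lookups above):
#   export WX_API_KEY=<your watsonx.ai API key>
#   export WX_PROJECT_ID=<your watsonx.ai project ID>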
# Uncomment the following line for this option:
# pipeline_options.picture_description_options = watsonx_vlm_options()
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
)
}
)
result = doc_converter.convert(input_doc_path)
for element, _level in result.document.iterate_items():
if isinstance(element, PictureItem):
print(
f"Picture {element.self_ref}\n"
f"Caption: {element.caption_text(doc=result.document)}\n"
f"Annotations: {element.annotations}"
)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,894 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Ag9kcX2B_atc"
},
"source": [
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_azuresearch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Azure AI Search"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"| Step | Tech | Execution |\n",
"| ------------------ | ------------------ | --------- |\n",
"| Embedding | Azure OpenAI | 🌐 Remote |\n",
"| Vector Store | Azure AI Search | 🌐 Remote |\n",
"| Gen AI | Azure OpenAI | 🌐 Remote |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## A recipe 🧑‍🍳 🐥 💚\n",
"\n",
"This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n",
"- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n",
"- [Azure AI Search](https://azure.microsoft.com/products/ai-services/ai-search/?msockid=0109678bea39665431e37323ebff6723) for vector indexing and retrieval\n",
"- [Azure OpenAI](https://azure.microsoft.com/products/ai-services/openai-service?msockid=0109678bea39665431e37323ebff6723) for embeddings and chat completion\n",
"\n",
"This sample demonstrates how to:\n",
"1. Parse a PDF with Docling.\n",
"2. Chunk the parsed text.\n",
"3. Use Azure OpenAI for embeddings.\n",
"4. Index and search in Azure AI Search.\n",
"5. Run a retrieval-augmented generation (RAG) query with Azure OpenAI GPT-4o.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If running in a fresh environment (like Google Colab), uncomment and run this single command:\n",
"%pip install \"docling~=2.12\" azure-search-documents==11.5.2 azure-identity openai rich torch python-dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 0: Prerequisites\n",
" - **Azure AI Search** resource\n",
" - **Azure OpenAI** resource with a deployed embedding and chat completion model (e.g. `text-embedding-3-small` and `gpt-4o`) \n",
" - **Docling 2.12+** (installs `docling_core` automatically) Docling installed (Python 3.8+ environment)\n",
"\n",
"- A **GPU-enabled environment** is preferred for faster parsing. Docling 2.12 automatically detects GPU if present.\n",
" - If you only have CPU, parsing large PDFs can be slower. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"\n",
"def _get_env(key, default=None):\n",
" try:\n",
" from google.colab import userdata\n",
"\n",
" try:\n",
" return userdata.get(key)\n",
" except userdata.SecretNotFoundError:\n",
" pass\n",
" except ImportError:\n",
" pass\n",
" return os.getenv(key, default)\n",
"\n",
"\n",
"AZURE_SEARCH_ENDPOINT = _get_env(\"AZURE_SEARCH_ENDPOINT\")\n",
"AZURE_SEARCH_KEY = _get_env(\"AZURE_SEARCH_KEY\") # Ensure this is your Admin Key\n",
"AZURE_SEARCH_INDEX_NAME = _get_env(\"AZURE_SEARCH_INDEX_NAME\", \"docling-rag-sample\")\n",
"AZURE_OPENAI_ENDPOINT = _get_env(\"AZURE_OPENAI_ENDPOINT\")\n",
"AZURE_OPENAI_API_KEY = _get_env(\"AZURE_OPENAI_API_KEY\")\n",
"AZURE_OPENAI_API_VERSION = _get_env(\"AZURE_OPENAI_API_VERSION\", \"2024-10-21\")\n",
"AZURE_OPENAI_CHAT_MODEL = _get_env(\n",
" \"AZURE_OPENAI_CHAT_MODEL\"\n",
") # Using a deployed model named \"gpt-4o\"\n",
"AZURE_OPENAI_EMBEDDINGS = _get_env(\n",
" \"AZURE_OPENAI_EMBEDDINGS\", \"text-embedding-3-small\"\n",
") # Using a deployed model named \"text-embeddings-3-small\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 1: Parse the PDF with Docling\n",
"\n",
"Well parse the **Microsoft GraphRAG Research Paper** (~15 pages). Parsing should be relatively quick, even on CPU, but it will be faster on a GPU or MPS device if available.\n",
"\n",
"*(If you prefer a different document, simply provide a different URL or local file path.)*"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Parsing a ~</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">15</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">-page PDF. The process should be relatively quick, even on CPU...</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;33mParsing a ~\u001b[0m\u001b[1;33m15\u001b[0m\u001b[1;33m-page PDF. The process should be relatively quick, even on CPU\u001b[0m\u001b[1;33m...\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─────────────────────────────────────────── Docling Markdown Preview ────────────────────────────────────────────╮\n",
"│ ## From Local to Global: A Graph RAG Approach to Query-Focused Summarization │\n",
"│ │\n",
"│ Darren Edge 1† │\n",
"│ │\n",
"│ Ha Trinh 1† │\n",
"│ │\n",
"│ Newman Cheng 2 │\n",
"│ │\n",
"│ Joshua Bradley 2 │\n",
"│ │\n",
"│ Alex Chao 3 │\n",
"│ │\n",
"│ Apurva Mody 3 │\n",
"│ │\n",
"│ Steven Truitt 2 │\n",
"│ │\n",
"│ ## Jonathan Larson 1 │\n",
"│ │\n",
"│ 1 Microsoft Research 2 Microsoft Strategic Missions and Technologies 3 Microsoft Office of the CTO │\n",
"│ │\n",
"│ { daedge,trinhha,newmancheng,joshbradley,achao,moapurva,steventruitt,jolarso } @microsoft.com │\n",
"│ │\n",
"│ † These authors contributed equally to this work │\n",
"│ │\n",
"│ ## Abstract │\n",
"│ │\n",
"│ The use of retrieval-augmented gen... │\n",
"╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─────────────────────────────────────────── Docling Markdown Preview ────────────────────────────────────────────╮\n",
"│ ## From Local to Global: A Graph RAG Approach to Query-Focused Summarization │\n",
"│ │\n",
"│ Darren Edge 1† │\n",
"│ │\n",
"│ Ha Trinh 1† │\n",
"│ │\n",
"│ Newman Cheng 2 │\n",
"│ │\n",
"│ Joshua Bradley 2 │\n",
"│ │\n",
"│ Alex Chao 3 │\n",
"│ │\n",
"│ Apurva Mody 3 │\n",
"│ │\n",
"│ Steven Truitt 2 │\n",
"│ │\n",
"│ ## Jonathan Larson 1 │\n",
"│ │\n",
"│ 1 Microsoft Research 2 Microsoft Strategic Missions and Technologies 3 Microsoft Office of the CTO │\n",
"│ │\n",
"│ { daedge,trinhha,newmancheng,joshbradley,achao,moapurva,steventruitt,jolarso } @microsoft.com │\n",
"│ │\n",
"│ † These authors contributed equally to this work │\n",
"│ │\n",
"│ ## Abstract │\n",
"│ │\n",
"│ The use of retrieval-augmented gen... │\n",
"╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from rich.console import Console\n",
"from rich.panel import Panel\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"console = Console()\n",
"\n",
"# This URL points to the Microsoft GraphRAG Research Paper (arXiv: 2404.16130), ~15 pages\n",
"source_url = \"https://arxiv.org/pdf/2404.16130\"\n",
"\n",
"console.print(\n",
" \"[bold yellow]Parsing a ~15-page PDF. The process should be relatively quick, even on CPU...[/bold yellow]\"\n",
")\n",
"converter = DocumentConverter()\n",
"result = converter.convert(source_url)\n",
"\n",
"# Optional: preview the parsed Markdown\n",
"md_preview = result.document.export_to_markdown()\n",
"console.print(Panel(md_preview[:500] + \"...\", title=\"Docling Markdown Preview\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 2: Hierarchical Chunking\n",
"We convert the `Document` into smaller chunks for embedding and indexing. The built-in `HierarchicalChunker` preserves structure. "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Total chunks from PDF: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">106</span>\n",
"</pre>\n"
],
"text/plain": [
"Total chunks from PDF: \u001b[1;36m106\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from docling.chunking import HierarchicalChunker\n",
"\n",
"chunker = HierarchicalChunker()\n",
"doc_chunks = list(chunker.chunk(result.document))\n",
"\n",
"all_chunks = []\n",
"for idx, c in enumerate(doc_chunks):\n",
" chunk_text = c.text\n",
" all_chunks.append((f\"chunk_{idx}\", chunk_text))\n",
"\n",
"console.print(f\"Total chunks from PDF: {len(all_chunks)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 3: Create Azure AI Search Index and Push Chunk Embeddings\n",
"Well define a vector index in Azure AI Search, then embed each chunk using Azure OpenAI and upload in batches."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Index <span style=\"color: #008000; text-decoration-color: #008000\">'docling-rag-sample-2'</span> created.\n",
"</pre>\n"
],
"text/plain": [
"Index \u001b[32m'docling-rag-sample-2'\u001b[0m created.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from azure.core.credentials import AzureKeyCredential\n",
"from azure.search.documents.indexes import SearchIndexClient\n",
"from azure.search.documents.indexes.models import (\n",
" AzureOpenAIVectorizer,\n",
" AzureOpenAIVectorizerParameters,\n",
" HnswAlgorithmConfiguration,\n",
" SearchableField,\n",
" SearchField,\n",
" SearchFieldDataType,\n",
" SearchIndex,\n",
" SimpleField,\n",
" VectorSearch,\n",
" VectorSearchProfile,\n",
")\n",
"from rich.console import Console\n",
"\n",
"console = Console()\n",
"\n",
"VECTOR_DIM = 1536 # Adjust based on your chosen embeddings model\n",
"\n",
"index_client = SearchIndexClient(\n",
" AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY)\n",
")\n",
"\n",
"\n",
"def create_search_index(index_name: str):\n",
" # Define fields\n",
" fields = [\n",
" SimpleField(name=\"chunk_id\", type=SearchFieldDataType.String, key=True),\n",
" SearchableField(name=\"content\", type=SearchFieldDataType.String),\n",
" SearchField(\n",
" name=\"content_vector\",\n",
" type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n",
" searchable=True,\n",
" filterable=False,\n",
" sortable=False,\n",
" facetable=False,\n",
" vector_search_dimensions=VECTOR_DIM,\n",
" vector_search_profile_name=\"default\",\n",
" ),\n",
" ]\n",
" # Vector search config with an AzureOpenAIVectorizer\n",
" vector_search = VectorSearch(\n",
" algorithms=[HnswAlgorithmConfiguration(name=\"default\")],\n",
" profiles=[\n",
" VectorSearchProfile(\n",
" name=\"default\",\n",
" algorithm_configuration_name=\"default\",\n",
" vectorizer_name=\"default\",\n",
" )\n",
" ],\n",
" vectorizers=[\n",
" AzureOpenAIVectorizer(\n",
" vectorizer_name=\"default\",\n",
" parameters=AzureOpenAIVectorizerParameters(\n",
" resource_url=AZURE_OPENAI_ENDPOINT,\n",
" deployment_name=AZURE_OPENAI_EMBEDDINGS,\n",
" model_name=\"text-embedding-3-small\",\n",
" api_key=AZURE_OPENAI_API_KEY,\n",
" ),\n",
" )\n",
" ],\n",
" )\n",
"\n",
" # Create or update the index\n",
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
" try:\n",
" index_client.delete_index(index_name)\n",
" except:\n",
" pass\n",
"\n",
" index_client.create_or_update_index(new_index)\n",
" console.print(f\"Index '{index_name}' created.\")\n",
"\n",
"\n",
"create_search_index(AZURE_SEARCH_INDEX_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Generate Embeddings and Upload to Azure AI Search\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploaded batch <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> -&gt; <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50</span>; all_succeeded: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, first_doc_status_code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">201</span>\n",
"</pre>\n"
],
"text/plain": [
"Uploaded batch \u001b[1;36m0\u001b[0m -> \u001b[1;36m50\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploaded batch <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50</span> -&gt; <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">100</span>; all_succeeded: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, first_doc_status_code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">201</span>\n",
"</pre>\n"
],
"text/plain": [
"Uploaded batch \u001b[1;36m50\u001b[0m -> \u001b[1;36m100\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploaded batch <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">100</span> -&gt; <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">106</span>; all_succeeded: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, first_doc_status_code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">201</span>\n",
"</pre>\n"
],
"text/plain": [
"Uploaded batch \u001b[1;36m100\u001b[0m -> \u001b[1;36m106\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">All chunks uploaded to Azure Search.\n",
"</pre>\n"
],
"text/plain": [
"All chunks uploaded to Azure Search.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from azure.search.documents import SearchClient\n",
"from openai import AzureOpenAI\n",
"\n",
"search_client = SearchClient(\n",
" AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)\n",
")\n",
"openai_client = AzureOpenAI(\n",
" api_key=AZURE_OPENAI_API_KEY,\n",
" api_version=AZURE_OPENAI_API_VERSION,\n",
" azure_endpoint=AZURE_OPENAI_ENDPOINT,\n",
")\n",
"\n",
"\n",
"def embed_text(text: str):\n",
" \"\"\"\n",
" Helper to generate embeddings with Azure OpenAI.\n",
" \"\"\"\n",
" response = openai_client.embeddings.create(\n",
" input=text, model=AZURE_OPENAI_EMBEDDINGS\n",
" )\n",
" return response.data[0].embedding\n",
"\n",
"\n",
"upload_docs = []\n",
"for chunk_id, chunk_text in all_chunks:\n",
" embedding_vector = embed_text(chunk_text)\n",
" upload_docs.append(\n",
" {\n",
" \"chunk_id\": chunk_id,\n",
" \"content\": chunk_text,\n",
" \"content_vector\": embedding_vector,\n",
" }\n",
" )\n",
"\n",
"\n",
"BATCH_SIZE = 50\n",
"for i in range(0, len(upload_docs), BATCH_SIZE):\n",
" subset = upload_docs[i : i + BATCH_SIZE]\n",
" resp = search_client.upload_documents(documents=subset)\n",
"\n",
" all_succeeded = all(r.succeeded for r in resp)\n",
" console.print(\n",
" f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
" f\"first_doc_status_code: {resp[0].status_code}\"\n",
" )\n",
"\n",
"console.print(\"All chunks uploaded to Azure Search.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 4: Perform RAG over PDF\n",
"Combine retrieval from Azure AI Search with Azure OpenAI Chat Completions (aka. grounding your LLM)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╭──────────────────────────────────────────────────</span> RAG Prompt <span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">───────────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ You are an AI assistant helping answering questions about Microsoft GraphRAG. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Use ONLY the text below to answer the user's question. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ If the answer isn't in the text, say you don't know. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Context: │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Community summaries vs. source texts. When comparing community summaries to source texts using Graph RAG, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ community summaries generally provided a small but consistent improvement in answer comprehensiveness and │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ diversity, except for root-level summaries. Intermediate-level summaries in the Podcast dataset and low-level │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ community summaries in the News dataset achieved comprehensiveness win rates of 57% and 64%, respectively. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Diversity win rates were 57% for Podcast intermediate-level summaries and 60% for News low-level community │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summaries. Table 3 also illustrates the scalability advantages of Graph RAG compared to source text │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summarization: for low-level community summaries ( C3 ), Graph RAG required 26-33% fewer context tokens, while │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ for root-level community summaries ( C0 ), it required over 97% fewer tokens. For a modest drop in performance │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ compared with other global methods, root-level Graph RAG offers a highly efficient method for the iterative │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ question answering that characterizes sensemaking activity, while retaining advantages in comprehensiveness │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (72% win rate) and diversity (62% win rate) over na¨ıve RAG. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ We have presented a global approach to Graph RAG, combining knowledge graph generation, retrieval-augmented │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generation (RAG), and query-focused summarization (QFS) to support human sensemaking over entire text corpora. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Initial evaluations show substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ diversity of answers, as well as favorable comparisons to a global but graph-free approach using map-reduce │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ source text summarization. For situations requiring many global queries over the same dataset, summaries of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ root-level communities in the entity-based graph index provide a data index that is both superior to na¨ıve RAG │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ and achieves competitive performance to other global methods at a fraction of the token cost. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Trade-offs of building a graph index . We consistently observed Graph RAG achieve the best headto-head results │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ against other methods, but in many cases the graph-free approach to global summarization of source texts │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ performed competitively. The real-world decision about whether to invest in building a graph index depends on │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ multiple factors, including the compute budget, expected number of lifetime queries per dataset, and value │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ obtained from other aspects of the graph index (including the generic community summaries and the use of other │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ graph-related RAG approaches). │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Future work . The graph index, rich text annotations, and hierarchical community structure supporting the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ current Graph RAG approach offer many possibilities for refinement and adaptation. This includes RAG approaches │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ that operate in a more local manner, via embedding-based matching of user queries and graph annotations, as │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ well as the possibility of hybrid RAG schemes that combine embedding-based matching against community reports │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ before employing our map-reduce summarization mechanisms. This 'roll-up' operation could also be extended │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ across more levels of the community hierarchy, as well as implemented as a more exploratory 'drill down' │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ mechanism that follows the information scent contained in higher-level community summaries. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Advanced RAG systems include pre-retrieval, retrieval, post-retrieval strategies designed to overcome the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ drawbacks of Na¨ıve RAG, while Modular RAG systems include patterns for iterative and dynamic cycles of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ interleaved retrieval and generation (Gao et al., 2023). Our implementation of Graph RAG incorporates multiple │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ concepts related to other systems. For example, our community summaries are a kind of self-memory (Selfmem, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Cheng et al., 2024) for generation-augmented retrieval (GAR, Mao et al., 2020) that facilitates future │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generation cycles, while our parallel generation of community answers from these summaries is a kind of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ iterative (Iter-RetGen, Shao et al., 2023) or federated (FeB4RAG, Wang et al., 2024) retrieval-generation │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ strategy. Other systems have also combined these concepts for multi-document summarization (CAiRE-COVID, Su et │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ al., 2020) and multi-hop question answering (ITRG, Feng et al., 2023; IR-CoT, Trivedi et al., 2022; DSP, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Khattab et al., 2022). Our use of a hierarchical index and summarization also bears resemblance to further │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ approaches, such as generating a hierarchical index of text chunks by clustering the vectors of text embeddings │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (RAPTOR, Sarthi et al., 2024) or generating a 'tree of clarifications' to answer multiple interpretations of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ ambiguous questions (Kim et al., 2023). However, none of these iterative or hierarchical approaches use the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ kind of self-generated graph index that enables Graph RAG. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ The use of retrieval-augmented generation (RAG) to retrieve relevant information from an external knowledge │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ source enables large language models (LLMs) to answer questions over private and/or previously unseen document │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ collections. However, RAG fails on global questions directed at an entire text corpus, such as 'What are the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ main themes in the dataset?', since this is inherently a queryfocused summarization (QFS) task, rather than an │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ explicit retrieval task. Prior QFS methods, meanwhile, fail to scale to the quantities of text indexed by │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ typical RAGsystems. To combine the strengths of these contrasting methods, we propose a Graph RAG approach to │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ question answering over private text corpora that scales with both the generality of user questions and the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ quantity of source text to be indexed. Our approach uses an LLM to build a graph-based text index in two │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ stages: first to derive an entity knowledge graph from the source documents, then to pregenerate community │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summaries for all groups of closely-related entities. Given a question, each community summary is used to │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generate a partial response, before all partial responses are again summarized in a final response to the user. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ For a class of global sensemaking questions over datasets in the 1 million token range, we show that Graph RAG │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ leads to substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and diversity of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generated answers. An open-source, Python-based implementation of both global and local Graph RAG approaches is │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ forthcoming at https://aka . ms/graphrag . │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Given the multi-stage nature of our Graph RAG mechanism, the multiple conditions we wanted to compare, and the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ lack of gold standard answers to our activity-based sensemaking questions, we decided to adopt a head-to-head │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ comparison approach using an LLM evaluator. We selected three target metrics capturing qualities that are │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ desirable for sensemaking activities, as well as a control metric (directness) used as a indicator of validity. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Since directness is effectively in opposition to comprehensiveness and diversity, we would not expect any │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ method to win across all four metrics. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Figure 1: Graph RAG pipeline using an LLM-derived graph index of source document text. This index spans nodes │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (e.g., entities), edges (e.g., relationships), and covariates (e.g., claims) that have been detected, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ extracted, and summarized by LLM prompts tailored to the domain of the dataset. Community detection (e.g., │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Leiden, Traag et al., 2019) is used to partition the graph index into groups of elements (nodes, edges, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ covariates) that the LLM can summarize in parallel at both indexing time and query time. The 'global answer' to │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ a given query is produced using a final round of query-focused summarization over all community summaries │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ reporting relevance to that query. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Retrieval-augmented generation (RAG, Lewis et al., 2020) is an established approach to answering user questions │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ over entire datasets, but it is designed for situations where these answers are contained locally within │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ regions of text whose retrieval provides sufficient grounding for the generation task. Instead, a more │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ appropriate task framing is query-focused summarization (QFS, Dang, 2006), and in particular, query-focused │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ abstractive summarization that generates natural language summaries and not just concatenated excerpts (Baumel │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ et al., 2018; Laskar et al., 2020; Yao et al., 2017) . In recent years, however, such distinctions between │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summarization tasks that are abstractive versus extractive, generic versus query-focused, and single-document │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ versus multi-document, have become less relevant. While early applications of the transformer architecture │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ showed substantial improvements on the state-of-the-art for all such summarization tasks (Goodwin et al., 2020; │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Laskar et al., 2022; Liu and Lapata, 2019), these tasks are now trivialized by modern LLMs, including the GPT │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (Achiam et al., 2023; Brown et al., 2020), Llama (Touvron et al., 2023), and Gemini (Anil et al., 2023) series, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ all of which can use in-context learning to summarize any content provided in their context window. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ community descriptions provide complete coverage of the underlying graph index and the input documents it │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ represents. Query-focused summarization of an entire corpus is then made possible using a map-reduce approach: │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ first using each community summary to answer the query independently and in parallel, then summarizing all │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ relevant partial answers into a final global answer. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Question: What are the main advantages of using the Graph RAG approach for query-focused summarization compared │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ to traditional RAG methods? │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Answer: │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;31m╭─\u001b[0m\u001b[1;31m─────────────────────────────────────────────────\u001b[0m RAG Prompt \u001b[1;31m──────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mYou are an AI assistant helping answering questions about Microsoft GraphRAG.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mUse ONLY the text below to answer the user's question.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mIf the answer isn't in the text, say you don't know.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mContext:\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mCommunity summaries vs. source texts. When comparing community summaries to source texts using Graph RAG, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcommunity summaries generally provided a small but consistent improvement in answer comprehensiveness and \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdiversity, except for root-level summaries. Intermediate-level summaries in the Podcast dataset and low-level \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcommunity summaries in the News dataset achieved comprehensiveness win rates of 57% and 64%, respectively. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mDiversity win rates were 57% for Podcast intermediate-level summaries and 60% for News low-level community \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummaries. Table 3 also illustrates the scalability advantages of Graph RAG compared to source text \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummarization: for low-level community summaries ( C3 ), Graph RAG required 26-33% fewer context tokens, while \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mfor root-level community summaries ( C0 ), it required over 97% fewer tokens. For a modest drop in performance \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcompared with other global methods, root-level Graph RAG offers a highly efficient method for the iterative \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mquestion answering that characterizes sensemaking activity, while retaining advantages in comprehensiveness \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(72% win rate) and diversity (62% win rate) over na¨ıve RAG.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mWe have presented a global approach to Graph RAG, combining knowledge graph generation, retrieval-augmented \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgeneration (RAG), and query-focused summarization (QFS) to support human sensemaking over entire text corpora. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mInitial evaluations show substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdiversity of answers, as well as favorable comparisons to a global but graph-free approach using map-reduce \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msource text summarization. For situations requiring many global queries over the same dataset, summaries of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mroot-level communities in the entity-based graph index provide a data index that is both superior to na¨ıve RAG\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mand achieves competitive performance to other global methods at a fraction of the token cost.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mTrade-offs of building a graph index . We consistently observed Graph RAG achieve the best headto-head results \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31magainst other methods, but in many cases the graph-free approach to global summarization of source texts \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mperformed competitively. The real-world decision about whether to invest in building a graph index depends on \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmultiple factors, including the compute budget, expected number of lifetime queries per dataset, and value \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mobtained from other aspects of the graph index (including the generic community summaries and the use of other \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgraph-related RAG approaches).\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mFuture work . The graph index, rich text annotations, and hierarchical community structure supporting the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcurrent Graph RAG approach offer many possibilities for refinement and adaptation. This includes RAG approaches\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mthat operate in a more local manner, via embedding-based matching of user queries and graph annotations, as \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mwell as the possibility of hybrid RAG schemes that combine embedding-based matching against community reports \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mbefore employing our map-reduce summarization mechanisms. This 'roll-up' operation could also be extended \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31macross more levels of the community hierarchy, as well as implemented as a more exploratory 'drill down' \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmechanism that follows the information scent contained in higher-level community summaries.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAdvanced RAG systems include pre-retrieval, retrieval, post-retrieval strategies designed to overcome the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdrawbacks of Na¨ıve RAG, while Modular RAG systems include patterns for iterative and dynamic cycles of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31minterleaved retrieval and generation (Gao et al., 2023). Our implementation of Graph RAG incorporates multiple \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mconcepts related to other systems. For example, our community summaries are a kind of self-memory (Selfmem, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mCheng et al., 2024) for generation-augmented retrieval (GAR, Mao et al., 2020) that facilitates future \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgeneration cycles, while our parallel generation of community answers from these summaries is a kind of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31miterative (Iter-RetGen, Shao et al., 2023) or federated (FeB4RAG, Wang et al., 2024) retrieval-generation \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstrategy. Other systems have also combined these concepts for multi-document summarization (CAiRE-COVID, Su et \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mal., 2020) and multi-hop question answering (ITRG, Feng et al., 2023; IR-CoT, Trivedi et al., 2022; DSP, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mKhattab et al., 2022). Our use of a hierarchical index and summarization also bears resemblance to further \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mapproaches, such as generating a hierarchical index of text chunks by clustering the vectors of text embeddings\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(RAPTOR, Sarthi et al., 2024) or generating a 'tree of clarifications' to answer multiple interpretations of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mambiguous questions (Kim et al., 2023). However, none of these iterative or hierarchical approaches use the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mkind of self-generated graph index that enables Graph RAG.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mThe use of retrieval-augmented generation (RAG) to retrieve relevant information from an external knowledge \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msource enables large language models (LLMs) to answer questions over private and/or previously unseen document \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcollections. However, RAG fails on global questions directed at an entire text corpus, such as 'What are the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmain themes in the dataset?', since this is inherently a queryfocused summarization (QFS) task, rather than an \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mexplicit retrieval task. Prior QFS methods, meanwhile, fail to scale to the quantities of text indexed by \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mtypical RAGsystems. To combine the strengths of these contrasting methods, we propose a Graph RAG approach to \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mquestion answering over private text corpora that scales with both the generality of user questions and the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mquantity of source text to be indexed. Our approach uses an LLM to build a graph-based text index in two \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstages: first to derive an entity knowledge graph from the source documents, then to pregenerate community \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummaries for all groups of closely-related entities. Given a question, each community summary is used to \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgenerate a partial response, before all partial responses are again summarized in a final response to the user.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mFor a class of global sensemaking questions over datasets in the 1 million token range, we show that Graph RAG \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mleads to substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and diversity of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgenerated answers. An open-source, Python-based implementation of both global and local Graph RAG approaches is\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mforthcoming at https://aka . ms/graphrag .\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mGiven the multi-stage nature of our Graph RAG mechanism, the multiple conditions we wanted to compare, and the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mlack of gold standard answers to our activity-based sensemaking questions, we decided to adopt a head-to-head \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcomparison approach using an LLM evaluator. We selected three target metrics capturing qualities that are \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdesirable for sensemaking activities, as well as a control metric (directness) used as a indicator of validity.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mSince directness is effectively in opposition to comprehensiveness and diversity, we would not expect any \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmethod to win across all four metrics.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mFigure 1: Graph RAG pipeline using an LLM-derived graph index of source document text. This index spans nodes \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(e.g., entities), edges (e.g., relationships), and covariates (e.g., claims) that have been detected, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mextracted, and summarized by LLM prompts tailored to the domain of the dataset. Community detection (e.g., \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mLeiden, Traag et al., 2019) is used to partition the graph index into groups of elements (nodes, edges, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcovariates) that the LLM can summarize in parallel at both indexing time and query time. The 'global answer' to\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31ma given query is produced using a final round of query-focused summarization over all community summaries \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mreporting relevance to that query.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mRetrieval-augmented generation (RAG, Lewis et al., 2020) is an established approach to answering user questions\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mover entire datasets, but it is designed for situations where these answers are contained locally within \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mregions of text whose retrieval provides sufficient grounding for the generation task. Instead, a more \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mappropriate task framing is query-focused summarization (QFS, Dang, 2006), and in particular, query-focused \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mabstractive summarization that generates natural language summaries and not just concatenated excerpts (Baumel \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31met al., 2018; Laskar et al., 2020; Yao et al., 2017) . In recent years, however, such distinctions between \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummarization tasks that are abstractive versus extractive, generic versus query-focused, and single-document \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mversus multi-document, have become less relevant. While early applications of the transformer architecture \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mshowed substantial improvements on the state-of-the-art for all such summarization tasks (Goodwin et al., 2020;\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mLaskar et al., 2022; Liu and Lapata, 2019), these tasks are now trivialized by modern LLMs, including the GPT \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(Achiam et al., 2023; Brown et al., 2020), Llama (Touvron et al., 2023), and Gemini (Anil et al., 2023) series,\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mall of which can use in-context learning to summarize any content provided in their context window.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcommunity descriptions provide complete coverage of the underlying graph index and the input documents it \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mrepresents. Query-focused summarization of an entire corpus is then made possible using a map-reduce approach: \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mfirst using each community summary to answer the query independently and in parallel, then summarizing all \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mrelevant partial answers into a final global answer.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mQuestion: What are the main advantages of using the Graph RAG approach for query-focused summarization compared\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mto traditional RAG methods?\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAnswer:\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭─────────────────────────────────────────────────</span> RAG Response <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">──────────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ The main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ methods include: │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 1. **Improved Comprehensiveness and Diversity**: Graph RAG shows substantial improvements over a naïve RAG │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ baseline in terms of the comprehensiveness and diversity of answers. This is particularly beneficial for global │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ sensemaking questions over large datasets. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 2. **Scalability**: Graph RAG provides scalability advantages, achieving efficient summarization with │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ significantly fewer context tokens required. For instance, it requires 26-33% fewer tokens for low-level │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ community summaries and over 97% fewer tokens for root-level summaries compared to source text summarization. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 3. **Efficiency in Iterative Question Answering**: Root-level Graph RAG offers a highly efficient method for │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ iterative question answering, which is crucial for sensemaking activities, with only a modest drop in │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ performance compared to other global methods. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 4. **Global Query Handling**: It supports handling global queries effectively, as it combines knowledge graph │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ generation, retrieval-augmented generation, and query-focused summarization, making it suitable for sensemaking │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ over entire text corpora. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 5. **Hierarchical Indexing and Summarization**: The use of a hierarchical index and summarization allows for │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ efficient processing and summarizing of community summaries into a final global answer, facilitating a │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ comprehensive coverage of the underlying graph index and input documents. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 6. **Reduced Token Cost**: For situations requiring many global queries over the same dataset, Graph RAG │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ achieves competitive performance to other global methods at a fraction of the token cost. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;32m╭─\u001b[0m\u001b[1;32m────────────────────────────────────────────────\u001b[0m RAG Response \u001b[1;32m─────────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mThe main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mmethods include:\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m1. **Improved Comprehensiveness and Diversity**: Graph RAG shows substantial improvements over a naïve RAG \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mbaseline in terms of the comprehensiveness and diversity of answers. This is particularly beneficial for global\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32msensemaking questions over large datasets.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m2. **Scalability**: Graph RAG provides scalability advantages, achieving efficient summarization with \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32msignificantly fewer context tokens required. For instance, it requires 26-33% fewer tokens for low-level \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mcommunity summaries and over 97% fewer tokens for root-level summaries compared to source text summarization.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m3. **Efficiency in Iterative Question Answering**: Root-level Graph RAG offers a highly efficient method for \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32miterative question answering, which is crucial for sensemaking activities, with only a modest drop in \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mperformance compared to other global methods.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m4. **Global Query Handling**: It supports handling global queries effectively, as it combines knowledge graph \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mgeneration, retrieval-augmented generation, and query-focused summarization, making it suitable for sensemaking\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mover entire text corpora.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m5. **Hierarchical Indexing and Summarization**: The use of a hierarchical index and summarization allows for \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mefficient processing and summarizing of community summaries into a final global answer, facilitating a \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mcomprehensive coverage of the underlying graph index and input documents.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m6. **Reduced Token Cost**: For situations requiring many global queries over the same dataset, Graph RAG \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32machieves competitive performance to other global methods at a fraction of the token cost.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from azure.search.documents.models import VectorizableTextQuery\n",
"\n",
"\n",
"def generate_chat_response(prompt: str, system_message: str = None):\n",
" \"\"\"\n",
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
" If you need multi-turn conversation or follow-up queries, you'll have to\n",
" maintain the messages list externally.\n",
" \"\"\"\n",
" messages = []\n",
" if system_message:\n",
" messages.append({\"role\": \"system\", \"content\": system_message})\n",
" messages.append({\"role\": \"user\", \"content\": prompt})\n",
"\n",
" completion = openai_client.chat.completions.create(\n",
" model=AZURE_OPENAI_CHAT_MODEL, messages=messages, temperature=0.7\n",
" )\n",
" return completion.choices[0].message.content\n",
"\n",
"\n",
"user_query = \"What are the main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG methods?\"\n",
"user_embed = embed_text(user_query)\n",
"\n",
"vector_query = VectorizableTextQuery(\n",
" text=user_query, # passing in text for a hybrid search\n",
" k_nearest_neighbors=5,\n",
" fields=\"content_vector\",\n",
")\n",
"\n",
"search_results = search_client.search(\n",
" search_text=user_query, vector_queries=[vector_query], select=[\"content\"], top=10\n",
")\n",
"\n",
"retrieved_chunks = []\n",
"for result in search_results:\n",
" snippet = result[\"content\"]\n",
" retrieved_chunks.append(snippet)\n",
"\n",
"context_str = \"\\n---\\n\".join(retrieved_chunks)\n",
"rag_prompt = f\"\"\"\n",
"You are an AI assistant helping answering questions about Microsoft GraphRAG.\n",
"Use ONLY the text below to answer the user's question.\n",
"If the answer isn't in the text, say you don't know.\n",
"\n",
"Context:\n",
"{context_str}\n",
"\n",
"Question: {user_query}\n",
"Answer:\n",
"\"\"\"\n",
"\n",
"final_answer = generate_chat_response(rag_prompt)\n",
"\n",
"console.print(Panel(rag_prompt, title=\"RAG Prompt\", style=\"bold red\"))\n",
"console.print(Panel(final_answer, title=\"RAG Response\", style=\"bold green\"))"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,58 @@
import os
from huggingface_hub import snapshot_download
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import (
ConversionResult,
DocumentConverter,
InputFormat,
PdfFormatOption,
)
def main():
# Source document to convert
source = "https://arxiv.org/pdf/2408.09869v4"
# Download RapidOCR models from Hugging Face
print("Downloading RapidOCR models")
download_path = snapshot_download(repo_id="SWHL/RapidOCR")
# Set up RapidOcrOptions for English detection
det_model_path = os.path.join(
download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"
)
rec_model_path = os.path.join(
download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"
)
cls_model_path = os.path.join(
download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
)
ocr_options = RapidOcrOptions(
det_model_path=det_model_path,
rec_model_path=rec_model_path,
cls_model_path=cls_model_path,
)
pipeline_options = PdfPipelineOptions(
ocr_options=ocr_options,
)
# Convert the document
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
),
},
)
conversion_result: ConversionResult = converter.convert(source=source)
doc = conversion_result.document
md = doc.export_to_markdown()
print(md)
if __name__ == "__main__":
main()

View File

@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
# Explicitly set the accelerator
# accelerator_options = AcceleratorOptions(
@ -30,6 +30,9 @@ def main():
# num_threads=8, device=AcceleratorDevice.CUDA
# )
# easyocr doesn't support cuda:N allocation, defaults to cuda:0
# accelerator_options = AcceleratorOptions(num_threads=8, device="cuda:1")
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True

View File

@ -25,9 +25,8 @@ def main():
Path("tests/data/docx/lorem_ipsum.docx"),
Path("tests/data/pptx/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/pdf/2206.01062.pdf"),
Path("tests/data/asciidoc/test_01.asciidoc"),
]
## for defaults use:
@ -44,6 +43,7 @@ def main():
InputFormat.HTML,
InputFormat.PPTX,
InputFormat.ASCIIDOC,
InputFormat.CSV,
InputFormat.MD,
], # whitelist formats, non-matching files are ignored.
format_options={

View File

@ -0,0 +1,37 @@
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
# ocr_options = TesseractOcrOptions(lang=["auto"])
ocr_options = TesseractCliOcrOptions(lang=["auto"])
pipeline_options = PdfPipelineOptions(
do_ocr=True, force_full_page_ocr=True, ocr_options=ocr_options
)
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
)
}
)
doc = converter.convert(input_doc).document
md = doc.export_to_markdown()
print(md)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,75 @@
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0
# FIXME: put in your favorite translation code ....
def translate(text: str, src: str = "en", dest: str = "de"):
_log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
# from googletrans import Translator
# Initialize the translator
# translator = Translator()
# Translate text from English to German
# text = "Hello, how are you?"
# translated = translator.translate(text, src="en", dest="de")
return text
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will remove them to free up memory.
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
# The PdfPipelineOptions.generate_* options select which document elements will be enriched
# with the image field
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path)
conv_doc = conv_res.document
# Derive the output file stem from the input document and make sure the output dir exists
doc_filename = conv_res.input.file.stem
output_dir.mkdir(parents=True, exist_ok=True)
# Save markdown with embedded pictures in original text
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)
for element, _level in conv_res.document.iterate_items():
if isinstance(element, TextItem):
element.orig = element.text
element.text = translate(text=element.text)
elif isinstance(element, TableItem):
for cell in element.data.table_cells:
cell.text = translate(text=cell.text)
# Save markdown with embedded pictures in translated text
md_filename = output_dir / f"{doc_filename}-with-images-translated.md"
conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

View File

@ -7,28 +7,7 @@ This is a collection of FAQ collected from the user questions on <https://github
### Is Python 3.13 supported?
Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
At the moment, no release has full support, but nightly builds are available. Docling was tested on Python 3.13 with the following steps:
```sh
# Create a python 3.13 virtualenv
python3.13 -m venv venv
source ./venv/bin/activate
# Install torch nightly builds, see https://pytorch.org/
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
# Install docling
pip3 install docling
# Run docling
docling --no-ocr https://arxiv.org/pdf/2408.09869
```
_Note: we are disabling OCR since easyocr and the nightly torch builds have some conflicts._
Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
Python 3.13 is supported from Docling 2.18.0.
??? question "Install conflicts with numpy (python 3.13)"
@ -123,6 +102,12 @@ This is a collection of FAQ collected from the user questions on <https://github
- Update to the latest version of [certifi](https://pypi.org/project/certifi/), i.e. `pip install --upgrade certifi`
- Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
- Set environment variables `SSL_CERT_FILE` and `REQUESTS_CA_BUNDLE` to the value of `python -m certifi`:
```
CERT_PATH=$(python -m certifi)
export SSL_CERT_FILE=${CERT_PATH}
export REQUESTS_CA_BUNDLE=${CERT_PATH}
```
??? question "Which OCR languages are supported?"
@ -145,3 +130,50 @@ This is a collection of FAQ collected from the user questions on <https://github
pipeline_options = PdfPipelineOptions()
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
```
??? question "Some images are missing from MS Word and Powerpoint"
### Some images are missing from MS Word and Powerpoint
The image processing library used by Docling can handle embedded WMF images only on the Windows platform.
If you are on other operating systems, these images will be ignored.
??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"
### `HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'
**TLDR**:
In the context of the `HybridChunker`, this is a known & anticipated "false alarm".
**Details**:
Using the [`HybridChunker`](../concepts/chunking.md#hybrid-chunker) often triggers a warning like this:
> Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
This is a warning that is emitted by transformers, saying that actually *running this sequence through the model* will result in indexing errors, i.e. the problematic case is only if one indeed passes the particular sequence through the (embedding) model.
In our case though, this occurs as a "false alarm", since what happens is the following:
- the chunker invokes the tokenizer on a potentially long sequence (e.g. 530 tokens as mentioned in the warning) in order to count its tokens, i.e. to assess if it is short enough. At this point transformers already emits the warning above!
- whenever the sequence at hand is oversized, the chunker proceeds to split it (but the transformers warning has already been shown nonetheless)
What is important is the actual token length of the produced chunks.
The snippet below can be used for getting the actual maximum chunk size (for users wanting to confirm that this does not exceed the model limit):
```python
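# Assumes `chunker`, `tokenizer`, and the `chunks` iterable are already defined,
# as in the HybridChunker example linked above.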
chunk_max_len = 0
for i, chunk in enumerate(chunks):
ser_txt = chunker.serialize(chunk=chunk)
ser_tokens = len(tokenizer.tokenize(ser_txt))
if ser_tokens > chunk_max_len:
chunk_max_len = ser_tokens
print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
print(f"Longest chunk yielded: {chunk_max_len} tokens")
print(f"Model max length: {tokenizer.model_max_length}")
```
Also see [docling#725](https://github.com/DS4SD/docling/issues/725).
Source: Issue [docling-core#119](https://github.com/DS4SD/docling-core/issues/119)

View File

@ -14,21 +14,25 @@
[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
[![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
Docling parses documents and exports them to the desired format with ease and speed.
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
## Features
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
* 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 OCR support for scanned PDFs
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
* 💻 Simple and convenient CLI
### Coming soon
* ♾️ Equation & code extraction
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
## Get started
@ -42,3 +46,7 @@ Docling parses documents and exports them to the desired format with ease and sp
## IBM ❤️ Open Source AI
Docling has been brought to you by IBM.
[supported_formats]: ./usage/supported_formats.md
[docling_document]: ./concepts/docling_document.md
[integrations]: ./integrations/index.md

docs/usage/enrichments.md
View File

@ -0,0 +1,216 @@
Docling allows enriching the conversion pipeline with additional steps which process specific document components,
e.g. code blocks, pictures, etc. The extra steps usually require extra model executions, which may increase
the processing time considerably. For this reason most enrichment models are disabled by default.

The following table provides an overview of the default enrichment models available in Docling; a combined usage sketch follows the table.
| Feature | Parameter | Processed item | Description |
| ------- | --------- | ---------------| ----------- |
| Code understanding | `do_code_enrichment` | `CodeItem` | See [docs below](#code-understanding). |
| Formula understanding | `do_formula_enrichment` | `TextItem` with label `FORMULA` | See [docs below](#formula-understanding). |
| Picture classification | `do_picture_classification` | `PictureItem` | See [docs below](#picture-classification). |
| Picture description | `do_picture_description` | `PictureItem` | See [docs below](#picture-description). |
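
As a quick reference, below is a combined usage sketch which enables several of these options at once on the standard PDF pipeline (each flag adds extra model executions, so enable only what you need):

```py
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
# Each flag below enables an extra enrichment model (and extra processing time)
pipeline_options.do_code_enrichment = True
pipeline_options.do_formula_enrichment = True
pipeline_options.do_picture_classification = True
# Picture classification operates on the picture images, so keep them in the document
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
doc = converter.convert("https://arxiv.org/pdf/2501.17887").document
```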
## Enrichments details
### Code understanding
The code understanding step allows using advanced parsing for code blocks found in the document.
This enrichment model also sets the `code_language` property of the `CodeItem`.
Model specs: see the [`CodeFormula` model card](https://huggingface.co/ds4sd/CodeFormula).
Example command line:
```sh
docling --enrich-code FILE
```
Example code:
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.do_code_enrichment = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
### Formula understanding
The formula understanding step will analyze the equation formulas in documents and extract their LaTeX representation.
The HTML export functions in the DoclingDocument will leverage the formula and visualize the result using the MathML HTML syntax.
Model specs: see the [`CodeFormula` model card](https://huggingface.co/ds4sd/CodeFormula).
Example command line:
```sh
docling --enrich-formula FILE
```
Example code:
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.do_formula_enrichment = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
### Picture classification
The picture classification step classifies the `PictureItem` elements in the document with the `DocumentFigureClassifier` model.
This model is specialized to understand the classes of pictures found in documents, e.g. different chart types, flow diagrams,
logos, signatures, etc.
Model specs: see the [`DocumentFigureClassifier` model card](https://huggingface.co/ds4sd/DocumentFigureClassifier).
Example command line:
```sh
docling --enrich-picture-classes FILE
```
Example code:
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2
pipeline_options.do_picture_classification = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
### Picture description
The picture description step allows annotating a picture with a vision model. This is also known as a "captioning" task.
The Docling pipeline allows loading and running models completely locally as well as connecting to remote APIs which support the chat template.
Below are a few examples of how to use some common vision models and remote services.
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
#### Granite Vision model
Model specs: see the [`ibm-granite/granite-vision-3.1-2b-preview` model card](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview).
Usage in Docling:
```py
from docling.datamodel.pipeline_options import granite_picture_description
pipeline_options.picture_description_options = granite_picture_description
```
#### SmolVLM model
Model specs: see the [`HuggingFaceTB/SmolVLM-256M-Instruct` model card](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct).
Usage in Docling:
```py
from docling.datamodel.pipeline_options import smolvlm_picture_description
pipeline_options.picture_description_options = smolvlm_picture_description
```
#### Other vision models
The option class `PictureDescriptionVlmOptions` allows using any other model from the Hugging Face Hub.
```py
from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
repo_id="", # <-- add here the Hugging Face repo_id of your favorite VLM
prompt="Describe the image in three sentences. Be consise and accurate.",
)
```
#### Remote vision model
The option class `PictureDescriptionApiOptions` allows using models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.
_Note: in most cases this option will send your data to the remote service provider._
Usage in Docling:
```py
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
# Enable connections to remote services
pipeline_options.enable_remote_services=True # <-- this is required!
# Example using a model running locally, e.g. via VLLM
# $ vllm serve MODEL_NAME
pipeline_options.picture_description_options = PictureDescriptionApiOptions(
url="http://localhost:8000/v1/chat/completions",
params=dict(
model="MODEL NAME",
seed=42,
max_completion_tokens=200,
),
prompt="Describe the image in three sentences. Be consise and accurate.",
timeout=90,
)
```
End-to-end code snippets for cloud providers are available in the examples section:
- [IBM watsonx.ai](../examples/pictures_description_api.py)
## Develop new enrichment models
Besides looking at the implementation of all the models listed above, the Docling documentation has a few examples
dedicated to the implementation of enrichment models.
- [Develop picture enrichment](../examples/develop_picture_enrichment.py)
- [Develop formula enrichment](../examples/develop_formula_understanding.py)

View File

@ -22,16 +22,98 @@ A simple example would look like this:
docling https://arxiv.org/pdf/2206.01062
```
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](../reference/cli.md).
### Advanced options
#### Model prefetching and offline usage
By default, models are downloaded automatically upon first usage. If you would prefer
to explicitly prefetch them for offline use (e.g. in air-gapped environments) you can do
that as follows:
**Step 1: Prefetch the models**
Use the `docling-tools models download` utility:
```sh
$ docling-tools models download
Downloading layout model...
Downloading tableformer model...
Downloading picture classifier model...
Downloading code formula model...
Downloading easyocr models...
Models downloaded into $HOME/.cache/docling/models.
```
Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()`.
**Step 2: Use the prefetched models**
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
artifacts_path = "/local/path/to/models"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
Or using the CLI:
```sh
docling --artifacts-path="/local/path/to/models" FILE
```
Or using the `DOCLING_ARTIFACTS_PATH` environment variable:
```sh
export DOCLING_ARTIFACTS_PATH="/local/path/to/models"
python my_docling_script.py
```
#### Using remote services
The main purpose of Docling is to run local models which do not share any user data with remote services.
However, there are valid use cases for processing part of the pipeline using remote services, for example invoking OCR engines from cloud vendors or using hosted LLMs.
In Docling we decided to allow such models, but we require the user to explicitly opt in to communicating with external services.
```py
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
pipeline_options = PdfPipelineOptions(enable_remote_services=True)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
If `enable_remote_services=True` is not set, the system will raise an `OperationNotAllowed()` exception.
_Note: This option is only related to the system sending user data to remote services. Control of pulling data (e.g. model weights) follows the logic described in [Model prefetching and offline usage](#model-prefetching-and-offline-usage)._
##### List of remote model services
The options in this list require the explicit `enable_remote_services=True` when processing the documents.
- `PictureDescriptionApiOptions`: Using vision models via API calls.
#### Adjust pipeline features
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
The example file [custom_convert.py](../examples/custom_convert.py) contains multiple ways
one can adjust the conversion pipeline and features.
##### Control PDF table extraction options
You can control whether table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
@ -53,7 +135,7 @@ doc_converter = DocumentConverter(
)
```
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (faster but less accurate) and `TableFormerMode.ACCURATE` (default) to receive better quality with difficult table structures.
```python
from docling.datamodel.base_models import InputFormat
@ -70,28 +152,6 @@ doc_converter = DocumentConverter(
)
```
##### Provide specific artifacts path
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# # to explicitly prefetch:
# artifacts_path = StandardPdfPipeline.download_models_hf()
artifacts_path = "/local/path/to/artifacts"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
#### Impose limits on the document size
@ -126,11 +186,44 @@ result = converter.convert(source)
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting uses 4 CPU threads.
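For example, prefixing a single CLI run with the variable, e.g. `OMP_NUM_THREADS=8 docling https://arxiv.org/pdf/2206.01062`, gives that conversion a budget of 8 threads.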
#### Use specific backend converters
!!! note
This section discusses directly invoking a [backend](../concepts/architecture.md),
i.e. using a low-level API. This should only be done when necessary. For most cases,
using a `DocumentConverter` (high-level API) as discussed in the sections above
should suffice, and it is the recommended way.
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](./supported_formats.md)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](../examples/run_with_formats.py) example.
Alternatively, you can use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
```python
import urllib.request
from io import BytesIO
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
url = "https://en.wikipedia.org/wiki/Duck"
text = urllib.request.urlopen(url).read()
in_doc = InputDocument(
path_or_stream=BytesIO(text),
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="duck.html",
)
backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
dl_doc = backend.convert()
print(dl_doc.export_to_markdown())
```
## Chunking
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
You can chunk a Docling document using a [chunker](../concepts/chunking.md), such as a
`HybridChunker`, as shown below (for more details check out
[this example](examples/hybrid_chunking.ipynb)):
[this example](../examples/hybrid_chunking.ipynb)):
```python
from docling.document_converter import DocumentConverter

View File

@ -0,0 +1,35 @@
Docling can parse various document formats into a unified representation (Docling
Document), which it can export to different formats too — check out
[Architecture](../concepts/architecture.md) for more details.

Below you can find a listing of all supported input and output formats; a minimal conversion sketch follows the tables.
## Supported input formats
| Format | Description |
|--------|-------------|
| PDF | |
| DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
| Markdown | |
| AsciiDoc | |
| HTML, XHTML | |
| CSV | |
| PNG, JPEG, TIFF, BMP | Image formats |
Schema-specific support:
| Format | Description |
|--------|-------------|
| USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
| Docling JSON | JSON-serialized [Docling Document](../concepts/docling_document.md) |
## Supported output formats
| Format | Description |
|--------|-------------|
| HTML | Both image embedding and referencing are supported |
| Markdown | |
| JSON | Lossless serialization of Docling Document |
| Text | Plain text, i.e. without Markdown markers |
| Doctags | |
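
As a minimal sketch connecting the two tables, assuming the `DoclingDocument` export helpers (`export_to_markdown`, `export_to_html`, `export_to_dict`), the snippet below parses one of the supported input formats and exports the result to a few of the output formats listed above:

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
doc = converter.convert("https://arxiv.org/pdf/2206.01062").document

# Export the unified Docling Document to several of the supported output formats
md = doc.export_to_markdown()
html = doc.export_to_html()
as_json = doc.export_to_dict()  # lossless, JSON-serializable representation
print(md[:300])
```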

View File

@ -95,8 +95,8 @@ doc_converter = (
More options are shown in the following example units:
- [run_with_formats.py](../examples/run_with_formats/)
- [custom_convert.py](../examples/custom_convert/)
- [run_with_formats.py](examples/run_with_formats.py)
- [custom_convert.py](examples/custom_convert.py)
### Converting documents
@ -117,12 +117,12 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
## Convert several files at once:
input_files = [
"tests/data/wiki_duck.html",
"tests/data/word_sample.docx",
"tests/data/lorem_ipsum.docx",
"tests/data/powerpoint_sample.pptx",
"tests/data/html/wiki_duck.html",
"tests/data/docx/word_sample.docx",
"tests/data/docx/lorem_ipsum.docx",
"tests/data/pptx/powerpoint_sample.pptx",
"tests/data/2305.03393v1-pg9-img.png",
"tests/data/2206.01062.pdf",
"tests/data/pdf/2206.01062.pdf",
]
# Directly pass list of files or streams to `convert_all`
@ -226,4 +226,4 @@ leverages the new `DoclingDocument` and provides a new, richer chunk output form
- any applicable headings for context
- any applicable captions for context
For an example, check out [Chunking usage](../usage/#chunking).
For an example, check out [Chunking usage](usage.md#chunking).

View File

@ -54,10 +54,14 @@ theme:
nav:
- Home:
- "Docling": index.md
- Installation: installation.md
- Usage: usage.md
- FAQ: faq.md
- Docling v2: v2.md
- Installation:
- Installation: installation/index.md
- Usage:
- Usage: usage/index.md
- Supported formats: usage/supported_formats.md
- Enrichment features: usage/enrichments.md
- FAQ:
- FAQ: faq/index.md
- Concepts:
- Concepts: concepts/index.md
- Architecture: concepts/architecture.md
@ -71,19 +75,31 @@ nav:
- "Batch conversion": examples/batch_convert.py
- "Multi-format conversion": examples/run_with_formats.py
- "Figure export": examples/export_figures.py
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
- "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py
- examples/backend_csv.ipynb
- examples/backend_xml_rag.ipynb
- ✂️ Chunking:
- "Hybrid chunking": examples/hybrid_chunking.ipynb
- 💬 RAG / QA:
- examples/hybrid_chunking.ipynb
- 🤖 RAG with AI dev frameworks:
- examples/rag_haystack.ipynb
- examples/rag_llamaindex.ipynb
- examples/rag_langchain.ipynb
- examples/rag_llamaindex.ipynb
- 🖼️ Picture annotation:
- "Annotate picture with local VLM": examples/pictures_description.ipynb
- "Annotate picture with remote VLM": examples/pictures_description_api.py
- ✨ Enrichment development:
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Formula enrichment": examples/develop_formula_understanding.py
- 🗂️ More examples:
- examples/rag_weaviate.ipynb
- RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb
- examples/rag_azuresearch.ipynb
- examples/retrieval_qdrant.ipynb
- Integrations:
- Integrations: integrations/index.md

poetry.lock (generated)

File diff suppressed because it is too large

View File

@ -0,0 +1,5 @@
1,2,3,4
a,b,c,d
a,",",c,d
a,b,c,d
a,b,c,d

View File

@ -0,0 +1,6 @@
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/

View File

@ -0,0 +1,5 @@
1,2,3
a,b,c,d
a,b,c,d
a,b,c,d
a,b,c,d

View File

@ -0,0 +1,6 @@
Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website
1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|zunigavanessa@smith.info|2020-08-24|http://www.stephenson.com/
2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|vmata@colon.com|2021-04-23|http://www.hobbs.com/
3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|beckycarr@hogan.com|2020-03-25|http://www.lawrence.com/
4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|stanleyblackwell@benson.org|2020-06-02|http://www.good-lyons.com/
5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|colinalvarado@miles.net|2021-04-17|https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website
1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;zunigavanessa@smith.info;2020-08-24;http://www.stephenson.com/
2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;vmata@colon.com;2021-04-23;http://www.hobbs.com/
3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;beckycarr@hogan.com;2020-03-25;http://www.lawrence.com/
4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;stanleyblackwell@benson.org;2020-06-02;http://www.good-lyons.com/
5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;colinalvarado@miles.net;2021-04-17;https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
4 5Cef8BFA16c5e3c Linda Olsen "Dominguez Mcmillan and Donovan" Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
5 053d585Ab6b3159 Joanna Bender "Martin Lang and Andrade" West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,5 @@
1,2,3,4
a,'b',c,d
a,b,c
a,b,c,d
a,b,c,d

View File

@ -0,0 +1,5 @@
1,2,3,4
a,b,c,d
a,b,c,d,e
a,b,c,d
a,b,c,d

Binary file not shown.

Binary file not shown.

View File

@ -4,34 +4,32 @@
<paragraph><location><page_1><loc_34><loc_77><loc_62><loc_78></location>{ ahn,nli,mly,taa } @zurich.ibm.com</paragraph>
<subtitle-level-1><location><page_1><loc_24><loc_71><loc_31><loc_73></location>Abstract</subtitle-level-1>
<subtitle-level-1><location><page_1><loc_52><loc_71><loc_67><loc_72></location>a. Picture of a table:</subtitle-level-1>
<paragraph><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</paragraph>
<subtitle-level-1><location><page_1><loc_8><loc_30><loc_21><loc_32></location>1. Introduction</subtitle-level-1>
<paragraph><location><page_1><loc_8><loc_10><loc_47><loc_29></location>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</paragraph>
<figure>
<location><page_1><loc_52><loc_62><loc_88><loc_71></location>
</figure>
<caption><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
<table>
<location><page_1><loc_52><loc_62><loc_88><loc_71></location>
<caption>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
<row_0><col_0><col_header>3</col_0><col_1><col_header>1</col_1></row_0>
<row_0><col_0><col_header>1</col_0></row_0>
</table>
<paragraph><location><page_1><loc_52><loc_58><loc_79><loc_60></location>- b. Red-annotation of bounding boxes, Blue-predictions by TableFormer</paragraph>
<paragraph><location><page_1><loc_52><loc_46><loc_80><loc_47></location>- c. Structure predicted by TableFormer:</paragraph>
<figure>
<location><page_1><loc_51><loc_48><loc_88><loc_57></location>
</figure>
<paragraph><location><page_1><loc_52><loc_46><loc_80><loc_47></location>- c. Structure predicted by TableFormer:</paragraph>
<figure>
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
</figure>
<caption><location><page_1><loc_50><loc_29><loc_89><loc_35></location>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
<table>
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
<row_0><col_0><col_header>0</col_0><col_1><col_header>1</col_1><col_2><col_header>1</col_2><col_3><col_header>2 1</col_3><col_4><col_header>2 1</col_4><col_5><body></col_5></row_0>
<row_1><col_0><body>3</col_0><col_1><body>4</col_1><col_2><body>5 3</col_2><col_3><body>6</col_3><col_4><body>7</col_4><col_5><body></col_5></row_1>
<row_2><col_0><body>8</col_0><col_1><body>9</col_1><col_2><body>10</col_2><col_3><body>11</col_3><col_4><body>12</col_4><col_5><body>2</col_5></row_2>
<row_3><col_0><body></col_0><col_1><body>13</col_1><col_2><body>14</col_2><col_3><body>15</col_3><col_4><body>16</col_4><col_5><body>2</col_5></row_3>
<row_4><col_0><body></col_0><col_1><body>17</col_1><col_2><body>18</col_2><col_3><body>19</col_3><col_4><body>20</col_4><col_5><body>2</col_5></row_4>
<row_0><col_0><body>0</col_0><col_1><body>1 2 1</col_1><col_2><body>1 2 1</col_2><col_3><body>1 2 1</col_3><col_4><body>1 2 1</col_4></row_0>
<row_1><col_0><body>3</col_0><col_1><body>4 3</col_1><col_2><body>5</col_2><col_3><body>6</col_3><col_4><body>7</col_4></row_1>
<row_2><col_0><body>8 2</col_0><col_1><body>9</col_1><col_2><body>10</col_2><col_3><body>11</col_3><col_4><body>12</col_4></row_2>
<row_3><col_0><body>13</col_0><col_1><body></col_1><col_2><body>14</col_2><col_3><body>15</col_3><col_4><body>16</col_4></row_3>
<row_4><col_0><body>17</col_0><col_1><body>18</col_1><col_2><body></col_2><col_3><body>19</col_3><col_4><body>20</col_4></row_4>
</table>
<paragraph><location><page_1><loc_50><loc_16><loc_89><loc_26></location>Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.</paragraph>
<paragraph><location><page_1><loc_50><loc_10><loc_89><loc_16></location>The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be</paragraph>
@ -55,7 +53,6 @@
<paragraph><location><page_3><loc_8><loc_21><loc_47><loc_38></location>Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.</paragraph>
<subtitle-level-1><location><page_3><loc_8><loc_18><loc_17><loc_20></location>3. Datasets</subtitle-level-1>
<paragraph><location><page_3><loc_8><loc_10><loc_47><loc_17></location>We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-</paragraph>
<caption><location><page_3><loc_50><loc_64><loc_89><loc_66></location>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
<figure>
<location><page_3><loc_51><loc_68><loc_90><loc_90></location>
<caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
@ -68,7 +65,6 @@
<paragraph><location><page_4><loc_8><loc_45><loc_47><loc_60></location>As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.</paragraph>
<paragraph><location><page_4><loc_8><loc_21><loc_47><loc_45></location>Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.</paragraph>
<paragraph><location><page_4><loc_8><loc_10><loc_47><loc_20></location>In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third</paragraph>
<caption><location><page_4><loc_50><loc_72><loc_89><loc_79></location>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
<table>
<location><page_4><loc_51><loc_80><loc_89><loc_91></location>
<caption>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
@ -80,6 +76,7 @@
<row_5><col_0><row_header>Combined(**)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>500k</col_3><col_4><body>PNG</col_4></row_5>
<row_6><col_0><row_header>SynthTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>600k</col_3><col_4><body>PNG</col_4></row_6>
</table>
<caption><location><page_4><loc_50><loc_72><loc_89><loc_79></location>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
<paragraph><location><page_4><loc_50><loc_63><loc_89><loc_68></location>one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.</paragraph>
<paragraph><location><page_4><loc_52><loc_61><loc_89><loc_62></location>Tab. 1 summarizes the various attributes of the datasets.</paragraph>
<subtitle-level-1><location><page_4><loc_50><loc_58><loc_73><loc_59></location>4. The TableFormer model</subtitle-level-1>
@ -87,12 +84,10 @@
<subtitle-level-1><location><page_4><loc_50><loc_41><loc_69><loc_42></location>4.1. Model architecture.</subtitle-level-1>
<paragraph><location><page_4><loc_50><loc_16><loc_89><loc_40></location>We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.</paragraph>
<paragraph><location><page_4><loc_50><loc_10><loc_89><loc_16></location>CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-</paragraph>
<caption><location><page_5><loc_8><loc_72><loc_89><loc_74></location>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
<figure>
<location><page_5><loc_12><loc_77><loc_85><loc_90></location>
<caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
</figure>
<caption><location><page_5><loc_8><loc_14><loc_47><loc_33></location>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
<figure>
<location><page_5><loc_9><loc_36><loc_47><loc_67></location>
<caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
@ -110,8 +105,7 @@
<subtitle-level-1><location><page_6><loc_8><loc_28><loc_28><loc_30></location>5. Experimental Results</subtitle-level-1>
<subtitle-level-1><location><page_6><loc_8><loc_26><loc_29><loc_27></location>5.1. Implementation Details</subtitle-level-1>
<paragraph><location><page_6><loc_8><loc_19><loc_47><loc_25></location>TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:</paragraph>
<paragraph><location><page_6><loc_8><loc_10><loc_47><loc_13></location>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved</paragraph>
<paragraph><location><page_6><loc_50><loc_86><loc_89><loc_91></location>runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</paragraph>
<paragraph><location><page_6><loc_8><loc_10><loc_47><loc_13></location><location><page_6><loc_8><loc_10><loc_47><loc_13></location>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</paragraph>
<paragraph><location><page_6><loc_50><loc_59><loc_89><loc_85></location>The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.</paragraph>
<paragraph><location><page_6><loc_50><loc_46><loc_89><loc_58></location>For training, TableFormer is trained with 3 Adam optimizers, each one for the CNN Backbone Network , Structure Decoder , and Cell BBox Decoder . Taking the PubTabNet as an example for our parameter set up, the initializing learning rate is 0.001 for 12 epochs with a batch size of 24, and λ set to 0.5. Afterwards, we reduce the learning rate to 0.0001, the batch size to 18 and train for 12 more epochs or convergence.</paragraph>
<paragraph><location><page_6><loc_50><loc_30><loc_89><loc_45></location>TableFormer is implemented with PyTorch and Torchvision libraries [22]. To speed up the inference, the image undergoes a single forward pass through the CNN Backbone Network and transformer encoder. This eliminates the overhead of generating the same features for each decoding step. Similarly, we employ a 'caching' technique to preform faster autoregressive decoding. This is achieved by storing the features of decoded tokens so we can reuse them for each time step. Therefore, we only compute the attention for each new tag.</paragraph>
@ -123,10 +117,8 @@
<paragraph><location><page_7><loc_8><loc_73><loc_47><loc_77></location>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</paragraph>
<subtitle-level-1><location><page_7><loc_8><loc_70><loc_28><loc_72></location>5.4. Quantitative Analysis</subtitle-level-1>
<paragraph><location><page_7><loc_8><loc_50><loc_47><loc_69></location>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</paragraph>
<caption><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
<table>
<location><page_7><loc_9><loc_26><loc_46><loc_48></location>
<caption>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
<row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>Simple</col_2><col_3><col_header>TEDS Complex</col_3><col_4><col_header>All</col_4></row_0>
<row_1><col_0><row_header>EDD</col_0><col_1><body>PTN</col_1><col_2><body>91.1</col_2><col_3><body>88.7</col_3><col_4><body>89.9</col_4></row_1>
<row_2><col_0><row_header>GTE</col_0><col_1><body>PTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>93.01</col_4></row_2>
@ -139,20 +131,19 @@
<row_9><col_0><row_header>TableFormer</col_0><col_1><body>TB</col_1><col_2><body>89.6</col_2><col_3><body>-</col_3><col_4><body>89.6</col_4></row_9>
<row_10><col_0><row_header>TableFormer</col_0><col_1><body>STN</col_1><col_2><body>96.9</col_2><col_3><body>95.7</col_3><col_4><body>96.7</col_4></row_10>
</table>
<paragraph><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</paragraph>
<paragraph><location><page_7><loc_8><loc_21><loc_43><loc_22></location>FT: Model was trained on PubTabNet then finetuned.</paragraph>
<paragraph><location><page_7><loc_8><loc_10><loc_47><loc_19></location>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate</paragraph>
<paragraph><location><page_7><loc_50><loc_71><loc_89><loc_91></location>our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</paragraph>
<caption><location><page_7><loc_50><loc_57><loc_89><loc_60></location>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
<paragraph><location><page_7><loc_8><loc_10><loc_47><loc_19></location><location><page_7><loc_8><loc_10><loc_47><loc_19></location>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</paragraph>
<table>
<location><page_7><loc_50><loc_62><loc_87><loc_69></location>
<caption>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
<row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>mAP</col_2><col_3><col_header>mAP (PP)</col_3></row_0>
<row_1><col_0><body>EDD+BBox</col_0><col_1><body>PubTabNet</col_1><col_2><body>79.2</col_2><col_3><body>82.7</col_3></row_1>
<row_2><col_0><body>TableFormer</col_0><col_1><body>PubTabNet</col_1><col_2><body>82.1</col_2><col_3><body>86.8</col_3></row_2>
<row_3><col_0><body>TableFormer</col_0><col_1><body>SynthTabNet</col_1><col_2><body>87.7</col_2><col_3><body>-</col_3></row_3>
<row_1><col_0><row_header>EDD+BBox</col_0><col_1><body>PubTabNet</col_1><col_2><body>79.2</col_2><col_3><body>82.7</col_3></row_1>
<row_2><col_0><row_header>TableFormer</col_0><col_1><body>PubTabNet</col_1><col_2><body>82.1</col_2><col_3><body>86.8</col_3></row_2>
<row_3><col_0><row_header>TableFormer</col_0><col_1><body>SynthTabNet</col_1><col_2><body>87.7</col_2><col_3><body>-</col_3></row_3>
</table>
<caption><location><page_7><loc_50><loc_57><loc_89><loc_60></location>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
<paragraph><location><page_7><loc_50><loc_34><loc_89><loc_54></location>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.</paragraph>
<caption><location><page_7><loc_50><loc_13><loc_89><loc_17></location>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
<table>
<location><page_7><loc_54><loc_19><loc_85><loc_32></location>
<caption>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
@ -164,6 +155,7 @@
<row_5><col_0><row_header>EDD</col_0><col_1><body>91.2</col_1><col_2><body>85.4</col_2><col_3><body>88.3</col_3></row_5>
<row_6><col_0><row_header>TableFormer</col_0><col_1><body>95.4</col_1><col_2><body>90.1</col_2><col_3><body>93.6</col_3></row_6>
</table>
<caption><location><page_7><loc_50><loc_13><loc_89><loc_17></location>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
<paragraph><location><page_8><loc_9><loc_89><loc_10><loc_90></location>- a.</paragraph>
<paragraph><location><page_8><loc_11><loc_89><loc_82><loc_90></location>- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</paragraph>
<subtitle-level-1><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</subtitle-level-1>
@ -171,13 +163,13 @@
<figure>
<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
</figure>
<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
<figure>
<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
</figure>
<table>
<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
<caption>Text is aligned to match original for ease of viewing</caption>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
@ -187,12 +179,11 @@
<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
<row_9><col_0><row_header>計</col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
</table>
<caption><location><page_8><loc_62><loc_62><loc_90><loc_63></location>Text is aligned to match original for ease of viewing</caption>
<table>
<location><page_8><loc_50><loc_64><loc_90><loc_72></location>
<caption>Text is aligned to match original for ease of viewing</caption>
<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
@ -201,38 +192,36 @@
<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
</table>
<caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
<figure>
<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
</figure>
<figure>
<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
<caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
</figure>
<figure>
<location><page_8><loc_63><loc_44><loc_89><loc_52></location>
</figure>
<caption><location><page_8><loc_10><loc_41><loc_87><loc_42></location>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
<figure>
<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
</figure>
<subtitle-level-1><location><page_8><loc_8><loc_37><loc_27><loc_38></location>5.5. Qualitative Analysis</subtitle-level-1>
<paragraph><location><page_8><loc_8><loc_10><loc_47><loc_32></location>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</paragraph>
<subtitle-level-1><location><page_8><loc_50><loc_37><loc_75><loc_38></location>6. Future Work & Conclusion</subtitle-level-1>
<paragraph><location><page_8><loc_8><loc_10><loc_47><loc_32></location>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</paragraph>
<paragraph><location><page_8><loc_50><loc_18><loc_89><loc_35></location>In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.</paragraph>
<subtitle-level-1><location><page_8><loc_50><loc_14><loc_60><loc_15></location>References</subtitle-level-1>
<paragraph><location><page_8><loc_51><loc_10><loc_89><loc_12></location>- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</paragraph>
<paragraph><location><page_9><loc_11><loc_85><loc_47><loc_90></location>- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</paragraph>
<paragraph><location><page_9><loc_9><loc_81><loc_47><loc_85></location>- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</paragraph>
<paragraph><location><page_9><loc_9><loc_77><loc_47><loc_81></location>- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_71><loc_47><loc_76></location>- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_71><loc_47><loc_76></location>- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_66><loc_47><loc_71></location>- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_60><loc_47><loc_65></location>- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_60><loc_47><loc_65></location>- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_56><loc_47><loc_60></location>- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</paragraph>
<paragraph><location><page_9><loc_9><loc_49><loc_47><loc_56></location>- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</paragraph>
<paragraph><location><page_9><loc_9><loc_45><loc_47><loc_49></location>- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</paragraph>
<paragraph><location><page_9><loc_8><loc_39><loc_47><loc_44></location>- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</paragraph>
<paragraph><location><page_9><loc_8><loc_32><loc_47><loc_39></location>- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</paragraph>
<paragraph><location><page_9><loc_8><loc_25><loc_47><loc_32></location>- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</paragraph>
<paragraph><location><page_9><loc_8><loc_18><loc_47><loc_25></location>- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</paragraph>
<paragraph><location><page_9><loc_8><loc_18><loc_47><loc_25></location>- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</paragraph>
<paragraph><location><page_9><loc_8><loc_14><loc_47><loc_18></location>- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</paragraph>
<paragraph><location><page_9><loc_8><loc_10><loc_47><loc_14></location>- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</paragraph>
<paragraph><location><page_9><loc_50><loc_82><loc_89><loc_90></location>- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</paragraph>
@ -241,7 +230,7 @@
<paragraph><location><page_9><loc_50><loc_59><loc_89><loc_67></location>- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</paragraph>
<paragraph><location><page_9><loc_50><loc_53><loc_89><loc_58></location>- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</paragraph>
<paragraph><location><page_9><loc_50><loc_45><loc_89><loc_53></location>- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</paragraph>
<paragraph><location><page_9><loc_50><loc_30><loc_89><loc_44></location>- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</paragraph>
<paragraph><location><page_9><loc_50><loc_30><loc_89><loc_44></location>- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</paragraph>
<paragraph><location><page_9><loc_50><loc_21><loc_89><loc_29></location>- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</paragraph>
<paragraph><location><page_9><loc_50><loc_16><loc_89><loc_21></location>- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</paragraph>
<paragraph><location><page_9><loc_50><loc_10><loc_89><loc_15></location>- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</paragraph>
@ -267,8 +256,7 @@
<paragraph><location><page_11><loc_8><loc_21><loc_47><loc_51></location>We have developed a technique that tries to derive a missing bounding box out of its neighbors. As a first step, we use the annotation data to generate the most fine-grained grid that covers the table structure. In case of strict HTML tables, all grid squares are associated with some table cell and in the presence of table spans a cell extends across multiple grid squares. When enough bounding boxes are known for a rectangular table, it is possible to compute the geometrical border lines between the grid rows and columns. Eventually this information is used to generate the missing bounding boxes. Additionally, the existence of unused grid squares indicates that the table rows have unequal number of columns and the overall structure is non-strict. The generation of missing bounding boxes for non-strict HTML tables is ambiguous and therefore quite challenging. Thus, we have decided to simply discard those tables. In case of PubTabNet we have computed missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</paragraph>
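A minimal sketch of the grid-completion idea described above, assuming each known cell carries its (row, column) grid indices and an (x0, y0, x1, y1) bounding box, with spanning cells ignored for brevity (an illustration, not the paper's code):

```python
# Derive missing cell bounding boxes from the fine-grained grid:
# estimate every row/column border as the median of the known edges,
# then fill each missing square from those borders.
from statistics import median
from typing import Dict, Optional, Tuple

BBox = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def fill_missing_bboxes(
    grid: Dict[Tuple[int, int], Optional[BBox]], n_rows: int, n_cols: int
) -> Dict[Tuple[int, int], Optional[BBox]]:
    col_x0 = {c: [] for c in range(n_cols)}
    col_x1 = {c: [] for c in range(n_cols)}
    row_y0 = {r: [] for r in range(n_rows)}
    row_y1 = {r: [] for r in range(n_rows)}
    for (r, c), bbox in grid.items():
        if bbox is None:
            continue
        x0, y0, x1, y1 = bbox
        col_x0[c].append(x0)
        col_x1[c].append(x1)
        row_y0[r].append(y0)
        row_y1[r].append(y1)

    filled = dict(grid)
    for (r, c), bbox in grid.items():
        # Only fill a square whose row and column borders are both known.
        if bbox is None and col_x0[c] and row_y0[r]:
            filled[(r, c)] = (
                median(col_x0[c]),
                median(row_y0[r]),
                median(col_x1[c]),
                median(row_y1[r]),
            )
    return filled
```

Grid squares whose row or column has no known box are simply left unfilled, mirroring the decision above to discard ambiguous non-strict tables.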
<paragraph><location><page_11><loc_8><loc_18><loc_47><loc_20></location>Figure 7 illustrates the distribution of the tables across different dimensions per dataset.</paragraph>
<subtitle-level-1><location><page_11><loc_8><loc_15><loc_25><loc_16></location>1.2. Synthetic datasets</subtitle-level-1>
<paragraph><location><page_11><loc_8><loc_10><loc_47><loc_14></location>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-</paragraph>
<paragraph><location><page_11><loc_50><loc_74><loc_89><loc_79></location>ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</paragraph>
<paragraph><location><page_11><loc_8><loc_10><loc_47><loc_14></location><location><page_11><loc_8><loc_10><loc_47><loc_14></location>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</paragraph>
<paragraph><location><page_11><loc_50><loc_71><loc_89><loc_73></location>The process of generating a synthetic dataset can be decomposed into the following steps:</paragraph>
<paragraph><location><page_11><loc_50><loc_60><loc_89><loc_70></location>- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</paragraph>
<paragraph><location><page_11><loc_50><loc_43><loc_89><loc_60></location>- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</paragraph>
@ -277,13 +265,13 @@
<paragraph><location><page_11><loc_50><loc_23><loc_89><loc_31></location>- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.</paragraph>
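As an illustration of the rendering step (step 5), a headless browser can report one bounding box per cell directly from the DOM. The sketch below uses Playwright as an example engine and skips the batching optimization; the tooling choice is an assumption, not a statement about the setup actually used:

```python
# Render a synthetic HTML table headlessly and read back per-cell boxes.
from playwright.sync_api import sync_playwright

html = (
    "<table><tr><td>a</td><td>b</td></tr>"
    "<tr><td>c</td><td>d</td></tr></table>"
)

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.set_content(html)
    # One (x, y, width, height) box per <td>, in CSS pixels.
    boxes = page.eval_on_selector_all(
        "td",
        "els => els.map(e => { const r = e.getBoundingClientRect();"
        " return [r.x, r.y, r.width, r.height]; })",
    )
    browser.close()

print(boxes)
```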
<subtitle-level-1><location><page_11><loc_50><loc_18><loc_89><loc_21></location>2. Prediction post-processing for PDF documents</subtitle-level-1>
<paragraph><location><page_11><loc_50><loc_10><loc_89><loc_17></location>Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due to the following reasons:</paragraph>
<caption><location><page_12><loc_8><loc_76><loc_89><loc_79></location>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
<figure>
<location><page_12><loc_9><loc_81><loc_89><loc_91></location>
<caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
</figure>
<paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>- · TableFormer output does not include the table cell content.</paragraph>
<paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>- · There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
<paragraph><location><page_12><loc_8><loc_50><loc_47><loc_65></location>However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.</paragraph>
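The association sketched in this paragraph amounts to scoring every (PDF cell, predicted cell) pair by overlap and keeping the best match per PDF cell. A minimal greedy IOU-based sketch is shown below; the data layout and the 0.5 threshold are illustrative assumptions rather than the actual post-processing code, whose step-by-step description follows.

```python
# Greedy assignment of PDF cell text to predicted table cells by IOU.
from typing import Dict, List, Tuple

BBox = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def iou(a: BBox, b: BBox) -> float:
    ix0, iy0 = max(a[0], b[0]), max(a[1], b[1])
    ix1, iy1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    union = (
        (a[2] - a[0]) * (a[3] - a[1])
        + (b[2] - b[0]) * (b[3] - b[1])
        - inter
    )
    return inter / union if union > 0 else 0.0


def match_pdf_cells(
    pdf_cells: List[Tuple[str, BBox]],
    pred_cells: List[BBox],
    threshold: float = 0.5,
) -> Dict[int, List[str]]:
    """Attach each PDF cell's text to the best-overlapping predicted cell."""
    if not pred_cells:
        return {}
    matches: Dict[int, List[str]] = {i: [] for i in range(len(pred_cells))}
    for text, pdf_bbox in pdf_cells:
        scores = [iou(pdf_bbox, p) for p in pred_cells]
        best = max(range(len(pred_cells)), key=lambda i: scores[i])
        if scores[best] >= threshold:
            matches[best].append(text)
    return matches
```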
<paragraph><location><page_12><loc_8><loc_47><loc_47><loc_50></location>Here is a step-by-step description of the prediction postprocessing:</paragraph>
<paragraph><location><page_12><loc_8><loc_42><loc_47><loc_47></location>- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</paragraph>
@ -293,7 +281,6 @@
<paragraph><location><page_12><loc_8><loc_24><loc_47><loc_28></location>- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:</paragraph>
<paragraph><location><page_12><loc_8><loc_13><loc_47><loc_16></location>where c is one of { left, centroid, right } and x$_{c}$ is the x-coordinate for the corresponding point.</paragraph>
<paragraph><location><page_12><loc_8><loc_10><loc_47><loc_13></location>- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</paragraph>
<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
<paragraph><location><page_12><loc_50><loc_65><loc_89><loc_67></location>- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</paragraph>
<paragraph><location><page_12><loc_50><loc_51><loc_89><loc_64></location>- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</paragraph>
<paragraph><location><page_12><loc_50><loc_42><loc_89><loc_51></location>- 8. On some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results in two predicted columns pointing to the same PDF column. In such a case we must de-duplicate the columns according to the highest total column intersection score.</paragraph>
@ -315,14 +302,15 @@
<table>
<location><page_13><loc_14><loc_54><loc_39><loc_61></location>
</table>
<caption><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</caption>
<table>
<location><page_13><loc_14><loc_38><loc_41><loc_50></location>
<caption>Figure 8: Example of a table with multi-line header.</caption>
</table>
<caption><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</caption>
<table>
<location><page_13><loc_51><loc_83><loc_91><loc_87></location>
<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
</table>
<caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
<table>
<location><page_13><loc_51><loc_77><loc_91><loc_80></location>
</table>
@ -332,14 +320,14 @@
<figure>
<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
</figure>
<caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
<table>
<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
</table>
<table>
<location><page_13><loc_55><loc_45><loc_80><loc_51></location>
<caption>Figure 10: Example of a complex table with empty cells.</caption>
</table>
<caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
<table>
<location><page_13><loc_55><loc_37><loc_80><loc_43></location>
</table>
@ -349,19 +337,16 @@
<figure>
<location><page_13><loc_55><loc_16><loc_85><loc_25></location>
</figure>
<caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
<table>
<location><page_13><loc_55><loc_16><loc_85><loc_25></location>
<caption>Figure 10: Example of a complex table with empty cells.</caption>
</table>
<table>
<location><page_14><loc_8><loc_57><loc_46><loc_65></location>
</table>
<caption><location><page_14><loc_8><loc_52><loc_47><loc_55></location>Figure 11: Simple table with different style and empty cells.</caption>
<figure>
<location><page_14><loc_8><loc_56><loc_46><loc_87></location>
<caption>Figure 11: Simple table with different style and empty cells.</caption>
</figure>
<caption><location><page_14><loc_8><loc_52><loc_47><loc_55></location>Figure 11: Simple table with different style and empty cells.</caption>
<table>
<location><page_14><loc_8><loc_38><loc_51><loc_43></location>
</table>
@ -371,11 +356,10 @@
<table>
<location><page_14><loc_8><loc_25><loc_51><loc_30></location>
</table>
<caption><location><page_14><loc_9><loc_14><loc_46><loc_15></location>Figure 12: Simple table predictions and post processing.</caption>
<figure>
<location><page_14><loc_8><loc_17><loc_29><loc_23></location>
<caption>Figure 12: Simple table predictions and post processing.</caption>
</figure>
<caption><location><page_14><loc_9><loc_14><loc_46><loc_15></location>Figure 12: Simple table predictions and post processing.</caption>
<table>
<location><page_14><loc_52><loc_73><loc_87><loc_80></location>
</table>
@ -385,24 +369,23 @@
<table>
<location><page_14><loc_54><loc_55><loc_86><loc_64></location>
</table>
<caption><location><page_14><loc_52><loc_52><loc_88><loc_53></location>Figure 13: Table predictions example on colorful table.</caption>
<figure>
<location><page_14><loc_52><loc_55><loc_87><loc_89></location>
<caption>Figure 13: Table predictions example on colorful table.</caption>
</figure>
<table>
<location><page_14><loc_52><loc_40><loc_85><loc_46></location>
<caption>Figure 14: Example with multi-line text.</caption>
</table>
<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
<table>
<location><page_14><loc_52><loc_32><loc_85><loc_38></location>
</table>
<table>
<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
</table>
<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
<table>
<location><page_14><loc_52><loc_16><loc_87><loc_23></location>
<caption>Figure 14: Example with multi-line text.</caption>
</table>
<figure>
<location><page_15><loc_9><loc_69><loc_46><loc_83></location>
@ -422,14 +405,11 @@
<figure>
<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
</figure>
<caption><location><page_15><loc_14><loc_18><loc_41><loc_19></location>Figure 15: Example with triangular table.</caption>
<table>
<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
<caption>Figure 15: Example with triangular table.</caption>
</table>
<table>
<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
<caption>Figure 15: Example with triangular table.</caption>
</table>
<caption><location><page_15><loc_14><loc_18><loc_41><loc_19></location>Figure 15: Example with triangular table.</caption>
<table>
<location><page_15><loc_53><loc_57><loc_86><loc_69></location>
</table>
@ -442,12 +422,13 @@
<figure>
<location><page_15><loc_58><loc_20><loc_81><loc_38></location>
</figure>
<caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
<table>
<location><page_15><loc_58><loc_20><loc_81><loc_38></location>
<caption>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
</table>
<caption><location><page_16><loc_8><loc_33><loc_89><loc_36></location>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
<table>
<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
</table>
<caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
<figure>
<location><page_16><loc_11><loc_37><loc_86><loc_68></location>
<caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>

File diff suppressed because one or more lines are too long

View File

@ -8,32 +8,29 @@
## a. Picture of a table:
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
## 1. Introduction
The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex column- and row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.
<!-- image -->
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
- b. Red-annotation of bounding boxes, Blue-predictions by TableFormer
<!-- image -->
- c. Structure predicted by TableFormer:
<!-- image -->
Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.
<!-- image -->
| 0 | 1 | 1 | 2 1 | 2 1 | |
|-----|-----|-----|-------|-------|----|
| 3 | 4 | 5 3 | 6 | 7 | |
| 8 | 9 | 10 | 11 | 12 | 2 |
| | 13 | 14 | 15 | 16 | 2 |
| | 17 | 18 | 19 | 20 | 2 |
| 0 | 1 2 1 | 1 2 1 | 1 2 1 | 1 2 1 |
|-----|---------|---------|---------|---------|
| 3 | 4 3 | 5 | 6 | 7 |
| 8 2 | 9 | 10 | 11 | 12 |
| 13 | | 14 | 15 | 16 |
| 17 | 18 | | 19 | 20 |
Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.
@ -155,9 +152,7 @@ where λ ∈ [0, 1], and λ$_{iou}$, λ$_{l}$$_{1}$ ∈$_{R}$ are hyper-paramete
TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:
Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved
runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.
Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.
The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.
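For orientation, the configuration described above maps onto stock PyTorch modules roughly as follows. This is only a sketch of the stated hyper-parameters (layer counts, feature size 512, feed-forward 1024, 4 heads, dropout 0.5), not the authors' implementation; note that a stock ResNet-18 yields a 14x14 feature map for a 448x448 input, so the reported 28x28 map implies a modified backbone stride.

```python
# Sketch of the described encoder/decoder configuration with stock PyTorch.
import torch
import torch.nn as nn
from torchvision.models import resnet18

backbone = nn.Sequential(*list(resnet18(weights=None).children())[:-2])

enc_layer = nn.TransformerEncoderLayer(
    d_model=512, nhead=4, dim_feedforward=1024, dropout=0.5, batch_first=True
)
encoder = nn.TransformerEncoder(enc_layer, num_layers=2)

dec_layer = nn.TransformerDecoderLayer(
    d_model=512, nhead=4, dim_feedforward=1024, dropout=0.5, batch_first=True
)
structure_decoder = nn.TransformerDecoder(dec_layer, num_layers=4)

img = torch.randn(1, 3, 448, 448)
feat = backbone(img)                                # (1, 512, 14, 14) with stock strides
memory = encoder(feat.flatten(2).transpose(1, 2))   # (1, 196, 512)
tgt = torch.zeros(1, 10, 512)                       # placeholder structure-tag embeddings
out = structure_decoder(tgt, memory)                # (1, 10, 512)
```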
@ -181,8 +176,6 @@ where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDi
Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.
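For reference, the TEDS score used throughout Tab. 2 is the standard tree-edit-distance-based similarity of Zhong et al., recalled here for convenience (not quoted from this paper):

TEDS(T$_{a}$, T$_{b}$) = 1 - EditDist(T$_{a}$, T$_{b}$) / max(|T$_{a}$|, |T$_{b}$|)

where |T| denotes the number of nodes in the HTML tree, so a score of 1 means the predicted structure matches the ground truth exactly.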
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
| Model | Dataset | Simple | TEDS Complex | All |
|-------------|-----------|----------|----------------|-------|
| EDD | PTN | 91.1 | 88.7 | 89.9 |
@ -196,11 +189,11 @@ Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) a
| TableFormer | TB | 89.6 | - | 89.6 |
| TableFormer | STN | 96.9 | 95.7 | 96.7 |
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
FT: Model was trained on PubTabNet then finetuned.
Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate
our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.
Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.
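A hedged sketch of such a single-class mAP evaluation with torchmetrics is shown below; the box values, the lone 'content' label, and the single 0.5 IOU threshold are illustrative assumptions rather than the paper's evaluation setup:

```python
# Evaluate predicted cell boxes against ground truth with mAP@0.5.
import torch
from torchmetrics.detection import MeanAveragePrecision

metric = MeanAveragePrecision(iou_type="bbox", iou_thresholds=[0.5])
preds = [{
    "boxes": torch.tensor([[10.0, 10.0, 120.0, 40.0]]),  # predicted cell
    "scores": torch.tensor([0.9]),
    "labels": torch.tensor([0]),                          # class 0 = "content"
}]
target = [{
    "boxes": torch.tensor([[12.0, 11.0, 118.0, 42.0]]),   # ground-truth cell
    "labels": torch.tensor([0]),
}]
metric.update(preds, target)
print(metric.compute()["map"])
```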
Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.
@ -236,6 +229,8 @@ Table 4: Results of structure with content retrieved using cell detection on Pub
b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
<!-- image -->
Text is aligned to match original for ease of viewing
| | | 論文ファイル | 論文ファイル | 参考文献 | 参考文献 |
|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
| 出典 | ファイル 数 | 英語 | 日本語 | 英語 | 日本語 |
@ -246,9 +241,7 @@ b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
| 第 17 回人工知能学会全国大会 (2003) | 208 | 5 | 203 | 152 | 244 |
| 自然言語処理研究会第 146 〜 155 回 | 98 | 2 | 96 | 150 | 232 |
| WWW から収集した論文 | 107 | 73 | 34 | 147 | 96 |
| | 945 | 294 | 651 | 1122 | 955 |
Text is aligned to match original for ease of viewing
| 計 | 945 | 294 | 651 | 1122 | 955 |
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
@ -259,20 +252,20 @@ Text is aligned to match original for ease of viewing
| Canceled or forfeited | (0. 1 ) | - | 102.01 | 92.18 |
| Nonvested on December 31 | 1.0 | 0.3 | 104.85 $ | $ 104.51 |
Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
<!-- image -->
Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
<!-- image -->
<!-- image -->
Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
<!-- image -->
## 5.5. Qualitative Analysis
We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.
## 6. Future Work & Conclusion
We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.
In this paper, we presented TableFormer, an end-to-end transformer-based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents and languages. Furthermore, our method outperforms all state-of-the-art methods by a wide margin. Finally, we introduce "SynthTabNet", a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.
## References
@ -285,11 +278,11 @@ In this paper, we presented TableFormer an end-to-end transformer based approach
- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2
- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2
- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2
@ -303,7 +296,7 @@ In this paper, we presented TableFormer an end-to-end transformer based approach
- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2
- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2
@ -321,7 +314,7 @@ In this paper, we presented TableFormer an end-to-end transformer based approach
- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1
- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1
@ -373,9 +366,7 @@ Figure 7 illustrates the distribution of the tables across different dimensions
## 1.2. Synthetic datasets
Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-
ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).
Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).
The process of generating a synthetic dataset can be decomposed into the following steps:
@ -400,6 +391,8 @@ Figure 7: Distribution of the tables across different dimensions per dataset. Si
- · There are occasional inaccuracies in the predictions of the bounding boxes.
dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.
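Step 7 of the procedure below replaces plain IOU with an overlap score normalized by the PDF cell area, so that a small PDF cell fully covered by a larger corrected cell still scores 1.0. A minimal sketch of that score, assuming (x0, y0, x1, y1) boxes (an illustration, not the project's code):

```python
# Modified overlap: intersection area divided by the PDF cell's own area.
from typing import Tuple

BBox = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def overlap_over_pdf_area(pred: BBox, pdf: BBox) -> float:
    ix0, iy0 = max(pred[0], pdf[0]), max(pred[1], pdf[1])
    ix1, iy1 = min(pred[2], pdf[2]), min(pred[3], pdf[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    pdf_area = (pdf[2] - pdf[0]) * (pdf[3] - pdf[1])
    return inter / pdf_area if pdf_area > 0 else 0.0
```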
Here is a step-by-step description of the prediction postprocessing:
@ -418,8 +411,6 @@ where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for t
- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-
dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.
- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
@ -446,10 +437,6 @@ Aditional images with examples of TableFormer predictions and post-processing ca
Figure 8: Example of a table with multi-line header.
<!-- image -->
Figure 9: Example of a table with big empty distance between cells.
@ -460,11 +447,15 @@ Figure 10: Example of a complex table with empty cells.
<!-- image -->
<!-- image -->
Figure 11: Simple table with different style and empty cells.
<!-- image -->
Figure 12: Simple table predictions and post processing.
<!-- image -->
Figure 13: Table predictions example on colorful table.
<!-- image -->
@ -491,7 +482,5 @@ Figure 15: Example with triangular table.
Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.
<!-- image -->

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff