fix(html): preserve code blocks in list items (#2131)

* chore(html): refactor parser to leverage context managers Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): parse inline code snippets, also from list items Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(html): remove hidden tags Remove tags that are not meant to be displayed. Add regression tests for code blocks, inline code, and hidden tags. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-08-26 06:43:48 +02:00
parent c0268416cf
commit fa3327e1a6
5 changed files with 950 additions and 76 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,6 @@
 import logging
 import re
 import traceback
 from contextlib import contextmanager
 from copy import deepcopy
 from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
    "h4",
    "h5",
    "h6",
    "ol",
    "p",
    "pre",
    "code",
    "ul",
    "ol",
    "summary",
    "table",
    "ul",
 }
 _CODE_TAG_SET: Final = {"code", "kbd", "samp"}
 _FORMAT_TAG_MAP: Final = {
    "b": {"bold": True},
    "strong": {"bold": True},
    "i": {"italic": True},
    "em": {"italic": True},
    "var": {"italic": True},
    # "mark",
    # "small",
    "s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
    "ins": {"underline": True},
    "sub": {"script": Script.SUB},
    "sup": {"script": Script.SUPER},
    **{k: {} for k in _CODE_TAG_SET},
 }
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
    text: str
    hyperlink: Union[AnyUrl, Path, None] = None
    formatting: Union[Formatting, None] = None
    code: bool = False
 class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
        current_h = None
        current_text = ""
        current_f = None
        current_code = False
        for at in self:
            t = at.text
            h = at.hyperlink
            f = at.formatting
            c = at.code
            current_text += t.strip() + " "
            if f is not None and current_f is None:
                current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
                _log.warning(
                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                )
            current_code = c if c else current_code
        return AnnotatedText(
-            text=current_text.strip(), hyperlink=current_h, formatting=current_f
+            text=current_text.strip(),
            hyperlink=current_h,
            formatting=current_f,
            code=current_code,
        )
    def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
        text = self[0].text
        hyperlink = self[0].hyperlink
        formatting = self[0].formatting
        code = self[0].code
        last_elm = text
        for i in range(1, len(self)):
-            if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
+            if (
                hyperlink == self[i].hyperlink
                and formatting == self[i].formatting
                and code == self[i].code
            ):
                sep = " "
                if not self[i].text.strip() or not last_elm.strip():
                    sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
                last_elm = self[i].text
            else:
                simplified.append(
-                    AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                    AnnotatedText(
                        text=text, hyperlink=hyperlink, formatting=formatting, code=code
                    )
                )
                text = self[i].text
                last_elm = text
                hyperlink = self[i].hyperlink
                formatting = self[i].formatting
                code = self[i].code
        if text:
            simplified.append(
-                AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                AnnotatedText(
                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
                )
            )
        return simplified
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self.ctx = _Context()
        for i in range(self.max_levels):
            self.parents[i] = None
-        self.hyperlink = None
+        self.hyperlink: Union[AnyUrl, Path, None] = None
        self.original_url = original_url
        self.format_tags: list[str] = []
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                orig=title_text,
                content_layer=ContentLayer.FURNITURE,
            )
-        # remove scripts/styles
+        # remove script and style tags
        for tag in self.soup(["script", "style"]):
            tag.decompose()
        # remove any hidden tag
        for tag in self.soup(hidden=True):
            tag.decompose()
        content = self.soup.body or self.soup
        # normalize <br> tags
        for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        def flush_buffer():
            if not buffer:
                return
-            annotated_text_list = buffer.simplify_text_elements()
+            annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
            parts = annotated_text_list.split_by_newline()
            buffer.clear()
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                return
            for annotated_text_list in parts:
-                with self.use_inline_group(annotated_text_list, doc):
+                with self._use_inline_group(annotated_text_list, doc):
                    for annotated_text in annotated_text_list:
                        if annotated_text.text.strip():
                            seg_clean = HTMLDocumentBackend._clean_unicode(
                                annotated_text.text.strip()
                            )
-                            doc.add_text(
+                            if annotated_text.code:
-                                parent=self.parents[self.level],
+                                doc.add_code(
-                                label=DocItemLabel.TEXT,
+                                    parent=self.parents[self.level],
-                                text=seg_clean,
+                                    text=seg_clean,
-                                content_layer=self.content_layer,
+                                    content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
+                                    formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
+                                    hyperlink=annotated_text.hyperlink,
-                            )
+                                )
                            else:
                                doc.add_text(
                                    parent=self.parents[self.level],
                                    label=DocItemLabel.TEXT,
                                    text=seg_clean,
                                    content_layer=self.content_layer,
                                    formatting=annotated_text.formatting,
                                    hyperlink=annotated_text.hyperlink,
                                )
        for node in element.contents:
            if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    flush_buffer()
                    self._emit_image(node, doc)
                elif name in _FORMAT_TAG_MAP:
-                    with self.use_format([name]):
+                    with self._use_format([name]):
                        self._walk(node, doc)
                elif name == "a":
-                    with self.use_hyperlink(node):
+                    with self._use_hyperlink(node):
                        self._walk(node, doc)
                elif name in _BLOCK_TAGS:
                    flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            this_parent = item.parent
            while this_parent is not None:
                if this_parent.name == "a" and this_parent.get("href"):
-                    with self.use_format(format_tags):
+                    with self._use_format(format_tags):
-                        with self.use_hyperlink(this_parent):
+                        with self._use_hyperlink(this_parent):
                            return self._extract_text_and_hyperlink_recursively(
                                item, ignore_list
                            )
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        if isinstance(item, NavigableString):
            text = item.strip()
            code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
            if text:
                return AnnotatedTextList(
                    [
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                            text=text,
                            hyperlink=self.hyperlink,
                            formatting=self._formatting,
                            code=code,
                        )
                    ]
                )
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                            text="\n",
                            hyperlink=self.hyperlink,
                            formatting=self._formatting,
                            code=code,
                        )
                    ]
                )
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        if not ignore_list or (tag.name not in ["ul", "ol"]):
            for child in tag:
                if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
-                    with self.use_format([child.name]):
+                    with self._use_format([child.name]):
                        result.extend(
                            self._extract_text_and_hyperlink_recursively(
                                child, ignore_list, keep_newlines=keep_newlines
                            )
                        )
                elif isinstance(child, Tag) and child.name == "a":
-                    with self.use_hyperlink(child):
+                    with self._use_hyperlink(child):
                        result.extend(
                            self._extract_text_and_hyperlink_recursively(
                                child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        return result
    @contextmanager
-    def use_hyperlink(self, tag):
+    def _use_hyperlink(self, tag: Tag):
        this_href = tag.get("href")
        if this_href is None:
            yield None
        else:
-            if this_href:
+            if isinstance(this_href, str) and this_href:
-                old_hyperlink = self.hyperlink
+                old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
                new_hyperlink: Union[AnyUrl, Path, None] = None
                if self.original_url is not None:
-                    this_href = urljoin(self.original_url, this_href)
+                    this_href = urljoin(str(self.original_url), str(this_href))
                # ugly fix for relative links since pydantic does not support them.
                try:
-                    AnyUrl(this_href)
+                    new_hyperlink = AnyUrl(this_href)
                except ValidationError:
-                    this_href = Path(this_href)
+                    new_hyperlink = Path(this_href)
-                self.hyperlink = this_href
+                self.hyperlink = new_hyperlink
            try:
                yield None
            finally:
-                if this_href:
+                if new_hyperlink:
                    self.hyperlink = old_hyperlink
    @contextmanager
-    def use_format(self, tags: list[str]):
+    def _use_format(self, tags: list[str]):
        if not tags:
            yield None
        else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                self.format_tags = self.format_tags[: -len(tags)]
    @contextmanager
-    def use_inline_group(
+    def _use_inline_group(
        self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
    ):
        """Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        Args:
            annotated_text_list (AnnotatedTextList): Annotated text
            doc (DoclingDocument): Currently used document
        Yields:
            None: _description_
        """
        if len(annotated_text_list) > 1:
            inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        else:
            yield None
    @contextmanager
    def _use_details(self, tag: Tag, doc: DoclingDocument):
        """Create a group with the content of a details tag.
        A section group named after the tag is added under the current parent.
        While the context manager is active, the hierarchy level is set one
        level higher than the current parent, and it is decremented again on
        exit, even if the body raises.
        Args:
            tag: The details tag.
            doc: Currently used document.
        """
        # Register the new group in the slot right above the current level.
        self.parents[self.level + 1] = doc.add_group(
            name=tag.name,
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.level += 1
        try:
            yield None
        finally:
            # NOTE(review): self.level is still the incremented value here, so
            # this clears the slot above the group created on entry rather than
            # the group's own slot -- confirm this is intended.
            self.parents[self.level + 1] = None
            self.level -= 1
    @contextmanager
    def _use_footer(self, tag: Tag, doc: DoclingDocument):
        """Create a group with a footer.
        Create a group with the content of a footer tag. While the context manager
        is active, the content layer is switched to furniture and the hierarchy
        level is set one level higher than the current parent. Both are restored
        on exit, even if the body raises.
        Args:
            tag: The footer tag.
            doc: Currently used document.
        """
        # Remember the active layer so it can be restored once the footer is done.
        current_layer = self.content_layer
        self.content_layer = ContentLayer.FURNITURE
        # The group itself is created with the furniture layer set just above.
        self.parents[self.level + 1] = doc.add_group(
            name=tag.name,
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.level += 1
        try:
            yield None
        finally:
            # NOTE(review): self.level is still the incremented value here, so
            # this clears the slot above the group created on entry rather than
            # the group's own slot -- confirm this is intended.
            self.parents[self.level + 1] = None
            self.level -= 1
            self.content_layer = current_layer
    def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
        tag_name = tag.name.lower()
        # set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                            content_layer=self.content_layer,
                        )
                        self.level += 1
-                        with self.use_inline_group(min_parts, doc):
+                        with self._use_inline_group(min_parts, doc):
                            for annotated_text in min_parts:
                                li_text = re.sub(
                                    r"\s+|\n+", " ", annotated_text.text
                                ).strip()
                                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
-                                doc.add_text(
+                                if annotated_text.code:
-                                    parent=self.parents[self.level],
+                                    doc.add_code(
-                                    label=DocItemLabel.TEXT,
+                                        parent=self.parents[self.level],
-                                    text=li_clean,
+                                        text=li_clean,
-                                    content_layer=self.content_layer,
+                                        content_layer=self.content_layer,
-                                    formatting=annotated_text.formatting,
+                                        formatting=annotated_text.formatting,
-                                    hyperlink=annotated_text.hyperlink,
+                                        hyperlink=annotated_text.hyperlink,
-                                )
+                                    )
                                else:
                                    doc.add_text(
                                        parent=self.parents[self.level],
                                        label=DocItemLabel.TEXT,
                                        text=li_clean,
                                        content_layer=self.content_layer,
                                        formatting=annotated_text.formatting,
                                        hyperlink=annotated_text.hyperlink,
                                    )
                        # 4) recurse into any nested lists, attaching them to this <li> item
                        for sublist in li({"ul", "ol"}, recursive=False):
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            text_list = self._extract_text_and_hyperlink_recursively(
                tag, find_parent_annotation=True
            )
-            annotated_texts = text_list.simplify_text_elements()
+            annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
            for part in annotated_texts.split_by_newline():
-                with self.use_inline_group(part, doc):
+                with self._use_inline_group(part, doc):
                    for annotated_text in part:
                        if seg := annotated_text.text.strip():
                            seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                            doc.add_text(
+                            if annotated_text.code:
-                                parent=self.parents[self.level],
+                                doc.add_code(
-                                label=DocItemLabel.TEXT,
+                                    parent=self.parents[self.level],
-                                text=seg_clean,
+                                    text=seg_clean,
-                                content_layer=self.content_layer,
+                                    content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
+                                    formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
+                                    hyperlink=annotated_text.hyperlink,
-                            )
+                                )
                            else:
                                doc.add_text(
                                    parent=self.parents[self.level],
                                    label=DocItemLabel.TEXT,
                                    text=seg_clean,
                                    content_layer=self.content_layer,
                                    formatting=annotated_text.formatting,
                                    hyperlink=annotated_text.hyperlink,
                                )
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    content_layer=self.content_layer,
                )
-        elif tag_name in {"pre", "code"}:
+        elif tag_name in {"pre"}:
            # handle monospace code snippets (pre).
            text_list = self._extract_text_and_hyperlink_recursively(
-                tag, find_parent_annotation=True
+                tag, find_parent_annotation=True, keep_newlines=True
            )
            annotated_texts = text_list.simplify_text_elements()
-            with self.use_inline_group(annotated_texts, doc):
+            with self._use_inline_group(annotated_texts, doc):
                for annotated_text in annotated_texts:
                    text_clean = HTMLDocumentBackend._clean_unicode(
                        annotated_text.text.strip()
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                        hyperlink=annotated_text.hyperlink,
                    )
-        elif tag_name in {"details", "footer"}:
+        elif tag_name == "footer":
-            if tag_name == "footer":
+            with self._use_footer(tag, doc):
-                current_layer = self.content_layer
+                self._walk(tag, doc)
-                self.content_layer = ContentLayer.FURNITURE
+
-            self.parents[self.level + 1] = doc.add_group(
+        elif tag_name == "details":
-                name=tag_name,
+            with self._use_details(tag, doc):
-                label=GroupLabel.SECTION,
+                self._walk(tag, doc)
                parent=self.parents[self.level],
                content_layer=self.content_layer,
            )
            self.level += 1
            self._walk(tag, doc)
            self.parents[self.level + 1] = None
            self.level -= 1
            if tag_name == "footer":
                self.content_layer = current_layer
    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
        figure = img_tag.find_parent("figure")
--- a/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
+++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
@@ -0,0 +1,39 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: title: Code snippets
    item-2 at level 2: inline: group group
      item-3 at level 3: text: The Pythagorean theorem can be w ... tion relating the lengths of the sides
      item-4 at level 3: text: a
      item-5 at level 3: text: ,
      item-6 at level 3: text: b
      item-7 at level 3: text: and the hypotenuse
      item-8 at level 3: text: c
      item-9 at level 3: text: .
    item-10 at level 2: inline: group group
      item-11 at level 3: text: To use Docling, simply install
      item-12 at level 3: code: docling
      item-13 at level 3: text: from your package manager, e.g. pip:
      item-14 at level 3: code: pip install docling
    item-15 at level 2: inline: group group
      item-16 at level 3: text: To convert individual documents with python, use
      item-17 at level 3: code: convert()
      item-18 at level 3: text: , for example:
    item-19 at level 2: code: from docling.document_converter  ... (result.document.export_to_markdown())
    item-20 at level 2: inline: group group
      item-21 at level 3: text: The program will output:
      item-22 at level 3: code: ## Docling Technical Report[...]
    item-23 at level 2: text: Prefetch the models:
    item-24 at level 2: list: group list
      item-25 at level 3: list_item: 
        item-26 at level 4: inline: group group
          item-27 at level 5: text: Use the
          item-28 at level 5: code: docling-tools models download
          item-29 at level 5: text: utility:
      item-30 at level 3: list_item: 
        item-31 at level 4: inline: group group
          item-32 at level 5: text: Alternatively, models can be programmatically downloaded using
          item-33 at level 5: code: docling.utils.model_downloader.download_models()
          item-34 at level 5: text: .
      item-35 at level 3: list_item: 
        item-36 at level 4: inline: group group
          item-37 at level 5: text: Also, you can use download-hf-re ... rom HuggingFace by specifying repo id:
          item-38 at level 5: code: $ docling-tools models download- ... 256M-preview model from HuggingFace...
--- a/tests/data/groundtruth/docling_v2/html_code_snippets.html.json
+++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json
@@ -0,0 +1,674 @@
 {
  "schema_name": "DoclingDocument",
  "version": "1.5.0",
  "name": "html_code_snippets",
  "origin": {
    "mimetype": "text/html",
    "binary_hash": 8468578485215893920,
    "filename": "html_code_snippets.html"
  },
  "furniture": {
    "self_ref": "#/furniture",
    "children": [],
    "content_layer": "furniture",
    "name": "_root_",
    "label": "unspecified"
  },
  "body": {
    "self_ref": "#/body",
    "children": [
      {
        "$ref": "#/texts/0"
      },
      {
        "$ref": "#/texts/1"
      }
    ],
    "content_layer": "body",
    "name": "_root_",
    "label": "unspecified"
  },
  "groups": [
    {
      "self_ref": "#/groups/0",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [
        {
          "$ref": "#/texts/2"
        },
        {
          "$ref": "#/texts/3"
        },
        {
          "$ref": "#/texts/4"
        },
        {
          "$ref": "#/texts/5"
        },
        {
          "$ref": "#/texts/6"
        },
        {
          "$ref": "#/texts/7"
        },
        {
          "$ref": "#/texts/8"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/1",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [
        {
          "$ref": "#/texts/9"
        },
        {
          "$ref": "#/texts/10"
        },
        {
          "$ref": "#/texts/11"
        },
        {
          "$ref": "#/texts/12"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/2",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [
        {
          "$ref": "#/texts/13"
        },
        {
          "$ref": "#/texts/14"
        },
        {
          "$ref": "#/texts/15"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/3",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [
        {
          "$ref": "#/texts/17"
        },
        {
          "$ref": "#/texts/18"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/4",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [
        {
          "$ref": "#/texts/20"
        },
        {
          "$ref": "#/texts/24"
        },
        {
          "$ref": "#/texts/28"
        }
      ],
      "content_layer": "body",
      "name": "list",
      "label": "list"
    },
    {
      "self_ref": "#/groups/5",
      "parent": {
        "$ref": "#/texts/20"
      },
      "children": [
        {
          "$ref": "#/texts/21"
        },
        {
          "$ref": "#/texts/22"
        },
        {
          "$ref": "#/texts/23"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/6",
      "parent": {
        "$ref": "#/texts/24"
      },
      "children": [
        {
          "$ref": "#/texts/25"
        },
        {
          "$ref": "#/texts/26"
        },
        {
          "$ref": "#/texts/27"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    },
    {
      "self_ref": "#/groups/7",
      "parent": {
        "$ref": "#/texts/28"
      },
      "children": [
        {
          "$ref": "#/texts/29"
        },
        {
          "$ref": "#/texts/30"
        }
      ],
      "content_layer": "body",
      "name": "group",
      "label": "inline"
    }
  ],
  "texts": [
    {
      "self_ref": "#/texts/0",
      "parent": {
        "$ref": "#/body"
      },
      "children": [],
      "content_layer": "furniture",
      "label": "title",
      "prov": [],
      "orig": "Code snippets in HTML",
      "text": "Code snippets in HTML"
    },
    {
      "self_ref": "#/texts/1",
      "parent": {
        "$ref": "#/body"
      },
      "children": [
        {
          "$ref": "#/groups/0"
        },
        {
          "$ref": "#/groups/1"
        },
        {
          "$ref": "#/groups/2"
        },
        {
          "$ref": "#/texts/16"
        },
        {
          "$ref": "#/groups/3"
        },
        {
          "$ref": "#/texts/19"
        },
        {
          "$ref": "#/groups/4"
        }
      ],
      "content_layer": "body",
      "label": "title",
      "prov": [],
      "orig": "Code snippets",
      "text": "Code snippets"
    },
    {
      "self_ref": "#/texts/2",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "The Pythagorean theorem can be written as an equation relating the lengths of the sides",
      "text": "The Pythagorean theorem can be written as an equation relating the lengths of the sides"
    },
    {
      "self_ref": "#/texts/3",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "a",
      "text": "a",
      "formatting": {
        "bold": false,
        "italic": true,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/4",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": ",",
      "text": ","
    },
    {
      "self_ref": "#/texts/5",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "b",
      "text": "b",
      "formatting": {
        "bold": false,
        "italic": true,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/6",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "and the hypotenuse",
      "text": "and the hypotenuse"
    },
    {
      "self_ref": "#/texts/7",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "c",
      "text": "c",
      "formatting": {
        "bold": false,
        "italic": true,
        "underline": false,
        "strikethrough": false,
        "script": "baseline"
      }
    },
    {
      "self_ref": "#/texts/8",
      "parent": {
        "$ref": "#/groups/0"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": ".",
      "text": "."
    },
    {
      "self_ref": "#/texts/9",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "To use Docling, simply install",
      "text": "To use Docling, simply install"
    },
    {
      "self_ref": "#/texts/10",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "docling",
      "text": "docling",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/11",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "from your package manager, e.g. pip:",
      "text": "from your package manager, e.g. pip:"
    },
    {
      "self_ref": "#/texts/12",
      "parent": {
        "$ref": "#/groups/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "pip install docling",
      "text": "pip install docling",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/13",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "To convert individual documents with python, use",
      "text": "To convert individual documents with python, use"
    },
    {
      "self_ref": "#/texts/14",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "convert()",
      "text": "convert()",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/15",
      "parent": {
        "$ref": "#/groups/2"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": ", for example:",
      "text": ", for example:"
    },
    {
      "self_ref": "#/texts/16",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
      "text": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/17",
      "parent": {
        "$ref": "#/groups/3"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "The program will output:",
      "text": "The program will output:"
    },
    {
      "self_ref": "#/texts/18",
      "parent": {
        "$ref": "#/groups/3"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "## Docling Technical Report[...]",
      "text": "## Docling Technical Report[...]",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/19",
      "parent": {
        "$ref": "#/texts/1"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Prefetch the models:",
      "text": "Prefetch the models:"
    },
    {
      "self_ref": "#/texts/20",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [
        {
          "$ref": "#/groups/5"
        }
      ],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
      "orig": "",
      "text": "",
      "enumerated": false,
      "marker": ""
    },
    {
      "self_ref": "#/texts/21",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Use the",
      "text": "Use the"
    },
    {
      "self_ref": "#/texts/22",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "docling-tools models download",
      "text": "docling-tools models download",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/23",
      "parent": {
        "$ref": "#/groups/5"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "utility:",
      "text": "utility:"
    },
    {
      "self_ref": "#/texts/24",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [
        {
          "$ref": "#/groups/6"
        }
      ],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
      "orig": "",
      "text": "",
      "enumerated": false,
      "marker": ""
    },
    {
      "self_ref": "#/texts/25",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Alternatively, models can be programmatically downloaded using",
      "text": "Alternatively, models can be programmatically downloaded using"
    },
    {
      "self_ref": "#/texts/26",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "docling.utils.model_downloader.download_models()",
      "text": "docling.utils.model_downloader.download_models()",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    },
    {
      "self_ref": "#/texts/27",
      "parent": {
        "$ref": "#/groups/6"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": ".",
      "text": "."
    },
    {
      "self_ref": "#/texts/28",
      "parent": {
        "$ref": "#/groups/4"
      },
      "children": [
        {
          "$ref": "#/groups/7"
        }
      ],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
      "orig": "",
      "text": "",
      "enumerated": false,
      "marker": ""
    },
    {
      "self_ref": "#/texts/29",
      "parent": {
        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "text",
      "prov": [],
      "orig": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:",
      "text": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:"
    },
    {
      "self_ref": "#/texts/30",
      "parent": {
        "$ref": "#/groups/7"
      },
      "children": [],
      "content_layer": "body",
      "label": "code",
      "prov": [],
      "orig": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
      "text": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
      "captions": [],
      "references": [],
      "footnotes": [],
      "code_language": "unknown"
    }
  ],
  "pictures": [],
  "tables": [],
  "key_value_items": [],
  "form_items": [],
  "pages": {}
 }
--- a/tests/data/groundtruth/docling_v2/html_code_snippets.html.md
+++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.md
@@ -0,0 +1,24 @@
 # Code snippets
 The Pythagorean theorem can be written as an equation relating the lengths of the sides *a* , *b* and the hypotenuse *c* .
 To use Docling, simply install `docling` from your package manager, e.g. pip: `pip install docling`
 To convert individual documents with python, use `convert()` , for example:
 ```
 from docling.document_converter import DocumentConverter
 source = "https://arxiv.org/pdf/2408.09869"
 converter = DocumentConverter()
 result = converter.convert(source)
 print(result.document.export_to_markdown())
 ```
 The program will output: `## Docling Technical Report[...]`
 Prefetch the models:
 - Use the `docling-tools models download` utility:
 - Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()` .
 - Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: `$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...`
--- a/tests/data/html/html_code_snippets.html
+++ b/tests/data/html/html_code_snippets.html
@@ -0,0 +1,41 @@
 <!DOCTYPE html>
 <html>
 <head>
  <meta charset="UTF-8">
  <title>Code snippets in HTML</title>
 </head>
 <body>
 <h1>Code snippets</h1>
 <p>The Pythagorean theorem can be written as an equation relating the lengths of the sides <var>a</var>, <var>b</var> and the hypotenuse <var>c</var>.</p>
 <p>To use Docling, simply install <code>docling</code>from your package manager, e.g. pip:
    <kbd>pip install docling</kbd>
 </p>
 <p>To convert individual documents with python, use <code>convert()</code>, for example:</p>
 <pre><code>
 from docling.document_converter import DocumentConverter
 source = "https://arxiv.org/pdf/2408.09869"
 converter = DocumentConverter()
 result = converter.convert(source)
 print(result.document.export_to_markdown())
 </code></pre>
 <p>The program will output:
    <samp>## Docling Technical Report[...]</samp>
 </p>
 <p>Prefetch the models:</p>
 <ul>
    <li>Use the <code>docling-tools models download</code> utility:</li>
    <li>Alternatively, models can be programmatically downloaded using <samp>docling.utils.model_downloader.download_models()</samp>.</li>
    <li>Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:
        <pre><code>
            $ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
            Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
        </code></pre>
        <pre hidden><code>$ docling-tools</code></pre>
    </li>
 </ul>
 </body>
 </html>