From fa3327e1a6f7f4de6d5fd1f83588d6b5cac324ca Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Tue, 26 Aug 2025 06:43:48 +0200
Subject: [PATCH] fix(html): preserve code blocks in list items (#2131)

* chore(html): refactor parser to leverage context managers

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): parse inline code snippets, also from list items

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(html): remove hidden tags

Remove tags that are not meant to be displayed.
Add regression tests for code blocks, inline code, and hidden tags.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling/backend/html_backend.py               | 248 +++++--
 .../docling_v2/html_code_snippets.html.itxt   |  39 +
 .../docling_v2/html_code_snippets.html.json   | 674 ++++++++++++++++++
 .../docling_v2/html_code_snippets.html.md     |  24 +
 tests/data/html/html_code_snippets.html       |  41 ++
 5 files changed, 950 insertions(+), 76 deletions(-)
 create mode 100644 tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
 create mode 100644 tests/data/groundtruth/docling_v2/html_code_snippets.html.json
 create mode 100644 tests/data/groundtruth/docling_v2/html_code_snippets.html.md
 create mode 100644 tests/data/html/html_code_snippets.html

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index b24df93e..2334c645 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,6 @@
 import logging
 import re
+import traceback
 from contextlib import contextmanager
 from copy import deepcopy
 from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
     "h4",
     "h5",
     "h6",
+    "ol",
     "p",
     "pre",
-    "code",
-    "ul",
-    "ol",
     "summary",
     "table",
+    "ul",
 }
 
+_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
+
 _FORMAT_TAG_MAP: Final = {
     "b": {"bold": True},
     "strong": {"bold": True},
     "i": {"italic": True},
     "em": {"italic": True},
+    "var": {"italic": True},
     # "mark",
     # "small",
     "s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
     "ins": {"underline": True},
     "sub": {"script": Script.SUB},
     "sup": {"script": Script.SUPER},
+    **{k: {} for k in _CODE_TAG_SET},
 }
 
 
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
     text: str
     hyperlink: Union[AnyUrl, Path, None] = None
     formatting: Union[Formatting, None] = None
+    code: bool = False
 
 
 class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
         current_h = None
         current_text = ""
         current_f = None
+        current_code = False
         for at in self:
             t = at.text
             h = at.hyperlink
             f = at.formatting
+            c = at.code
             current_text += t.strip() + " "
             if f is not None and current_f is None:
                 current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
                 _log.warning(
                     f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                 )
+            current_code = c if c else current_code
+
         return AnnotatedText(
-            text=current_text.strip(), hyperlink=current_h, formatting=current_f
+            text=current_text.strip(),
+            hyperlink=current_h,
+            formatting=current_f,
+            code=current_code,
         )
 
     def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
         text = self[0].text
         hyperlink = self[0].hyperlink
         formatting = self[0].formatting
+        code = self[0].code
         last_elm = text
         for i in range(1, len(self)):
-            if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
+            if (
+                hyperlink == self[i].hyperlink
+                and formatting == self[i].formatting
+                and code == self[i].code
+            ):
                 sep = " "
                 if not self[i].text.strip() or not last_elm.strip():
                     sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
                 last_elm = self[i].text
             else:
                 simplified.append(
-                    AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                    AnnotatedText(
+                        text=text, hyperlink=hyperlink, formatting=formatting, code=code
+                    )
                 )
                 text = self[i].text
                 last_elm = text
                 hyperlink = self[i].hyperlink
                 formatting = self[i].formatting
+                code = self[i].code
         if text:
             simplified.append(
-                AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                AnnotatedText(
+                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
+                )
             )
         return simplified
 
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx = _Context()
         for i in range(self.max_levels):
             self.parents[i] = None
-        self.hyperlink = None
+        self.hyperlink: Union[AnyUrl, Path, None] = None
         self.original_url = original_url
         self.format_tags: list[str] = []
 
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 orig=title_text,
                 content_layer=ContentLayer.FURNITURE,
             )
-        # remove scripts/styles
+        # remove script and style tags
         for tag in self.soup(["script", "style"]):
             tag.decompose()
+        # remove any hidden tag
+        for tag in self.soup(hidden=True):
+            tag.decompose()
+
         content = self.soup.body or self.soup
         # normalize <br> tags
         for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         def flush_buffer():
             if not buffer:
                 return
-            annotated_text_list = buffer.simplify_text_elements()
+            annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
             parts = annotated_text_list.split_by_newline()
             buffer.clear()
 
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 return
 
             for annotated_text_list in parts:
-                with self.use_inline_group(annotated_text_list, doc):
+                with self._use_inline_group(annotated_text_list, doc):
                     for annotated_text in annotated_text_list:
                         if annotated_text.text.strip():
                             seg_clean = HTMLDocumentBackend._clean_unicode(
                                 annotated_text.text.strip()
                             )
-                            doc.add_text(
-                                parent=self.parents[self.level],
-                                label=DocItemLabel.TEXT,
-                                text=seg_clean,
-                                content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
-                            )
+                            if annotated_text.code:
+                                doc.add_code(
+                                    parent=self.parents[self.level],
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                            else:
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
 
         for node in element.contents:
             if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     flush_buffer()
                     self._emit_image(node, doc)
                 elif name in _FORMAT_TAG_MAP:
-                    with self.use_format([name]):
+                    with self._use_format([name]):
                         self._walk(node, doc)
                 elif name == "a":
-                    with self.use_hyperlink(node):
+                    with self._use_hyperlink(node):
                         self._walk(node, doc)
                 elif name in _BLOCK_TAGS:
                     flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             this_parent = item.parent
             while this_parent is not None:
                 if this_parent.name == "a" and this_parent.get("href"):
-                    with self.use_format(format_tags):
-                        with self.use_hyperlink(this_parent):
+                    with self._use_format(format_tags):
+                        with self._use_hyperlink(this_parent):
                             return self._extract_text_and_hyperlink_recursively(
                                 item, ignore_list
                             )
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         if isinstance(item, NavigableString):
             text = item.strip()
+            code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
             if text:
                 return AnnotatedTextList(
                     [
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             text=text,
                             hyperlink=self.hyperlink,
                             formatting=self._formatting,
+                            code=code,
                         )
                     ]
                 )
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             text="\n",
                             hyperlink=self.hyperlink,
                             formatting=self._formatting,
+                            code=code,
                         )
                     ]
                 )
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if not ignore_list or (tag.name not in ["ul", "ol"]):
             for child in tag:
                 if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
-                    with self.use_format([child.name]):
+                    with self._use_format([child.name]):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
                                 child, ignore_list, keep_newlines=keep_newlines
                             )
                         )
                 elif isinstance(child, Tag) and child.name == "a":
-                    with self.use_hyperlink(child):
+                    with self._use_hyperlink(child):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
                                 child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return result
 
     @contextmanager
-    def use_hyperlink(self, tag):
+    def _use_hyperlink(self, tag: Tag):
         this_href = tag.get("href")
         if this_href is None:
             yield None
         else:
-            if this_href:
-                old_hyperlink = self.hyperlink
+            if isinstance(this_href, str) and this_href:
+                old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
+                new_hyperlink: Union[AnyUrl, Path, None] = None
                 if self.original_url is not None:
-                    this_href = urljoin(self.original_url, this_href)
+                    this_href = urljoin(str(self.original_url), str(this_href))
                 # ugly fix for relative links since pydantic does not support them.
                 try:
-                    AnyUrl(this_href)
+                    new_hyperlink = AnyUrl(this_href)
                 except ValidationError:
-                    this_href = Path(this_href)
-                self.hyperlink = this_href
+                    new_hyperlink = Path(this_href)
+                self.hyperlink = new_hyperlink
             try:
                 yield None
             finally:
-                if this_href:
+                if isinstance(this_href, str) and this_href:  # same guard as on entry
                     self.hyperlink = old_hyperlink
 
     @contextmanager
-    def use_format(self, tags: list[str]):
+    def _use_format(self, tags: list[str]):
         if not tags:
             yield None
         else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.format_tags = self.format_tags[: -len(tags)]
 
     @contextmanager
-    def use_inline_group(
+    def _use_inline_group(
         self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
     ):
         """Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         Args:
             annotated_text_list (AnnotatedTextList): Annotated text
             doc (DoclingDocument): Currently used document
-
-        Yields:
-            None: _description_
         """
         if len(annotated_text_list) > 1:
             inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             yield None
 
+    @contextmanager
+    def _use_details(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with the content of a details tag.
+
+        While the context manager is active, the hierarchy level is set one
+        level higher than the current parent.
+
+        Args:
+            tag: The details tag.
+            doc: Currently used document.
+        """
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            self.parents[self.level + 1] = None
+            self.level -= 1
+
+    @contextmanager
+    def _use_footer(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with a footer.
+
+        Create a group with the content of a footer tag. While the context manager
+        is active, the hierarchy level is set one level higher than the current parent.
+
+        Args:
+            tag: The footer tag.
+            doc: Currently used document.
+        """
+        current_layer = self.content_layer  # saved so it can be restored on exit
+        self.content_layer = ContentLayer.FURNITURE  # footer group uses this layer
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            self.parents[self.level + 1] = None
+            self.level -= 1
+            self.content_layer = current_layer
+
     def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
         tag_name = tag.name.lower()
         # set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             content_layer=self.content_layer,
                         )
                         self.level += 1
-                        with self.use_inline_group(min_parts, doc):
+                        with self._use_inline_group(min_parts, doc):
                             for annotated_text in min_parts:
                                 li_text = re.sub(
                                     r"\s+|\n+", " ", annotated_text.text
                                 ).strip()
                                 li_clean = HTMLDocumentBackend._clean_unicode(li_text)
-                                doc.add_text(
-                                    parent=self.parents[self.level],
-                                    label=DocItemLabel.TEXT,
-                                    text=li_clean,
-                                    content_layer=self.content_layer,
-                                    formatting=annotated_text.formatting,
-                                    hyperlink=annotated_text.hyperlink,
-                                )
+                                if annotated_text.code:
+                                    doc.add_code(
+                                        parent=self.parents[self.level],
+                                        text=li_clean,
+                                        content_layer=self.content_layer,
+                                        formatting=annotated_text.formatting,
+                                        hyperlink=annotated_text.hyperlink,
+                                    )
+                                else:
+                                    doc.add_text(
+                                        parent=self.parents[self.level],
+                                        label=DocItemLabel.TEXT,
+                                        text=li_clean,
+                                        content_layer=self.content_layer,
+                                        formatting=annotated_text.formatting,
+                                        hyperlink=annotated_text.hyperlink,
+                                    )
 
                         # 4) recurse into any nested lists, attaching them to this <li> item
                         for sublist in li({"ul", "ol"}, recursive=False):
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text_list = self._extract_text_and_hyperlink_recursively(
                 tag, find_parent_annotation=True
             )
-            annotated_texts = text_list.simplify_text_elements()
+            annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
             for part in annotated_texts.split_by_newline():
-                with self.use_inline_group(part, doc):
+                with self._use_inline_group(part, doc):
                     for annotated_text in part:
                         if seg := annotated_text.text.strip():
                             seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                            doc.add_text(
-                                parent=self.parents[self.level],
-                                label=DocItemLabel.TEXT,
-                                text=seg_clean,
-                                content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
-                            )
+                            if annotated_text.code:
+                                doc.add_code(
+                                    parent=self.parents[self.level],
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                            else:
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
 
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     content_layer=self.content_layer,
                 )
 
-        elif tag_name in {"pre", "code"}:
+        elif tag_name in {"pre"}:
             # handle monospace code snippets (pre).
             text_list = self._extract_text_and_hyperlink_recursively(
-                tag, find_parent_annotation=True
+                tag, find_parent_annotation=True, keep_newlines=True
             )
             annotated_texts = text_list.simplify_text_elements()
-            with self.use_inline_group(annotated_texts, doc):
+            with self._use_inline_group(annotated_texts, doc):
                 for annotated_text in annotated_texts:
                     text_clean = HTMLDocumentBackend._clean_unicode(
                         annotated_text.text.strip()
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         hyperlink=annotated_text.hyperlink,
                     )
 
-        elif tag_name in {"details", "footer"}:
-            if tag_name == "footer":
-                current_layer = self.content_layer
-                self.content_layer = ContentLayer.FURNITURE
-            self.parents[self.level + 1] = doc.add_group(
-                name=tag_name,
-                label=GroupLabel.SECTION,
-                parent=self.parents[self.level],
-                content_layer=self.content_layer,
-            )
-            self.level += 1
-            self._walk(tag, doc)
-            self.parents[self.level + 1] = None
-            self.level -= 1
-            if tag_name == "footer":
-                self.content_layer = current_layer
+        elif tag_name == "footer":
+            with self._use_footer(tag, doc):
+                self._walk(tag, doc)
+
+        elif tag_name == "details":
+            with self._use_details(tag, doc):
+                self._walk(tag, doc)
 
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")
diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt b/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
new file mode 100644
index 00000000..77f11c65
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
@@ -0,0 +1,39 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: title: Code snippets
+    item-2 at level 2: inline: group group
+      item-3 at level 3: text: The Pythagorean theorem can be w ... tion relating the lengths of the sides
+      item-4 at level 3: text: a
+      item-5 at level 3: text: ,
+      item-6 at level 3: text: b
+      item-7 at level 3: text: and the hypotenuse
+      item-8 at level 3: text: c
+      item-9 at level 3: text: .
+    item-10 at level 2: inline: group group
+      item-11 at level 3: text: To use Docling, simply install
+      item-12 at level 3: code: docling
+      item-13 at level 3: text: from your package manager, e.g. pip:
+      item-14 at level 3: code: pip install docling
+    item-15 at level 2: inline: group group
+      item-16 at level 3: text: To convert individual documents with python, use
+      item-17 at level 3: code: convert()
+      item-18 at level 3: text: , for example:
+    item-19 at level 2: code: from docling.document_converter  ... (result.document.export_to_markdown())
+    item-20 at level 2: inline: group group
+      item-21 at level 3: text: The program will output:
+      item-22 at level 3: code: ## Docling Technical Report[...]
+    item-23 at level 2: text: Prefetch the models:
+    item-24 at level 2: list: group list
+      item-25 at level 3: list_item: 
+        item-26 at level 4: inline: group group
+          item-27 at level 5: text: Use the
+          item-28 at level 5: code: docling-tools models download
+          item-29 at level 5: text: utility:
+      item-30 at level 3: list_item: 
+        item-31 at level 4: inline: group group
+          item-32 at level 5: text: Alternatively, models can be programmatically downloaded using
+          item-33 at level 5: code: docling.utils.model_downloader.download_models()
+          item-34 at level 5: text: .
+      item-35 at level 3: list_item: 
+        item-36 at level 4: inline: group group
+          item-37 at level 5: text: Also, you can use download-hf-re ... rom HuggingFace by specifying repo id:
+          item-38 at level 5: code: $ docling-tools models download- ... 256M-preview model from HuggingFace...
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.json b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json
new file mode 100644
index 00000000..aaa239b0
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json
@@ -0,0 +1,674 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.5.0",
+  "name": "html_code_snippets",
+  "origin": {
+    "mimetype": "text/html",
+    "binary_hash": 8468578485215893920,
+    "filename": "html_code_snippets.html"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        },
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/9"
+        },
+        {
+          "$ref": "#/texts/10"
+        },
+        {
+          "$ref": "#/texts/11"
+        },
+        {
+          "$ref": "#/texts/12"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/2",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/13"
+        },
+        {
+          "$ref": "#/texts/14"
+        },
+        {
+          "$ref": "#/texts/15"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/3",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/17"
+        },
+        {
+          "$ref": "#/texts/18"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/4",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/20"
+        },
+        {
+          "$ref": "#/texts/24"
+        },
+        {
+          "$ref": "#/texts/28"
+        }
+      ],
+      "content_layer": "body",
+      "name": "list",
+      "label": "list"
+    },
+    {
+      "self_ref": "#/groups/5",
+      "parent": {
+        "$ref": "#/texts/20"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/21"
+        },
+        {
+          "$ref": "#/texts/22"
+        },
+        {
+          "$ref": "#/texts/23"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/6",
+      "parent": {
+        "$ref": "#/texts/24"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/25"
+        },
+        {
+          "$ref": "#/texts/26"
+        },
+        {
+          "$ref": "#/texts/27"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/7",
+      "parent": {
+        "$ref": "#/texts/28"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/29"
+        },
+        {
+          "$ref": "#/texts/30"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "furniture",
+      "label": "title",
+      "prov": [],
+      "orig": "Code snippets in HTML",
+      "text": "Code snippets in HTML"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/groups/0"
+        },
+        {
+          "$ref": "#/groups/1"
+        },
+        {
+          "$ref": "#/groups/2"
+        },
+        {
+          "$ref": "#/texts/16"
+        },
+        {
+          "$ref": "#/groups/3"
+        },
+        {
+          "$ref": "#/texts/19"
+        },
+        {
+          "$ref": "#/groups/4"
+        }
+      ],
+      "content_layer": "body",
+      "label": "title",
+      "prov": [],
+      "orig": "Code snippets",
+      "text": "Code snippets"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "The Pythagorean theorem can be written as an equation relating the lengths of the sides",
+      "text": "The Pythagorean theorem can be written as an equation relating the lengths of the sides"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "a",
+      "text": "a",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": ",",
+      "text": ","
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "b",
+      "text": "b",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "and the hypotenuse",
+      "text": "and the hypotenuse"
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "c",
+      "text": "c",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": ".",
+      "text": "."
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "To use Docling, simply install",
+      "text": "To use Docling, simply install"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "docling",
+      "text": "docling",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "from your package manager, e.g. pip:",
+      "text": "from your package manager, e.g. pip:"
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "pip install docling",
+      "text": "pip install docling",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/13",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "To convert individual documents with python, use",
+      "text": "To convert individual documents with python, use"
+    },
+    {
+      "self_ref": "#/texts/14",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "convert()",
+      "text": "convert()",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/15",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": ", for example:",
+      "text": ", for example:"
+    },
+    {
+      "self_ref": "#/texts/16",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
+      "text": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/17",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "The program will output:",
+      "text": "The program will output:"
+    },
+    {
+      "self_ref": "#/texts/18",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "## Docling Technical Report[...]",
+      "text": "## Docling Technical Report[...]",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/19",
+      "parent": {
+        "$ref": "#/texts/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Prefetch the models:",
+      "text": "Prefetch the models:"
+    },
+    {
+      "self_ref": "#/texts/20",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [
+        {
+          "$ref": "#/groups/5"
+        }
+      ],
+      "content_layer": "body",
+      "label": "list_item",
+      "prov": [],
+      "orig": "",
+      "text": "",
+      "enumerated": false,
+      "marker": ""
+    },
+    {
+      "self_ref": "#/texts/21",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Use the",
+      "text": "Use the"
+    },
+    {
+      "self_ref": "#/texts/22",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "docling-tools models download",
+      "text": "docling-tools models download",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/23",
+      "parent": {
+        "$ref": "#/groups/5"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "utility:",
+      "text": "utility:"
+    },
+    {
+      "self_ref": "#/texts/24",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [
+        {
+          "$ref": "#/groups/6"
+        }
+      ],
+      "content_layer": "body",
+      "label": "list_item",
+      "prov": [],
+      "orig": "",
+      "text": "",
+      "enumerated": false,
+      "marker": ""
+    },
+    {
+      "self_ref": "#/texts/25",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Alternatively, models can be programmatically downloaded using",
+      "text": "Alternatively, models can be programmatically downloaded using"
+    },
+    {
+      "self_ref": "#/texts/26",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "docling.utils.model_downloader.download_models()",
+      "text": "docling.utils.model_downloader.download_models()",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    },
+    {
+      "self_ref": "#/texts/27",
+      "parent": {
+        "$ref": "#/groups/6"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": ".",
+      "text": "."
+    },
+    {
+      "self_ref": "#/texts/28",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [
+        {
+          "$ref": "#/groups/7"
+        }
+      ],
+      "content_layer": "body",
+      "label": "list_item",
+      "prov": [],
+      "orig": "",
+      "text": "",
+      "enumerated": false,
+      "marker": ""
+    },
+    {
+      "self_ref": "#/texts/29",
+      "parent": {
+        "$ref": "#/groups/7"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:",
+      "text": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:"
+    },
+    {
+      "self_ref": "#/texts/30",
+      "parent": {
+        "$ref": "#/groups/7"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "code",
+      "prov": [],
+      "orig": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
+      "text": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "code_language": "unknown"
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.md b/tests/data/groundtruth/docling_v2/html_code_snippets.html.md
new file mode 100644
index 00000000..8228e042
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.md
@@ -0,0 +1,24 @@
+# Code snippets
+
+The Pythagorean theorem can be written as an equation relating the lengths of the sides *a* , *b* and the hypotenuse *c* .
+
+To use Docling, simply install `docling` from your package manager, e.g. pip: `pip install docling`
+
+To convert individual documents with python, use `convert()` , for example:
+
+```
+from docling.document_converter import DocumentConverter
+
+source = "https://arxiv.org/pdf/2408.09869"
+converter = DocumentConverter()
+result = converter.convert(source)
+print(result.document.export_to_markdown())
+```
+
+The program will output: `## Docling Technical Report[...]`
+
+Prefetch the models:
+
+- Use the `docling-tools models download` utility:
+- Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()` .
+- Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: `$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...`
\ No newline at end of file
diff --git a/tests/data/html/html_code_snippets.html b/tests/data/html/html_code_snippets.html
new file mode 100644
index 00000000..3858171e
--- /dev/null
+++ b/tests/data/html/html_code_snippets.html
@@ -0,0 +1,41 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="UTF-8">
+  <title>Code snippets in HTML</title>
+</head>
+<body>
+
+<h1>Code snippets</h1>
+
+<p>The Pythagorean theorem can be written as an equation relating the lengths of the sides <var>a</var>, <var>b</var> and the hypotenuse <var>c</var>.</p>
+<p>To use Docling, simply install <code>docling</code>from your package manager, e.g. pip:
+    <kbd>pip install docling</kbd>
+</p>
+<p>To convert individual documents with python, use <code>convert()</code>, for example:</p>
+<pre><code>
+from docling.document_converter import DocumentConverter
+
+source = "https://arxiv.org/pdf/2408.09869"
+converter = DocumentConverter()
+result = converter.convert(source)
+print(result.document.export_to_markdown())
+</code></pre>
+<p>The program will output:
+    <samp>## Docling Technical Report[...]</samp>
+</p>
+
+<p>Prefetch the models:</p>
+<ul>
+    <li>Use the <code>docling-tools models download</code> utility:</li>
+    <li>Alternatively, models can be programmatically downloaded using <samp>docling.utils.model_downloader.download_models()</samp>.</li>
+    <li>Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:
+        <pre><code>
+            $ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
+            Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
+        </code></pre>
+        <pre hidden><code>$ docling-tools</code></pre>
+    </li>
+</ul>
+</body>
+</html>