fix(html): preserve code blocks in list items (#2131)

* chore(html): refactor parser to leverage context managers

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): parse inline code snippets, also from list items

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(html): remove hidden tags

Remove tags that are not meant to be displayed.
Add regression tests for code blocks, inline code, and hidden tags.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-08-26 06:43:48 +02:00
committed by GitHub
parent c0268416cf
commit fa3327e1a6
5 changed files with 950 additions and 76 deletions

View File

@@ -1,5 +1,6 @@
import logging import logging
import re import re
import traceback
from contextlib import contextmanager from contextlib import contextmanager
from copy import deepcopy from copy import deepcopy
from io import BytesIO from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
"h4", "h4",
"h5", "h5",
"h6", "h6",
"ol",
"p", "p",
"pre", "pre",
"code",
"ul",
"ol",
"summary", "summary",
"table", "table",
"ul",
} }
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
_FORMAT_TAG_MAP: Final = { _FORMAT_TAG_MAP: Final = {
"b": {"bold": True}, "b": {"bold": True},
"strong": {"bold": True}, "strong": {"bold": True},
"i": {"italic": True}, "i": {"italic": True},
"em": {"italic": True}, "em": {"italic": True},
"var": {"italic": True},
# "mark", # "mark",
# "small", # "small",
"s": {"strikethrough": True}, "s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
"ins": {"underline": True}, "ins": {"underline": True},
"sub": {"script": Script.SUB}, "sub": {"script": Script.SUB},
"sup": {"script": Script.SUPER}, "sup": {"script": Script.SUPER},
**{k: {} for k in _CODE_TAG_SET},
} }
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
text: str text: str
hyperlink: Union[AnyUrl, Path, None] = None hyperlink: Union[AnyUrl, Path, None] = None
formatting: Union[Formatting, None] = None formatting: Union[Formatting, None] = None
code: bool = False
class AnnotatedTextList(list): class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
current_h = None current_h = None
current_text = "" current_text = ""
current_f = None current_f = None
current_code = False
for at in self: for at in self:
t = at.text t = at.text
h = at.hyperlink h = at.hyperlink
f = at.formatting f = at.formatting
c = at.code
current_text += t.strip() + " " current_text += t.strip() + " "
if f is not None and current_f is None: if f is not None and current_f is None:
current_f = f current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
_log.warning( _log.warning(
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'" f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
) )
current_code = c if c else current_code
return AnnotatedText( return AnnotatedText(
text=current_text.strip(), hyperlink=current_h, formatting=current_f text=current_text.strip(),
hyperlink=current_h,
formatting=current_f,
code=current_code,
) )
def simplify_text_elements(self) -> "AnnotatedTextList": def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
text = self[0].text text = self[0].text
hyperlink = self[0].hyperlink hyperlink = self[0].hyperlink
formatting = self[0].formatting formatting = self[0].formatting
code = self[0].code
last_elm = text last_elm = text
for i in range(1, len(self)): for i in range(1, len(self)):
if hyperlink == self[i].hyperlink and formatting == self[i].formatting: if (
hyperlink == self[i].hyperlink
and formatting == self[i].formatting
and code == self[i].code
):
sep = " " sep = " "
if not self[i].text.strip() or not last_elm.strip(): if not self[i].text.strip() or not last_elm.strip():
sep = "" sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
last_elm = self[i].text last_elm = self[i].text
else: else:
simplified.append( simplified.append(
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting) AnnotatedText(
text=text, hyperlink=hyperlink, formatting=formatting, code=code
)
) )
text = self[i].text text = self[i].text
last_elm = text last_elm = text
hyperlink = self[i].hyperlink hyperlink = self[i].hyperlink
formatting = self[i].formatting formatting = self[i].formatting
code = self[i].code
if text: if text:
simplified.append( simplified.append(
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting) AnnotatedText(
text=text, hyperlink=hyperlink, formatting=formatting, code=code
)
) )
return simplified return simplified
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.ctx = _Context() self.ctx = _Context()
for i in range(self.max_levels): for i in range(self.max_levels):
self.parents[i] = None self.parents[i] = None
self.hyperlink = None self.hyperlink: Union[AnyUrl, Path, None] = None
self.original_url = original_url self.original_url = original_url
self.format_tags: list[str] = [] self.format_tags: list[str] = []
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
orig=title_text, orig=title_text,
content_layer=ContentLayer.FURNITURE, content_layer=ContentLayer.FURNITURE,
) )
# remove scripts/styles # remove script and style tags
for tag in self.soup(["script", "style"]): for tag in self.soup(["script", "style"]):
tag.decompose() tag.decompose()
# remove any hidden tag
for tag in self.soup(hidden=True):
tag.decompose()
content = self.soup.body or self.soup content = self.soup.body or self.soup
# normalize <br> tags # normalize <br> tags
for br in content("br"): for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def flush_buffer(): def flush_buffer():
if not buffer: if not buffer:
return return
annotated_text_list = buffer.simplify_text_elements() annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
parts = annotated_text_list.split_by_newline() parts = annotated_text_list.split_by_newline()
buffer.clear() buffer.clear()
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return return
for annotated_text_list in parts: for annotated_text_list in parts:
with self.use_inline_group(annotated_text_list, doc): with self._use_inline_group(annotated_text_list, doc):
for annotated_text in annotated_text_list: for annotated_text in annotated_text_list:
if annotated_text.text.strip(): if annotated_text.text.strip():
seg_clean = HTMLDocumentBackend._clean_unicode( seg_clean = HTMLDocumentBackend._clean_unicode(
annotated_text.text.strip() annotated_text.text.strip()
) )
doc.add_text( if annotated_text.code:
parent=self.parents[self.level], doc.add_code(
label=DocItemLabel.TEXT, parent=self.parents[self.level],
text=seg_clean, text=seg_clean,
content_layer=self.content_layer, content_layer=self.content_layer,
formatting=annotated_text.formatting, formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink, hyperlink=annotated_text.hyperlink,
) )
else:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
for node in element.contents: for node in element.contents:
if isinstance(node, Tag): if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
flush_buffer() flush_buffer()
self._emit_image(node, doc) self._emit_image(node, doc)
elif name in _FORMAT_TAG_MAP: elif name in _FORMAT_TAG_MAP:
with self.use_format([name]): with self._use_format([name]):
self._walk(node, doc) self._walk(node, doc)
elif name == "a": elif name == "a":
with self.use_hyperlink(node): with self._use_hyperlink(node):
self._walk(node, doc) self._walk(node, doc)
elif name in _BLOCK_TAGS: elif name in _BLOCK_TAGS:
flush_buffer() flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
this_parent = item.parent this_parent = item.parent
while this_parent is not None: while this_parent is not None:
if this_parent.name == "a" and this_parent.get("href"): if this_parent.name == "a" and this_parent.get("href"):
with self.use_format(format_tags): with self._use_format(format_tags):
with self.use_hyperlink(this_parent): with self._use_hyperlink(this_parent):
return self._extract_text_and_hyperlink_recursively( return self._extract_text_and_hyperlink_recursively(
item, ignore_list item, ignore_list
) )
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, NavigableString): if isinstance(item, NavigableString):
text = item.strip() text = item.strip()
code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
if text: if text:
return AnnotatedTextList( return AnnotatedTextList(
[ [
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text=text, text=text,
hyperlink=self.hyperlink, hyperlink=self.hyperlink,
formatting=self._formatting, formatting=self._formatting,
code=code,
) )
] ]
) )
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text="\n", text="\n",
hyperlink=self.hyperlink, hyperlink=self.hyperlink,
formatting=self._formatting, formatting=self._formatting,
code=code,
) )
] ]
) )
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if not ignore_list or (tag.name not in ["ul", "ol"]): if not ignore_list or (tag.name not in ["ul", "ol"]):
for child in tag: for child in tag:
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP: if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
with self.use_format([child.name]): with self._use_format([child.name]):
result.extend( result.extend(
self._extract_text_and_hyperlink_recursively( self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines child, ignore_list, keep_newlines=keep_newlines
) )
) )
elif isinstance(child, Tag) and child.name == "a": elif isinstance(child, Tag) and child.name == "a":
with self.use_hyperlink(child): with self._use_hyperlink(child):
result.extend( result.extend(
self._extract_text_and_hyperlink_recursively( self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return result return result
@contextmanager @contextmanager
def use_hyperlink(self, tag): def _use_hyperlink(self, tag: Tag):
this_href = tag.get("href") this_href = tag.get("href")
if this_href is None: if this_href is None:
yield None yield None
else: else:
if this_href: if isinstance(this_href, str) and this_href:
old_hyperlink = self.hyperlink old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
new_hyperlink: Union[AnyUrl, Path, None] = None
if self.original_url is not None: if self.original_url is not None:
this_href = urljoin(self.original_url, this_href) this_href = urljoin(str(self.original_url), str(this_href))
# ugly fix for relative links since pydantic does not support them. # ugly fix for relative links since pydantic does not support them.
try: try:
AnyUrl(this_href) new_hyperlink = AnyUrl(this_href)
except ValidationError: except ValidationError:
this_href = Path(this_href) new_hyperlink = Path(this_href)
self.hyperlink = this_href self.hyperlink = new_hyperlink
try: try:
yield None yield None
finally: finally:
if this_href: if new_hyperlink:
self.hyperlink = old_hyperlink self.hyperlink = old_hyperlink
@contextmanager @contextmanager
def use_format(self, tags: list[str]): def _use_format(self, tags: list[str]):
if not tags: if not tags:
yield None yield None
else: else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.format_tags = self.format_tags[: -len(tags)] self.format_tags = self.format_tags[: -len(tags)]
@contextmanager @contextmanager
def use_inline_group( def _use_inline_group(
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
): ):
"""Create an inline group for annotated texts. """Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
Args: Args:
annotated_text_list (AnnotatedTextList): Annotated text annotated_text_list (AnnotatedTextList): Annotated text
doc (DoclingDocument): Currently used document doc (DoclingDocument): Currently used document
Yields:
None: _description_
""" """
if len(annotated_text_list) > 1: if len(annotated_text_list) > 1:
inline_fmt = doc.add_group( inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
else: else:
yield None yield None
@contextmanager
def _use_details(self, tag: Tag, doc: DoclingDocument):
    """Open a section group that collects the content of a details tag.

    A new group is added to the document under the current parent and
    registered as the parent of the next hierarchy level. While the context
    manager is active, the hierarchy level is one higher than on entry, so
    items added in the meantime are attached to that group. The level is
    decremented again on exit, also when the managed block raises.

    Args:
        tag: The details tag.
        doc: Currently used document.

    Yields:
        None.
    """
    section_group = doc.add_group(
        name=tag.name,
        label=GroupLabel.SECTION,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
    self.parents[self.level + 1] = section_group
    self.level += 1
    try:
        yield None
    finally:
        # NOTE(review): self.level already points at the group's own slot
        # here, so this resets the slot one level below the group and leaves
        # the group's entry in place - confirm that this is intended.
        self.parents[self.level + 1] = None
        self.level -= 1
@contextmanager
def _use_footer(self, tag: Tag, doc: DoclingDocument):
    """Create a group with the content of a footer tag.

    The content layer is switched to ``ContentLayer.FURNITURE`` and a new
    section group is added to the document under the current parent. While
    the context manager is active, the hierarchy level is one higher than on
    entry, so items added in the meantime are attached to that group. On exit
    the level is decremented and the previous content layer is restored.

    Args:
        tag: The footer tag.
        doc: Currently used document.

    Yields:
        None.
    """
    current_layer = self.content_layer
    self.content_layer = ContentLayer.FURNITURE
    try:
        self.parents[self.level + 1] = doc.add_group(
            name=tag.name,
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.level += 1
        try:
            yield None
        finally:
            # Only undo the level change once it has actually been applied.
            self.parents[self.level + 1] = None
            self.level -= 1
    finally:
        # Restore the layer even if creating the group failed; otherwise the
        # rest of the document would be emitted as furniture.
        self.content_layer = current_layer
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None: def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
tag_name = tag.name.lower() tag_name = tag.name.lower()
# set default content layer to BODY as soon as we encounter a heading # set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
content_layer=self.content_layer, content_layer=self.content_layer,
) )
self.level += 1 self.level += 1
with self.use_inline_group(min_parts, doc): with self._use_inline_group(min_parts, doc):
for annotated_text in min_parts: for annotated_text in min_parts:
li_text = re.sub( li_text = re.sub(
r"\s+|\n+", " ", annotated_text.text r"\s+|\n+", " ", annotated_text.text
).strip() ).strip()
li_clean = HTMLDocumentBackend._clean_unicode(li_text) li_clean = HTMLDocumentBackend._clean_unicode(li_text)
doc.add_text( if annotated_text.code:
parent=self.parents[self.level], doc.add_code(
label=DocItemLabel.TEXT, parent=self.parents[self.level],
text=li_clean, text=li_clean,
content_layer=self.content_layer, content_layer=self.content_layer,
formatting=annotated_text.formatting, formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink, hyperlink=annotated_text.hyperlink,
) )
else:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=li_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
# 4) recurse into any nested lists, attaching them to this <li> item # 4) recurse into any nested lists, attaching them to this <li> item
for sublist in li({"ul", "ol"}, recursive=False): for sublist in li({"ul", "ol"}, recursive=False):
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text_list = self._extract_text_and_hyperlink_recursively( text_list = self._extract_text_and_hyperlink_recursively(
tag, find_parent_annotation=True tag, find_parent_annotation=True
) )
annotated_texts = text_list.simplify_text_elements() annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
for part in annotated_texts.split_by_newline(): for part in annotated_texts.split_by_newline():
with self.use_inline_group(part, doc): with self._use_inline_group(part, doc):
for annotated_text in part: for annotated_text in part:
if seg := annotated_text.text.strip(): if seg := annotated_text.text.strip():
seg_clean = HTMLDocumentBackend._clean_unicode(seg) seg_clean = HTMLDocumentBackend._clean_unicode(seg)
doc.add_text( if annotated_text.code:
parent=self.parents[self.level], doc.add_code(
label=DocItemLabel.TEXT, parent=self.parents[self.level],
text=seg_clean, text=seg_clean,
content_layer=self.content_layer, content_layer=self.content_layer,
formatting=annotated_text.formatting, formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink, hyperlink=annotated_text.hyperlink,
) )
else:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
for img_tag in tag("img"): for img_tag in tag("img"):
if isinstance(img_tag, Tag): if isinstance(img_tag, Tag):
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
content_layer=self.content_layer, content_layer=self.content_layer,
) )
elif tag_name in {"pre", "code"}: elif tag_name in {"pre"}:
# handle monospace code snippets (pre). # handle monospace code snippets (pre).
text_list = self._extract_text_and_hyperlink_recursively( text_list = self._extract_text_and_hyperlink_recursively(
tag, find_parent_annotation=True tag, find_parent_annotation=True, keep_newlines=True
) )
annotated_texts = text_list.simplify_text_elements() annotated_texts = text_list.simplify_text_elements()
with self.use_inline_group(annotated_texts, doc): with self._use_inline_group(annotated_texts, doc):
for annotated_text in annotated_texts: for annotated_text in annotated_texts:
text_clean = HTMLDocumentBackend._clean_unicode( text_clean = HTMLDocumentBackend._clean_unicode(
annotated_text.text.strip() annotated_text.text.strip()
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
hyperlink=annotated_text.hyperlink, hyperlink=annotated_text.hyperlink,
) )
elif tag_name in {"details", "footer"}: elif tag_name == "footer":
if tag_name == "footer": with self._use_footer(tag, doc):
current_layer = self.content_layer self._walk(tag, doc)
self.content_layer = ContentLayer.FURNITURE
self.parents[self.level + 1] = doc.add_group( elif tag_name == "details":
name=tag_name, with self._use_details(tag, doc):
label=GroupLabel.SECTION, self._walk(tag, doc)
parent=self.parents[self.level],
content_layer=self.content_layer,
)
self.level += 1
self._walk(tag, doc)
self.parents[self.level + 1] = None
self.level -= 1
if tag_name == "footer":
self.content_layer = current_layer
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None: def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
figure = img_tag.find_parent("figure") figure = img_tag.find_parent("figure")

View File

@@ -0,0 +1,39 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Code snippets
item-2 at level 2: inline: group group
item-3 at level 3: text: The Pythagorean theorem can be w ... tion relating the lengths of the sides
item-4 at level 3: text: a
item-5 at level 3: text: ,
item-6 at level 3: text: b
item-7 at level 3: text: and the hypotenuse
item-8 at level 3: text: c
item-9 at level 3: text: .
item-10 at level 2: inline: group group
item-11 at level 3: text: To use Docling, simply install
item-12 at level 3: code: docling
item-13 at level 3: text: from your package manager, e.g. pip:
item-14 at level 3: code: pip install docling
item-15 at level 2: inline: group group
item-16 at level 3: text: To convert individual documents with python, use
item-17 at level 3: code: convert()
item-18 at level 3: text: , for example:
item-19 at level 2: code: from docling.document_converter ... (result.document.export_to_markdown())
item-20 at level 2: inline: group group
item-21 at level 3: text: The program will output:
item-22 at level 3: code: ## Docling Technical Report[...]
item-23 at level 2: text: Prefetch the models:
item-24 at level 2: list: group list
item-25 at level 3: list_item:
item-26 at level 4: inline: group group
item-27 at level 5: text: Use the
item-28 at level 5: code: docling-tools models download
item-29 at level 5: text: utility:
item-30 at level 3: list_item:
item-31 at level 4: inline: group group
item-32 at level 5: text: Alternatively, models can be programmatically downloaded using
item-33 at level 5: code: docling.utils.model_downloader.download_models()
item-34 at level 5: text: .
item-35 at level 3: list_item:
item-36 at level 4: inline: group group
item-37 at level 5: text: Also, you can use download-hf-re ... rom HuggingFace by specifying repo id:
item-38 at level 5: code: $ docling-tools models download- ... 256M-preview model from HuggingFace...

View File

@@ -0,0 +1,674 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"name": "html_code_snippets",
"origin": {
"mimetype": "text/html",
"binary_hash": 8468578485215893920,
"filename": "html_code_snippets.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/17"
},
{
"$ref": "#/texts/18"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/24"
},
{
"$ref": "#/texts/28"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/texts/20"
},
"children": [
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/texts/24"
},
"children": [
{
"$ref": "#/texts/25"
},
{
"$ref": "#/texts/26"
},
{
"$ref": "#/texts/27"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/texts/28"
},
"children": [
{
"$ref": "#/texts/29"
},
{
"$ref": "#/texts/30"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "Code snippets in HTML",
"text": "Code snippets in HTML"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/texts/16"
},
{
"$ref": "#/groups/3"
},
{
"$ref": "#/texts/19"
},
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Code snippets",
"text": "Code snippets"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The Pythagorean theorem can be written as an equation relating the lengths of the sides",
"text": "The Pythagorean theorem can be written as an equation relating the lengths of the sides"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "a",
"text": "a",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ",",
"text": ","
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "b",
"text": "b",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "and the hypotenuse",
"text": "and the hypotenuse"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "c",
"text": "c",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "To use Docling, simply install",
"text": "To use Docling, simply install"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "docling",
"text": "docling",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "from your package manager, e.g. pip:",
"text": "from your package manager, e.g. pip:"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "pip install docling",
"text": "pip install docling",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "To convert individual documents with python, use",
"text": "To convert individual documents with python, use"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "convert()",
"text": "convert()",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ", for example:",
"text": ", for example:"
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
"text": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The program will output:",
"text": "The program will output:"
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "## Docling Technical Report[...]",
"text": "## Docling Technical Report[...]",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Prefetch the models:",
"text": "Prefetch the models:"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/groups/5"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Use the",
"text": "Use the"
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "docling-tools models download",
"text": "docling-tools models download",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "utility:",
"text": "utility:"
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/groups/6"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Alternatively, models can be programmatically downloaded using",
"text": "Alternatively, models can be programmatically downloaded using"
},
{
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "docling.utils.model_downloader.download_models()",
"text": "docling.utils.model_downloader.download_models()",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/27",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
},
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/groups/7"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:",
"text": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:"
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
"text": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,24 @@
# Code snippets
The Pythagorean theorem can be written as an equation relating the lengths of the sides *a* , *b* and the hypotenuse *c* .
To use Docling, simply install `docling` from your package manager, e.g. pip: `pip install docling`
To convert individual documents with python, use `convert()` , for example:
```
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869"
converter = DocumentConverter()
result = converter.convert(source)
print(result.document.export_to_markdown())
```
The program will output: `## Docling Technical Report[...]`
Prefetch the models:
- Use the `docling-tools models download` utility:
- Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()` .
- Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: `$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...`

41
tests/data/html/html_code_snippets.html vendored Normal file
View File

@@ -0,0 +1,41 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Code snippets in HTML</title>
</head>
<body>
<h1>Code snippets</h1>
<p>The Pythagorean theorem can be written as an equation relating the lengths of the sides <var>a</var>, <var>b</var> and the hypotenuse <var>c</var>.</p>
<p>To use Docling, simply install <code>docling</code>from your package manager, e.g. pip:
<kbd>pip install docling</kbd>
</p>
<p>To convert individual documents with python, use <code>convert()</code>, for example:</p>
<pre><code>
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869"
converter = DocumentConverter()
result = converter.convert(source)
print(result.document.export_to_markdown())
</code></pre>
<p>The program will output:
<samp>## Docling Technical Report[...]</samp>
</p>
<p>Prefetch the models:</p>
<ul>
<li>Use the <code>docling-tools models download</code> utility:</li>
<li>Alternatively, models can be programmatically downloaded using <samp>docling.utils.model_downloader.download_models()</samp>.</li>
<li>Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:
<pre><code>
$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
</code></pre>
<pre hidden><code>$ docling-tools</code></pre>
</li>
</ul>
</body>
</html>