From fa3327e1a6f7f4de6d5fd1f83588d6b5cac324ca Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Tue, 26 Aug 2025 06:43:48 +0200 Subject: [PATCH] fix(html): preserve code blocks in list items (#2131) * chore(html): refactor parser to leverage context managers Signed-off-by: Cesar Berrospi Ramis * fix(html): parse inline code snippets, also from list items Signed-off-by: Cesar Berrospi Ramis * chore(html): remove hidden tags Remove tags that are not meant to be displayed. Add regression tests for code blocks, inline code, and hidden tags. Signed-off-by: Cesar Berrospi Ramis --------- Signed-off-by: Cesar Berrospi Ramis --- docling/backend/html_backend.py | 248 +++++-- .../docling_v2/html_code_snippets.html.itxt | 39 + .../docling_v2/html_code_snippets.html.json | 674 ++++++++++++++++++ .../docling_v2/html_code_snippets.html.md | 24 + tests/data/html/html_code_snippets.html | 41 ++ 5 files changed, 950 insertions(+), 76 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/html_code_snippets.html.json create mode 100644 tests/data/groundtruth/docling_v2/html_code_snippets.html.md create mode 100644 tests/data/html/html_code_snippets.html diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index b24df93e..2334c645 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,5 +1,6 @@ import logging import re +import traceback from contextlib import contextmanager from copy import deepcopy from io import BytesIO @@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = { "h4", "h5", "h6", + "ol", "p", "pre", - "code", - "ul", - "ol", "summary", "table", + "ul", } +_CODE_TAG_SET: Final = {"code", "kbd", "samp"} + _FORMAT_TAG_MAP: Final = { "b": {"bold": True}, "strong": {"bold": True}, "i": {"italic": True}, "em": {"italic": True}, + "var": {"italic": True}, # "mark", # "small", "s": 
{"strikethrough": True}, @@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = { "ins": {"underline": True}, "sub": {"script": Script.SUB}, "sup": {"script": Script.SUPER}, + **{k: {} for k in _CODE_TAG_SET}, } @@ -79,6 +83,7 @@ class AnnotatedText(BaseModel): text: str hyperlink: Union[AnyUrl, Path, None] = None formatting: Union[Formatting, None] = None + code: bool = False class AnnotatedTextList(list): @@ -86,10 +91,12 @@ class AnnotatedTextList(list): current_h = None current_text = "" current_f = None + current_code = False for at in self: t = at.text h = at.hyperlink f = at.formatting + c = at.code current_text += t.strip() + " " if f is not None and current_f is None: current_f = f @@ -103,8 +110,13 @@ class AnnotatedTextList(list): _log.warning( f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'" ) + current_code = c if c else current_code + return AnnotatedText( - text=current_text.strip(), hyperlink=current_h, formatting=current_f + text=current_text.strip(), + hyperlink=current_h, + formatting=current_f, + code=current_code, ) def simplify_text_elements(self) -> "AnnotatedTextList": @@ -114,9 +126,14 @@ class AnnotatedTextList(list): text = self[0].text hyperlink = self[0].hyperlink formatting = self[0].formatting + code = self[0].code last_elm = text for i in range(1, len(self)): - if hyperlink == self[i].hyperlink and formatting == self[i].formatting: + if ( + hyperlink == self[i].hyperlink + and formatting == self[i].formatting + and code == self[i].code + ): sep = " " if not self[i].text.strip() or not last_elm.strip(): sep = "" @@ -124,15 +141,20 @@ class AnnotatedTextList(list): last_elm = self[i].text else: simplified.append( - AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting) + AnnotatedText( + text=text, hyperlink=hyperlink, formatting=formatting, code=code + ) ) text = self[i].text last_elm = text hyperlink = self[i].hyperlink formatting = self[i].formatting + code = self[i].code if text: simplified.append( - 
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting) + AnnotatedText( + text=text, hyperlink=hyperlink, formatting=formatting, code=code + ) ) return simplified @@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.ctx = _Context() for i in range(self.max_levels): self.parents[i] = None - self.hyperlink = None + self.hyperlink: Union[AnyUrl, Path, None] = None self.original_url = original_url self.format_tags: list[str] = [] @@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): orig=title_text, content_layer=ContentLayer.FURNITURE, ) - # remove scripts/styles + # remove script and style tags for tag in self.soup(["script", "style"]): tag.decompose() + # remove any hidden tag + for tag in self.soup(hidden=True): + tag.decompose() + content = self.soup.body or self.soup # normalize
tags for br in content("br"): @@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def flush_buffer(): if not buffer: return - annotated_text_list = buffer.simplify_text_elements() + annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements() parts = annotated_text_list.split_by_newline() buffer.clear() @@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return for annotated_text_list in parts: - with self.use_inline_group(annotated_text_list, doc): + with self._use_inline_group(annotated_text_list, doc): for annotated_text in annotated_text_list: if annotated_text.text.strip(): seg_clean = HTMLDocumentBackend._clean_unicode( annotated_text.text.strip() ) - doc.add_text( - parent=self.parents[self.level], - label=DocItemLabel.TEXT, - text=seg_clean, - content_layer=self.content_layer, - formatting=annotated_text.formatting, - hyperlink=annotated_text.hyperlink, - ) + if annotated_text.code: + doc.add_code( + parent=self.parents[self.level], + text=seg_clean, + content_layer=self.content_layer, + formatting=annotated_text.formatting, + hyperlink=annotated_text.hyperlink, + ) + else: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=seg_clean, + content_layer=self.content_layer, + formatting=annotated_text.formatting, + hyperlink=annotated_text.hyperlink, + ) for node in element.contents: if isinstance(node, Tag): @@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): flush_buffer() self._emit_image(node, doc) elif name in _FORMAT_TAG_MAP: - with self.use_format([name]): + with self._use_format([name]): self._walk(node, doc) elif name == "a": - with self.use_hyperlink(node): + with self._use_hyperlink(node): self._walk(node, doc) elif name in _BLOCK_TAGS: flush_buffer() @@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): this_parent = item.parent while this_parent is not None: if this_parent.name == "a" and 
this_parent.get("href"): - with self.use_format(format_tags): - with self.use_hyperlink(this_parent): + with self._use_format(format_tags): + with self._use_hyperlink(this_parent): return self._extract_text_and_hyperlink_recursively( item, ignore_list ) @@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if isinstance(item, NavigableString): text = item.strip() + code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET) if text: return AnnotatedTextList( [ @@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text=text, hyperlink=self.hyperlink, formatting=self._formatting, + code=code, ) ] ) @@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text="\n", hyperlink=self.hyperlink, formatting=self._formatting, + code=code, ) ] ) @@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if not ignore_list or (tag.name not in ["ul", "ol"]): for child in tag: if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP: - with self.use_format([child.name]): + with self._use_format([child.name]): result.extend( self._extract_text_and_hyperlink_recursively( child, ignore_list, keep_newlines=keep_newlines ) ) elif isinstance(child, Tag) and child.name == "a": - with self.use_hyperlink(child): + with self._use_hyperlink(child): result.extend( self._extract_text_and_hyperlink_recursively( child, ignore_list, keep_newlines=keep_newlines @@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return result @contextmanager - def use_hyperlink(self, tag): + def _use_hyperlink(self, tag: Tag): this_href = tag.get("href") if this_href is None: yield None else: - if this_href: - old_hyperlink = self.hyperlink + if isinstance(this_href, str) and this_href: + old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink + new_hyperlink: Union[AnyUrl, Path, None] = None if self.original_url is not None: - this_href = urljoin(self.original_url, this_href) + this_href = 
urljoin(str(self.original_url), str(this_href)) # ugly fix for relative links since pydantic does not support them. try: - AnyUrl(this_href) + new_hyperlink = AnyUrl(this_href) except ValidationError: - this_href = Path(this_href) - self.hyperlink = this_href + new_hyperlink = Path(this_href) + self.hyperlink = new_hyperlink try: yield None finally: - if this_href: + if new_hyperlink: self.hyperlink = old_hyperlink @contextmanager - def use_format(self, tags: list[str]): + def _use_format(self, tags: list[str]): if not tags: yield None else: @@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.format_tags = self.format_tags[: -len(tags)] @contextmanager - def use_inline_group( + def _use_inline_group( self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument ): """Create an inline group for annotated texts. @@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): Args: annotated_text_list (AnnotatedTextList): Annotated text doc (DoclingDocument): Currently used document - - Yields: - None: _description_ """ if len(annotated_text_list) > 1: inline_fmt = doc.add_group( @@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: yield None + @contextmanager + def _use_details(self, tag: Tag, doc: DoclingDocument): + """Create a group with the content of a details tag. + + While the context manager is active, the hierarchy level is set one + level higher than the current parent. + + Args: + tag: The details tag. + doc: Currently used document. + """ + self.parents[self.level + 1] = doc.add_group( + name=tag.name, + label=GroupLabel.SECTION, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + self.level += 1 + try: + yield None + finally: + self.parents[self.level + 1] = None + self.level -= 1 + + @contextmanager + def _use_footer(self, tag: Tag, doc: DoclingDocument): + """Create a group with a footer. + + Create a group with the content of a footer tag. 
While the context manager + is active, the hierarchy level is set one level higher than the current parent. + + Args: + tag: The footer tag. + doc: Currently used document. + """ + current_layer = self.content_layer + self.content_layer = ContentLayer.FURNITURE + self.parents[self.level + 1] = doc.add_group( + name=tag.name, + label=GroupLabel.SECTION, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + self.level += 1 + try: + yield None + finally: + self.parents[self.level + 1] = None + self.level -= 1 + self.content_layer = current_layer + def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None: tag_name = tag.name.lower() # set default content layer to BODY as soon as we encounter a heading @@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): content_layer=self.content_layer, ) self.level += 1 - with self.use_inline_group(min_parts, doc): + with self._use_inline_group(min_parts, doc): for annotated_text in min_parts: li_text = re.sub( r"\s+|\n+", " ", annotated_text.text ).strip() li_clean = HTMLDocumentBackend._clean_unicode(li_text) - doc.add_text( - parent=self.parents[self.level], - label=DocItemLabel.TEXT, - text=li_clean, - content_layer=self.content_layer, - formatting=annotated_text.formatting, - hyperlink=annotated_text.hyperlink, - ) + if annotated_text.code: + doc.add_code( + parent=self.parents[self.level], + text=li_clean, + content_layer=self.content_layer, + formatting=annotated_text.formatting, + hyperlink=annotated_text.hyperlink, + ) + else: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=li_clean, + content_layer=self.content_layer, + formatting=annotated_text.formatting, + hyperlink=annotated_text.hyperlink, + ) # 4) recurse into any nested lists, attaching them to this
  • item for sublist in li({"ul", "ol"}, recursive=False): @@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text_list = self._extract_text_and_hyperlink_recursively( tag, find_parent_annotation=True ) - annotated_texts = text_list.simplify_text_elements() + annotated_texts: AnnotatedTextList = text_list.simplify_text_elements() for part in annotated_texts.split_by_newline(): - with self.use_inline_group(part, doc): + with self._use_inline_group(part, doc): for annotated_text in part: if seg := annotated_text.text.strip(): seg_clean = HTMLDocumentBackend._clean_unicode(seg) - doc.add_text( - parent=self.parents[self.level], - label=DocItemLabel.TEXT, - text=seg_clean, - content_layer=self.content_layer, - formatting=annotated_text.formatting, - hyperlink=annotated_text.hyperlink, - ) + if annotated_text.code: + doc.add_code( + parent=self.parents[self.level], + text=seg_clean, + content_layer=self.content_layer, + formatting=annotated_text.formatting, + hyperlink=annotated_text.hyperlink, + ) + else: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=seg_clean, + content_layer=self.content_layer, + formatting=annotated_text.formatting, + hyperlink=annotated_text.hyperlink, + ) for img_tag in tag("img"): if isinstance(img_tag, Tag): @@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): content_layer=self.content_layer, ) - elif tag_name in {"pre", "code"}: + elif tag_name in {"pre"}: # handle monospace code snippets (pre). 
text_list = self._extract_text_and_hyperlink_recursively( - tag, find_parent_annotation=True + tag, find_parent_annotation=True, keep_newlines=True ) annotated_texts = text_list.simplify_text_elements() - with self.use_inline_group(annotated_texts, doc): + with self._use_inline_group(annotated_texts, doc): for annotated_text in annotated_texts: text_clean = HTMLDocumentBackend._clean_unicode( annotated_text.text.strip() @@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): hyperlink=annotated_text.hyperlink, ) - elif tag_name in {"details", "footer"}: - if tag_name == "footer": - current_layer = self.content_layer - self.content_layer = ContentLayer.FURNITURE - self.parents[self.level + 1] = doc.add_group( - name=tag_name, - label=GroupLabel.SECTION, - parent=self.parents[self.level], - content_layer=self.content_layer, - ) - self.level += 1 - self._walk(tag, doc) - self.parents[self.level + 1] = None - self.level -= 1 - if tag_name == "footer": - self.content_layer = current_layer + elif tag_name == "footer": + with self._use_footer(tag, doc): + self._walk(tag, doc) + + elif tag_name == "details": + with self._use_details(tag, doc): + self._walk(tag, doc) def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None: figure = img_tag.find_parent("figure") diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt b/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt new file mode 100644 index 00000000..77f11c65 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt @@ -0,0 +1,39 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Code snippets + item-2 at level 2: inline: group group + item-3 at level 3: text: The Pythagorean theorem can be w ... 
tion relating the lengths of the sides + item-4 at level 3: text: a + item-5 at level 3: text: , + item-6 at level 3: text: b + item-7 at level 3: text: and the hypotenuse + item-8 at level 3: text: c + item-9 at level 3: text: . + item-10 at level 2: inline: group group + item-11 at level 3: text: To use Docling, simply install + item-12 at level 3: code: docling + item-13 at level 3: text: from your package manager, e.g. pip: + item-14 at level 3: code: pip install docling + item-15 at level 2: inline: group group + item-16 at level 3: text: To convert individual documents with python, use + item-17 at level 3: code: convert() + item-18 at level 3: text: , for example: + item-19 at level 2: code: from docling.document_converter ... (result.document.export_to_markdown()) + item-20 at level 2: inline: group group + item-21 at level 3: text: The program will output: + item-22 at level 3: code: ## Docling Technical Report[...] + item-23 at level 2: text: Prefetch the models: + item-24 at level 2: list: group list + item-25 at level 3: list_item: + item-26 at level 4: inline: group group + item-27 at level 5: text: Use the + item-28 at level 5: code: docling-tools models download + item-29 at level 5: text: utility: + item-30 at level 3: list_item: + item-31 at level 4: inline: group group + item-32 at level 5: text: Alternatively, models can be programmatically downloaded using + item-33 at level 5: code: docling.utils.model_downloader.download_models() + item-34 at level 5: text: . + item-35 at level 3: list_item: + item-36 at level 4: inline: group group + item-37 at level 5: text: Also, you can use download-hf-re ... rom HuggingFace by specifying repo id: + item-38 at level 5: code: $ docling-tools models download- ... 256M-preview model from HuggingFace... 
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.json b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json new file mode 100644 index 00000000..aaa239b0 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json @@ -0,0 +1,674 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.5.0", + "name": "html_code_snippets", + "origin": { + "mimetype": "text/html", + "binary_hash": 8468578485215893920, + "filename": "html_code_snippets.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + } + ], + "content_layer": 
"body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/28" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/texts/20" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/texts/24" + }, + "children": [ + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/texts/28" + }, + "children": [ + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "furniture", + "label": "title", + "prov": [], + "orig": "Code snippets in HTML", + "text": "Code snippets in HTML" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/groups/3" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "Code snippets", + "text": "Code snippets" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The Pythagorean theorem can be written as an equation relating the lengths of the sides", + "text": 
"The Pythagorean theorem can be written as an equation relating the lengths of the sides" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "a", + "text": "a", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ",", + "text": "," + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "b", + "text": "b", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "and the hypotenuse", + "text": "and the hypotenuse" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "c", + "text": "c", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ".", + "text": "." 
+ }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "To use Docling, simply install", + "text": "To use Docling, simply install" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": "docling", + "text": "docling", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "from your package manager, e.g. pip:", + "text": "from your package manager, e.g. pip:" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": "pip install docling", + "text": "pip install docling", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "To convert individual documents with python, use", + "text": "To convert individual documents with python, use" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": "convert()", + "text": "convert()", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ", for example:", + "text": ", for example:" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "content_layer": "body", + 
"label": "code", + "prov": [], + "orig": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())", + "text": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The program will output:", + "text": "The program will output:" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": "## Docling Technical Report[...]", + "text": "## Docling Technical Report[...]", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Prefetch the models:", + "text": "Prefetch the models:" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/groups/5" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Use the", + "text": "Use the" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": 
"docling-tools models download", + "text": "docling-tools models download", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "utility:", + "text": "utility:" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/groups/6" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Alternatively, models can be programmatically downloaded using", + "text": "Alternatively, models can be programmatically downloaded using" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": "docling.utils.model_downloader.download_models()", + "text": "docling.utils.model_downloader.download_models()", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ".", + "text": "." 
+ }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/groups/7" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:", + "text": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:" + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "code", + "prov": [], + "orig": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...", + "text": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...", + "captions": [], + "references": [], + "footnotes": [], + "code_language": "unknown" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.md b/tests/data/groundtruth/docling_v2/html_code_snippets.html.md new file mode 100644 index 00000000..8228e042 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.md @@ -0,0 +1,24 @@ +# Code snippets + +The Pythagorean theorem can be written as an equation relating the lengths of the sides *a* , *b* and the hypotenuse *c* . + +To use Docling, simply install `docling` from your package manager, e.g. 
pip: `pip install docling` + +To convert individual documents with python, use `convert()` , for example: + +``` +from docling.document_converter import DocumentConverter + +source = "https://arxiv.org/pdf/2408.09869" +converter = DocumentConverter() +result = converter.convert(source) +print(result.document.export_to_markdown()) +``` + +The program will output: `## Docling Technical Report[...]` + +Prefetch the models: + +- Use the `docling-tools models download` utility: +- Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()` . +- Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: `$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...` \ No newline at end of file diff --git a/tests/data/html/html_code_snippets.html b/tests/data/html/html_code_snippets.html new file mode 100644 index 00000000..3858171e --- /dev/null +++ b/tests/data/html/html_code_snippets.html @@ -0,0 +1,41 @@ + + + + + Code snippets in HTML + + + +

    Code snippets

    + +

    The Pythagorean theorem can be written as an equation relating the lengths of the sides a, b and the hypotenuse c.

    +

    To use Docling, simply install doclingfrom your package manager, e.g. pip: + pip install docling +

    +

    To convert individual documents with python, use convert(), for example:

    +
    
    +from docling.document_converter import DocumentConverter
    +
    +source = "https://arxiv.org/pdf/2408.09869"
    +converter = DocumentConverter()
    +result = converter.convert(source)
    +print(result.document.export_to_markdown())
    +
    +

    The program will output: + ## Docling Technical Report[...] +

    + +

    Prefetch the models:

    +
      +
    • Use the docling-tools models download utility:
    • +
    • Alternatively, models can be programmatically downloaded using docling.utils.model_downloader.download_models().
    • +
    • Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: +
      
      +            $ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
      +            Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
      +        
      + +
    • +
    + +