mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(html): preserve code blocks in list items (#2131)
* chore(html): refactor parser to leverage context managers Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): parse inline code snippets, also from list items Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(html): remove hidden tags Remove tags that are not meant to be displayed. Add regression tests for code blocks, inline code, and hidden tags. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
c0268416cf
commit
fa3327e1a6
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
import traceback
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"ol",
|
||||
"p",
|
||||
"pre",
|
||||
"code",
|
||||
"ul",
|
||||
"ol",
|
||||
"summary",
|
||||
"table",
|
||||
"ul",
|
||||
}
|
||||
|
||||
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
|
||||
|
||||
_FORMAT_TAG_MAP: Final = {
|
||||
"b": {"bold": True},
|
||||
"strong": {"bold": True},
|
||||
"i": {"italic": True},
|
||||
"em": {"italic": True},
|
||||
"var": {"italic": True},
|
||||
# "mark",
|
||||
# "small",
|
||||
"s": {"strikethrough": True},
|
||||
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
|
||||
"ins": {"underline": True},
|
||||
"sub": {"script": Script.SUB},
|
||||
"sup": {"script": Script.SUPER},
|
||||
**{k: {} for k in _CODE_TAG_SET},
|
||||
}
|
||||
|
||||
|
||||
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
|
||||
text: str
|
||||
hyperlink: Union[AnyUrl, Path, None] = None
|
||||
formatting: Union[Formatting, None] = None
|
||||
code: bool = False
|
||||
|
||||
|
||||
class AnnotatedTextList(list):
|
||||
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
|
||||
current_h = None
|
||||
current_text = ""
|
||||
current_f = None
|
||||
current_code = False
|
||||
for at in self:
|
||||
t = at.text
|
||||
h = at.hyperlink
|
||||
f = at.formatting
|
||||
c = at.code
|
||||
current_text += t.strip() + " "
|
||||
if f is not None and current_f is None:
|
||||
current_f = f
|
||||
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
|
||||
_log.warning(
|
||||
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
||||
)
|
||||
current_code = c if c else current_code
|
||||
|
||||
return AnnotatedText(
|
||||
text=current_text.strip(), hyperlink=current_h, formatting=current_f
|
||||
text=current_text.strip(),
|
||||
hyperlink=current_h,
|
||||
formatting=current_f,
|
||||
code=current_code,
|
||||
)
|
||||
|
||||
def simplify_text_elements(self) -> "AnnotatedTextList":
|
||||
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
|
||||
text = self[0].text
|
||||
hyperlink = self[0].hyperlink
|
||||
formatting = self[0].formatting
|
||||
code = self[0].code
|
||||
last_elm = text
|
||||
for i in range(1, len(self)):
|
||||
if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
|
||||
if (
|
||||
hyperlink == self[i].hyperlink
|
||||
and formatting == self[i].formatting
|
||||
and code == self[i].code
|
||||
):
|
||||
sep = " "
|
||||
if not self[i].text.strip() or not last_elm.strip():
|
||||
sep = ""
|
||||
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
|
||||
last_elm = self[i].text
|
||||
else:
|
||||
simplified.append(
|
||||
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
|
||||
AnnotatedText(
|
||||
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
||||
)
|
||||
)
|
||||
text = self[i].text
|
||||
last_elm = text
|
||||
hyperlink = self[i].hyperlink
|
||||
formatting = self[i].formatting
|
||||
code = self[i].code
|
||||
if text:
|
||||
simplified.append(
|
||||
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
|
||||
AnnotatedText(
|
||||
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
||||
)
|
||||
)
|
||||
return simplified
|
||||
|
||||
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.ctx = _Context()
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
self.hyperlink = None
|
||||
self.hyperlink: Union[AnyUrl, Path, None] = None
|
||||
self.original_url = original_url
|
||||
self.format_tags: list[str] = []
|
||||
|
||||
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
orig=title_text,
|
||||
content_layer=ContentLayer.FURNITURE,
|
||||
)
|
||||
# remove scripts/styles
|
||||
# remove script and style tags
|
||||
for tag in self.soup(["script", "style"]):
|
||||
tag.decompose()
|
||||
# remove any hidden tag
|
||||
for tag in self.soup(hidden=True):
|
||||
tag.decompose()
|
||||
|
||||
content = self.soup.body or self.soup
|
||||
# normalize <br> tags
|
||||
for br in content("br"):
|
||||
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def flush_buffer():
|
||||
if not buffer:
|
||||
return
|
||||
annotated_text_list = buffer.simplify_text_elements()
|
||||
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
|
||||
parts = annotated_text_list.split_by_newline()
|
||||
buffer.clear()
|
||||
|
||||
@@ -276,12 +302,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
for annotated_text_list in parts:
|
||||
with self.use_inline_group(annotated_text_list, doc):
|
||||
with self._use_inline_group(annotated_text_list, doc):
|
||||
for annotated_text in annotated_text_list:
|
||||
if annotated_text.text.strip():
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(
|
||||
annotated_text.text.strip()
|
||||
)
|
||||
if annotated_text.code:
|
||||
doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=seg_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
else:
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
flush_buffer()
|
||||
self._emit_image(node, doc)
|
||||
elif name in _FORMAT_TAG_MAP:
|
||||
with self.use_format([name]):
|
||||
with self._use_format([name]):
|
||||
self._walk(node, doc)
|
||||
elif name == "a":
|
||||
with self.use_hyperlink(node):
|
||||
with self._use_hyperlink(node):
|
||||
self._walk(node, doc)
|
||||
elif name in _BLOCK_TAGS:
|
||||
flush_buffer()
|
||||
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
this_parent = item.parent
|
||||
while this_parent is not None:
|
||||
if this_parent.name == "a" and this_parent.get("href"):
|
||||
with self.use_format(format_tags):
|
||||
with self.use_hyperlink(this_parent):
|
||||
with self._use_format(format_tags):
|
||||
with self._use_hyperlink(this_parent):
|
||||
return self._extract_text_and_hyperlink_recursively(
|
||||
item, ignore_list
|
||||
)
|
||||
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
if isinstance(item, NavigableString):
|
||||
text = item.strip()
|
||||
code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
|
||||
if text:
|
||||
return AnnotatedTextList(
|
||||
[
|
||||
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text=text,
|
||||
hyperlink=self.hyperlink,
|
||||
formatting=self._formatting,
|
||||
code=code,
|
||||
)
|
||||
]
|
||||
)
|
||||
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text="\n",
|
||||
hyperlink=self.hyperlink,
|
||||
formatting=self._formatting,
|
||||
code=code,
|
||||
)
|
||||
]
|
||||
)
|
||||
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
||||
for child in tag:
|
||||
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
|
||||
with self.use_format([child.name]):
|
||||
with self._use_format([child.name]):
|
||||
result.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
child, ignore_list, keep_newlines=keep_newlines
|
||||
)
|
||||
)
|
||||
elif isinstance(child, Tag) and child.name == "a":
|
||||
with self.use_hyperlink(child):
|
||||
with self._use_hyperlink(child):
|
||||
result.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
child, ignore_list, keep_newlines=keep_newlines
|
||||
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return result
|
||||
|
||||
@contextmanager
|
||||
def use_hyperlink(self, tag):
|
||||
def _use_hyperlink(self, tag: Tag):
|
||||
this_href = tag.get("href")
|
||||
if this_href is None:
|
||||
yield None
|
||||
else:
|
||||
if this_href:
|
||||
old_hyperlink = self.hyperlink
|
||||
if isinstance(this_href, str) and this_href:
|
||||
old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
|
||||
new_hyperlink: Union[AnyUrl, Path, None] = None
|
||||
if self.original_url is not None:
|
||||
this_href = urljoin(self.original_url, this_href)
|
||||
this_href = urljoin(str(self.original_url), str(this_href))
|
||||
# ugly fix for relative links since pydantic does not support them.
|
||||
try:
|
||||
AnyUrl(this_href)
|
||||
new_hyperlink = AnyUrl(this_href)
|
||||
except ValidationError:
|
||||
this_href = Path(this_href)
|
||||
self.hyperlink = this_href
|
||||
new_hyperlink = Path(this_href)
|
||||
self.hyperlink = new_hyperlink
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
if this_href:
|
||||
if new_hyperlink:
|
||||
self.hyperlink = old_hyperlink
|
||||
|
||||
@contextmanager
|
||||
def use_format(self, tags: list[str]):
|
||||
def _use_format(self, tags: list[str]):
|
||||
if not tags:
|
||||
yield None
|
||||
else:
|
||||
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.format_tags = self.format_tags[: -len(tags)]
|
||||
|
||||
@contextmanager
|
||||
def use_inline_group(
|
||||
def _use_inline_group(
|
||||
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
||||
):
|
||||
"""Create an inline group for annotated texts.
|
||||
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
Args:
|
||||
annotated_text_list (AnnotatedTextList): Annotated text
|
||||
doc (DoclingDocument): Currently used document
|
||||
|
||||
Yields:
|
||||
None: _description_
|
||||
"""
|
||||
if len(annotated_text_list) > 1:
|
||||
inline_fmt = doc.add_group(
|
||||
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
yield None
|
||||
|
||||
@contextmanager
|
||||
def _use_details(self, tag: Tag, doc: DoclingDocument):
|
||||
"""Create a group with the content of a details tag.
|
||||
|
||||
While the context manager is active, the hierarchy level is set one
|
||||
level higher than the current parent.
|
||||
|
||||
Args:
|
||||
tag: The details tag.
|
||||
doc: Currently used document.
|
||||
"""
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
name=tag.name,
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
|
||||
@contextmanager
|
||||
def _use_footer(self, tag: Tag, doc: DoclingDocument):
|
||||
"""Create a group with a footer.
|
||||
|
||||
Create a group with the content of a footer tag. While the context manager
|
||||
is active, the hierarchy level is set one level higher than the current parent.
|
||||
|
||||
Args:
|
||||
tag: The footer tag.
|
||||
doc: Currently used document.
|
||||
"""
|
||||
current_layer = self.content_layer
|
||||
self.content_layer = ContentLayer.FURNITURE
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
name=tag.name,
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
self.content_layer = current_layer
|
||||
|
||||
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
tag_name = tag.name.lower()
|
||||
# set default content layer to BODY as soon as we encounter a heading
|
||||
@@ -611,12 +698,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
with self.use_inline_group(min_parts, doc):
|
||||
with self._use_inline_group(min_parts, doc):
|
||||
for annotated_text in min_parts:
|
||||
li_text = re.sub(
|
||||
r"\s+|\n+", " ", annotated_text.text
|
||||
).strip()
|
||||
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
||||
if annotated_text.code:
|
||||
doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=li_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
else:
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
@@ -687,12 +783,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text_list = self._extract_text_and_hyperlink_recursively(
|
||||
tag, find_parent_annotation=True
|
||||
)
|
||||
annotated_texts = text_list.simplify_text_elements()
|
||||
annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
|
||||
for part in annotated_texts.split_by_newline():
|
||||
with self.use_inline_group(part, doc):
|
||||
with self._use_inline_group(part, doc):
|
||||
for annotated_text in part:
|
||||
if seg := annotated_text.text.strip():
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
||||
if annotated_text.code:
|
||||
doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=seg_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
else:
|
||||
doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
elif tag_name in {"pre", "code"}:
|
||||
elif tag_name in {"pre"}:
|
||||
# handle monospace code snippets (pre).
|
||||
text_list = self._extract_text_and_hyperlink_recursively(
|
||||
tag, find_parent_annotation=True
|
||||
tag, find_parent_annotation=True, keep_newlines=True
|
||||
)
|
||||
annotated_texts = text_list.simplify_text_elements()
|
||||
with self.use_inline_group(annotated_texts, doc):
|
||||
with self._use_inline_group(annotated_texts, doc):
|
||||
for annotated_text in annotated_texts:
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(
|
||||
annotated_text.text.strip()
|
||||
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
|
||||
elif tag_name in {"details", "footer"}:
|
||||
if tag_name == "footer":
|
||||
current_layer = self.content_layer
|
||||
self.content_layer = ContentLayer.FURNITURE
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
name=tag_name,
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
self.level += 1
|
||||
elif tag_name == "footer":
|
||||
with self._use_footer(tag, doc):
|
||||
self._walk(tag, doc)
|
||||
|
||||
elif tag_name == "details":
|
||||
with self._use_details(tag, doc):
|
||||
self._walk(tag, doc)
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
if tag_name == "footer":
|
||||
self.content_layer = current_layer
|
||||
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
||||
figure = img_tag.find_parent("figure")
|
||||
|
||||
39
tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
vendored
Normal file
39
tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Code snippets
|
||||
item-2 at level 2: inline: group group
|
||||
item-3 at level 3: text: The Pythagorean theorem can be w ... tion relating the lengths of the sides
|
||||
item-4 at level 3: text: a
|
||||
item-5 at level 3: text: ,
|
||||
item-6 at level 3: text: b
|
||||
item-7 at level 3: text: and the hypotenuse
|
||||
item-8 at level 3: text: c
|
||||
item-9 at level 3: text: .
|
||||
item-10 at level 2: inline: group group
|
||||
item-11 at level 3: text: To use Docling, simply install
|
||||
item-12 at level 3: code: docling
|
||||
item-13 at level 3: text: from your package manager, e.g. pip:
|
||||
item-14 at level 3: code: pip install docling
|
||||
item-15 at level 2: inline: group group
|
||||
item-16 at level 3: text: To convert individual documents with python, use
|
||||
item-17 at level 3: code: convert()
|
||||
item-18 at level 3: text: , for example:
|
||||
item-19 at level 2: code: from docling.document_converter ... (result.document.export_to_markdown())
|
||||
item-20 at level 2: inline: group group
|
||||
item-21 at level 3: text: The program will output:
|
||||
item-22 at level 3: code: ## Docling Technical Report[...]
|
||||
item-23 at level 2: text: Prefetch the models:
|
||||
item-24 at level 2: list: group list
|
||||
item-25 at level 3: list_item:
|
||||
item-26 at level 4: inline: group group
|
||||
item-27 at level 5: text: Use the
|
||||
item-28 at level 5: code: docling-tools models download
|
||||
item-29 at level 5: text: utility:
|
||||
item-30 at level 3: list_item:
|
||||
item-31 at level 4: inline: group group
|
||||
item-32 at level 5: text: Alternatively, models can be programmatically downloaded using
|
||||
item-33 at level 5: code: docling.utils.model_downloader.download_models()
|
||||
item-34 at level 5: text: .
|
||||
item-35 at level 3: list_item:
|
||||
item-36 at level 4: inline: group group
|
||||
item-37 at level 5: text: Also, you can use download-hf-re ... rom HuggingFace by specifying repo id:
|
||||
item-38 at level 5: code: $ docling-tools models download- ... 256M-preview model from HuggingFace...
|
||||
674
tests/data/groundtruth/docling_v2/html_code_snippets.html.json
vendored
Normal file
674
tests/data/groundtruth/docling_v2/html_code_snippets.html.json
vendored
Normal file
@@ -0,0 +1,674 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"name": "html_code_snippets",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 8468578485215893920,
|
||||
"filename": "html_code_snippets.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/13"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/14"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/15"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/3",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/18"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/24"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/28"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/5",
|
||||
"parent": {
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/22"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/6",
|
||||
"parent": {
|
||||
"$ref": "#/texts/24"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/25"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/26"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/27"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/7",
|
||||
"parent": {
|
||||
"$ref": "#/texts/28"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/29"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/30"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Code snippets in HTML",
|
||||
"text": "Code snippets in HTML"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Code snippets",
|
||||
"text": "Code snippets"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "The Pythagorean theorem can be written as an equation relating the lengths of the sides",
|
||||
"text": "The Pythagorean theorem can be written as an equation relating the lengths of the sides"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "a",
|
||||
"text": "a",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ",",
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "b",
|
||||
"text": "b",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "and the hypotenuse",
|
||||
"text": "and the hypotenuse"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "c",
|
||||
"text": "c",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ".",
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "To use Docling, simply install",
|
||||
"text": "To use Docling, simply install"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "docling",
|
||||
"text": "docling",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "from your package manager, e.g. pip:",
|
||||
"text": "from your package manager, e.g. pip:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "pip install docling",
|
||||
"text": "pip install docling",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/13",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "To convert individual documents with python, use",
|
||||
"text": "To convert individual documents with python, use"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/14",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "convert()",
|
||||
"text": "convert()",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/15",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ", for example:",
|
||||
"text": ", for example:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
|
||||
"text": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/17",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "The program will output:",
|
||||
"text": "The program will output:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "## Docling Technical Report[...]",
|
||||
"text": "## Docling Technical Report[...]",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Prefetch the models:",
|
||||
"text": "Prefetch the models:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Use the",
|
||||
"text": "Use the"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/22",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "docling-tools models download",
|
||||
"text": "docling-tools models download",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "utility:",
|
||||
"text": "utility:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/24",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/25",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Alternatively, models can be programmatically downloaded using",
|
||||
"text": "Alternatively, models can be programmatically downloaded using"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/26",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "docling.utils.model_downloader.download_models()",
|
||||
"text": "docling.utils.model_downloader.download_models()",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/27",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ".",
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/28",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/7"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/29",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:",
|
||||
"text": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/30",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
|
||||
"text": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
24
tests/data/groundtruth/docling_v2/html_code_snippets.html.md
vendored
Normal file
24
tests/data/groundtruth/docling_v2/html_code_snippets.html.md
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# Code snippets
|
||||
|
||||
The Pythagorean theorem can be written as an equation relating the lengths of the sides *a* , *b* and the hypotenuse *c* .
|
||||
|
||||
To use Docling, simply install `docling` from your package manager, e.g. pip: `pip install docling`
|
||||
|
||||
To convert individual documents with python, use `convert()` , for example:
|
||||
|
||||
```
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869"
|
||||
converter = DocumentConverter()
|
||||
result = converter.convert(source)
|
||||
print(result.document.export_to_markdown())
|
||||
```
|
||||
|
||||
The program will output: `## Docling Technical Report[...]`
|
||||
|
||||
Prefetch the models:
|
||||
|
||||
- Use the `docling-tools models download` utility:
|
||||
- Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()` .
|
||||
- Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: `$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...`
|
||||
41
tests/data/html/html_code_snippets.html
vendored
Normal file
41
tests/data/html/html_code_snippets.html
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Code snippets in HTML</title>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h1>Code snippets</h1>
|
||||
|
||||
<p>The Pythagorean theorem can be written as an equation relating the lengths of the sides <var>a</var>, <var>b</var> and the hypotenuse <var>c</var>.</p>
|
||||
<p>To use Docling, simply install <code>docling</code>from your package manager, e.g. pip:
|
||||
<kbd>pip install docling</kbd>
|
||||
</p>
|
||||
<p>To convert individual documents with python, use <code>convert()</code>, for example:</p>
|
||||
<pre><code>
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869"
|
||||
converter = DocumentConverter()
|
||||
result = converter.convert(source)
|
||||
print(result.document.export_to_markdown())
|
||||
</code></pre>
|
||||
<p>The program will output:
|
||||
<samp>## Docling Technical Report[...]</samp>
|
||||
</p>
|
||||
|
||||
<p>Prefetch the models:</p>
|
||||
<ul>
|
||||
<li>Use the <code>docling-tools models download</code> utility:</li>
|
||||
<li>Alternatively, models can be programmatically downloaded using <samp>docling.utils.model_downloader.download_models()</samp>.</li>
|
||||
<li>Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:
|
||||
<pre><code>
|
||||
$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
|
||||
Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
|
||||
</code></pre>
|
||||
<pre hidden><code>$ docling-tools</code></pre>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user