mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(html): preserve code blocks in list items (#2131)
* chore(html): refactor parser to leverage context managers Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(html): parse inline code snippets, also from list items Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(html): remove hidden tags Remove tags that are not meant to be displayed. Add regression tests for code blocks, inline code, and hidden tags. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
c0268416cf
commit
fa3327e1a6
@@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import traceback
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
|
|||||||
"h4",
|
"h4",
|
||||||
"h5",
|
"h5",
|
||||||
"h6",
|
"h6",
|
||||||
|
"ol",
|
||||||
"p",
|
"p",
|
||||||
"pre",
|
"pre",
|
||||||
"code",
|
|
||||||
"ul",
|
|
||||||
"ol",
|
|
||||||
"summary",
|
"summary",
|
||||||
"table",
|
"table",
|
||||||
|
"ul",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
|
||||||
|
|
||||||
_FORMAT_TAG_MAP: Final = {
|
_FORMAT_TAG_MAP: Final = {
|
||||||
"b": {"bold": True},
|
"b": {"bold": True},
|
||||||
"strong": {"bold": True},
|
"strong": {"bold": True},
|
||||||
"i": {"italic": True},
|
"i": {"italic": True},
|
||||||
"em": {"italic": True},
|
"em": {"italic": True},
|
||||||
|
"var": {"italic": True},
|
||||||
# "mark",
|
# "mark",
|
||||||
# "small",
|
# "small",
|
||||||
"s": {"strikethrough": True},
|
"s": {"strikethrough": True},
|
||||||
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
|
|||||||
"ins": {"underline": True},
|
"ins": {"underline": True},
|
||||||
"sub": {"script": Script.SUB},
|
"sub": {"script": Script.SUB},
|
||||||
"sup": {"script": Script.SUPER},
|
"sup": {"script": Script.SUPER},
|
||||||
|
**{k: {} for k in _CODE_TAG_SET},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
|
|||||||
text: str
|
text: str
|
||||||
hyperlink: Union[AnyUrl, Path, None] = None
|
hyperlink: Union[AnyUrl, Path, None] = None
|
||||||
formatting: Union[Formatting, None] = None
|
formatting: Union[Formatting, None] = None
|
||||||
|
code: bool = False
|
||||||
|
|
||||||
|
|
||||||
class AnnotatedTextList(list):
|
class AnnotatedTextList(list):
|
||||||
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
|
|||||||
current_h = None
|
current_h = None
|
||||||
current_text = ""
|
current_text = ""
|
||||||
current_f = None
|
current_f = None
|
||||||
|
current_code = False
|
||||||
for at in self:
|
for at in self:
|
||||||
t = at.text
|
t = at.text
|
||||||
h = at.hyperlink
|
h = at.hyperlink
|
||||||
f = at.formatting
|
f = at.formatting
|
||||||
|
c = at.code
|
||||||
current_text += t.strip() + " "
|
current_text += t.strip() + " "
|
||||||
if f is not None and current_f is None:
|
if f is not None and current_f is None:
|
||||||
current_f = f
|
current_f = f
|
||||||
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
|
|||||||
_log.warning(
|
_log.warning(
|
||||||
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
||||||
)
|
)
|
||||||
|
current_code = c if c else current_code
|
||||||
|
|
||||||
return AnnotatedText(
|
return AnnotatedText(
|
||||||
text=current_text.strip(), hyperlink=current_h, formatting=current_f
|
text=current_text.strip(),
|
||||||
|
hyperlink=current_h,
|
||||||
|
formatting=current_f,
|
||||||
|
code=current_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
def simplify_text_elements(self) -> "AnnotatedTextList":
|
def simplify_text_elements(self) -> "AnnotatedTextList":
|
||||||
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
|
|||||||
text = self[0].text
|
text = self[0].text
|
||||||
hyperlink = self[0].hyperlink
|
hyperlink = self[0].hyperlink
|
||||||
formatting = self[0].formatting
|
formatting = self[0].formatting
|
||||||
|
code = self[0].code
|
||||||
last_elm = text
|
last_elm = text
|
||||||
for i in range(1, len(self)):
|
for i in range(1, len(self)):
|
||||||
if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
|
if (
|
||||||
|
hyperlink == self[i].hyperlink
|
||||||
|
and formatting == self[i].formatting
|
||||||
|
and code == self[i].code
|
||||||
|
):
|
||||||
sep = " "
|
sep = " "
|
||||||
if not self[i].text.strip() or not last_elm.strip():
|
if not self[i].text.strip() or not last_elm.strip():
|
||||||
sep = ""
|
sep = ""
|
||||||
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
|
|||||||
last_elm = self[i].text
|
last_elm = self[i].text
|
||||||
else:
|
else:
|
||||||
simplified.append(
|
simplified.append(
|
||||||
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
|
AnnotatedText(
|
||||||
|
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
||||||
|
)
|
||||||
)
|
)
|
||||||
text = self[i].text
|
text = self[i].text
|
||||||
last_elm = text
|
last_elm = text
|
||||||
hyperlink = self[i].hyperlink
|
hyperlink = self[i].hyperlink
|
||||||
formatting = self[i].formatting
|
formatting = self[i].formatting
|
||||||
|
code = self[i].code
|
||||||
if text:
|
if text:
|
||||||
simplified.append(
|
simplified.append(
|
||||||
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
|
AnnotatedText(
|
||||||
|
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
||||||
|
)
|
||||||
)
|
)
|
||||||
return simplified
|
return simplified
|
||||||
|
|
||||||
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.ctx = _Context()
|
self.ctx = _Context()
|
||||||
for i in range(self.max_levels):
|
for i in range(self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
self.hyperlink = None
|
self.hyperlink: Union[AnyUrl, Path, None] = None
|
||||||
self.original_url = original_url
|
self.original_url = original_url
|
||||||
self.format_tags: list[str] = []
|
self.format_tags: list[str] = []
|
||||||
|
|
||||||
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
orig=title_text,
|
orig=title_text,
|
||||||
content_layer=ContentLayer.FURNITURE,
|
content_layer=ContentLayer.FURNITURE,
|
||||||
)
|
)
|
||||||
# remove scripts/styles
|
# remove script and style tags
|
||||||
for tag in self.soup(["script", "style"]):
|
for tag in self.soup(["script", "style"]):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
# remove any hidden tag
|
||||||
|
for tag in self.soup(hidden=True):
|
||||||
|
tag.decompose()
|
||||||
|
|
||||||
content = self.soup.body or self.soup
|
content = self.soup.body or self.soup
|
||||||
# normalize <br> tags
|
# normalize <br> tags
|
||||||
for br in content("br"):
|
for br in content("br"):
|
||||||
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def flush_buffer():
|
def flush_buffer():
|
||||||
if not buffer:
|
if not buffer:
|
||||||
return
|
return
|
||||||
annotated_text_list = buffer.simplify_text_elements()
|
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
|
||||||
parts = annotated_text_list.split_by_newline()
|
parts = annotated_text_list.split_by_newline()
|
||||||
buffer.clear()
|
buffer.clear()
|
||||||
|
|
||||||
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return
|
return
|
||||||
|
|
||||||
for annotated_text_list in parts:
|
for annotated_text_list in parts:
|
||||||
with self.use_inline_group(annotated_text_list, doc):
|
with self._use_inline_group(annotated_text_list, doc):
|
||||||
for annotated_text in annotated_text_list:
|
for annotated_text in annotated_text_list:
|
||||||
if annotated_text.text.strip():
|
if annotated_text.text.strip():
|
||||||
seg_clean = HTMLDocumentBackend._clean_unicode(
|
seg_clean = HTMLDocumentBackend._clean_unicode(
|
||||||
annotated_text.text.strip()
|
annotated_text.text.strip()
|
||||||
)
|
)
|
||||||
doc.add_text(
|
if annotated_text.code:
|
||||||
parent=self.parents[self.level],
|
doc.add_code(
|
||||||
label=DocItemLabel.TEXT,
|
parent=self.parents[self.level],
|
||||||
text=seg_clean,
|
text=seg_clean,
|
||||||
content_layer=self.content_layer,
|
content_layer=self.content_layer,
|
||||||
formatting=annotated_text.formatting,
|
formatting=annotated_text.formatting,
|
||||||
hyperlink=annotated_text.hyperlink,
|
hyperlink=annotated_text.hyperlink,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
doc.add_text(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
label=DocItemLabel.TEXT,
|
||||||
|
text=seg_clean,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
formatting=annotated_text.formatting,
|
||||||
|
hyperlink=annotated_text.hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
for node in element.contents:
|
for node in element.contents:
|
||||||
if isinstance(node, Tag):
|
if isinstance(node, Tag):
|
||||||
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
flush_buffer()
|
flush_buffer()
|
||||||
self._emit_image(node, doc)
|
self._emit_image(node, doc)
|
||||||
elif name in _FORMAT_TAG_MAP:
|
elif name in _FORMAT_TAG_MAP:
|
||||||
with self.use_format([name]):
|
with self._use_format([name]):
|
||||||
self._walk(node, doc)
|
self._walk(node, doc)
|
||||||
elif name == "a":
|
elif name == "a":
|
||||||
with self.use_hyperlink(node):
|
with self._use_hyperlink(node):
|
||||||
self._walk(node, doc)
|
self._walk(node, doc)
|
||||||
elif name in _BLOCK_TAGS:
|
elif name in _BLOCK_TAGS:
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
this_parent = item.parent
|
this_parent = item.parent
|
||||||
while this_parent is not None:
|
while this_parent is not None:
|
||||||
if this_parent.name == "a" and this_parent.get("href"):
|
if this_parent.name == "a" and this_parent.get("href"):
|
||||||
with self.use_format(format_tags):
|
with self._use_format(format_tags):
|
||||||
with self.use_hyperlink(this_parent):
|
with self._use_hyperlink(this_parent):
|
||||||
return self._extract_text_and_hyperlink_recursively(
|
return self._extract_text_and_hyperlink_recursively(
|
||||||
item, ignore_list
|
item, ignore_list
|
||||||
)
|
)
|
||||||
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
if isinstance(item, NavigableString):
|
if isinstance(item, NavigableString):
|
||||||
text = item.strip()
|
text = item.strip()
|
||||||
|
code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
|
||||||
if text:
|
if text:
|
||||||
return AnnotatedTextList(
|
return AnnotatedTextList(
|
||||||
[
|
[
|
||||||
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text=text,
|
text=text,
|
||||||
hyperlink=self.hyperlink,
|
hyperlink=self.hyperlink,
|
||||||
formatting=self._formatting,
|
formatting=self._formatting,
|
||||||
|
code=code,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text="\n",
|
text="\n",
|
||||||
hyperlink=self.hyperlink,
|
hyperlink=self.hyperlink,
|
||||||
formatting=self._formatting,
|
formatting=self._formatting,
|
||||||
|
code=code,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
||||||
for child in tag:
|
for child in tag:
|
||||||
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
|
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
|
||||||
with self.use_format([child.name]):
|
with self._use_format([child.name]):
|
||||||
result.extend(
|
result.extend(
|
||||||
self._extract_text_and_hyperlink_recursively(
|
self._extract_text_and_hyperlink_recursively(
|
||||||
child, ignore_list, keep_newlines=keep_newlines
|
child, ignore_list, keep_newlines=keep_newlines
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
elif isinstance(child, Tag) and child.name == "a":
|
elif isinstance(child, Tag) and child.name == "a":
|
||||||
with self.use_hyperlink(child):
|
with self._use_hyperlink(child):
|
||||||
result.extend(
|
result.extend(
|
||||||
self._extract_text_and_hyperlink_recursively(
|
self._extract_text_and_hyperlink_recursively(
|
||||||
child, ignore_list, keep_newlines=keep_newlines
|
child, ignore_list, keep_newlines=keep_newlines
|
||||||
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_hyperlink(self, tag):
|
def _use_hyperlink(self, tag: Tag):
|
||||||
this_href = tag.get("href")
|
this_href = tag.get("href")
|
||||||
if this_href is None:
|
if this_href is None:
|
||||||
yield None
|
yield None
|
||||||
else:
|
else:
|
||||||
if this_href:
|
if isinstance(this_href, str) and this_href:
|
||||||
old_hyperlink = self.hyperlink
|
old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
|
||||||
|
new_hyperlink: Union[AnyUrl, Path, None] = None
|
||||||
if self.original_url is not None:
|
if self.original_url is not None:
|
||||||
this_href = urljoin(self.original_url, this_href)
|
this_href = urljoin(str(self.original_url), str(this_href))
|
||||||
# ugly fix for relative links since pydantic does not support them.
|
# ugly fix for relative links since pydantic does not support them.
|
||||||
try:
|
try:
|
||||||
AnyUrl(this_href)
|
new_hyperlink = AnyUrl(this_href)
|
||||||
except ValidationError:
|
except ValidationError:
|
||||||
this_href = Path(this_href)
|
new_hyperlink = Path(this_href)
|
||||||
self.hyperlink = this_href
|
self.hyperlink = new_hyperlink
|
||||||
try:
|
try:
|
||||||
yield None
|
yield None
|
||||||
finally:
|
finally:
|
||||||
if this_href:
|
if new_hyperlink:
|
||||||
self.hyperlink = old_hyperlink
|
self.hyperlink = old_hyperlink
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_format(self, tags: list[str]):
|
def _use_format(self, tags: list[str]):
|
||||||
if not tags:
|
if not tags:
|
||||||
yield None
|
yield None
|
||||||
else:
|
else:
|
||||||
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.format_tags = self.format_tags[: -len(tags)]
|
self.format_tags = self.format_tags[: -len(tags)]
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_inline_group(
|
def _use_inline_group(
|
||||||
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
||||||
):
|
):
|
||||||
"""Create an inline group for annotated texts.
|
"""Create an inline group for annotated texts.
|
||||||
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
Args:
|
Args:
|
||||||
annotated_text_list (AnnotatedTextList): Annotated text
|
annotated_text_list (AnnotatedTextList): Annotated text
|
||||||
doc (DoclingDocument): Currently used document
|
doc (DoclingDocument): Currently used document
|
||||||
|
|
||||||
Yields:
|
|
||||||
None: _description_
|
|
||||||
"""
|
"""
|
||||||
if len(annotated_text_list) > 1:
|
if len(annotated_text_list) > 1:
|
||||||
inline_fmt = doc.add_group(
|
inline_fmt = doc.add_group(
|
||||||
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
yield None
|
yield None
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _use_details(self, tag: Tag, doc: DoclingDocument):
|
||||||
|
"""Create a group with the content of a details tag.
|
||||||
|
|
||||||
|
While the context manager is active, the hierarchy level is set one
|
||||||
|
level higher than the current parent.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tag: The details tag.
|
||||||
|
doc: Currently used document.
|
||||||
|
"""
|
||||||
|
self.parents[self.level + 1] = doc.add_group(
|
||||||
|
name=tag.name,
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
self.level += 1
|
||||||
|
try:
|
||||||
|
yield None
|
||||||
|
finally:
|
||||||
|
self.parents[self.level + 1] = None
|
||||||
|
self.level -= 1
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _use_footer(self, tag: Tag, doc: DoclingDocument):
|
||||||
|
"""Create a group with a footer.
|
||||||
|
|
||||||
|
Create a group with the content of a footer tag. While the context manager
|
||||||
|
is active, the hierarchy level is set one level higher than the current parent.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tag: The footer tag.
|
||||||
|
doc: Currently used document.
|
||||||
|
"""
|
||||||
|
current_layer = self.content_layer
|
||||||
|
self.content_layer = ContentLayer.FURNITURE
|
||||||
|
self.parents[self.level + 1] = doc.add_group(
|
||||||
|
name=tag.name,
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
self.level += 1
|
||||||
|
try:
|
||||||
|
yield None
|
||||||
|
finally:
|
||||||
|
self.parents[self.level + 1] = None
|
||||||
|
self.level -= 1
|
||||||
|
self.content_layer = current_layer
|
||||||
|
|
||||||
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||||
tag_name = tag.name.lower()
|
tag_name = tag.name.lower()
|
||||||
# set default content layer to BODY as soon as we encounter a heading
|
# set default content layer to BODY as soon as we encounter a heading
|
||||||
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
content_layer=self.content_layer,
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
self.level += 1
|
self.level += 1
|
||||||
with self.use_inline_group(min_parts, doc):
|
with self._use_inline_group(min_parts, doc):
|
||||||
for annotated_text in min_parts:
|
for annotated_text in min_parts:
|
||||||
li_text = re.sub(
|
li_text = re.sub(
|
||||||
r"\s+|\n+", " ", annotated_text.text
|
r"\s+|\n+", " ", annotated_text.text
|
||||||
).strip()
|
).strip()
|
||||||
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
||||||
doc.add_text(
|
if annotated_text.code:
|
||||||
parent=self.parents[self.level],
|
doc.add_code(
|
||||||
label=DocItemLabel.TEXT,
|
parent=self.parents[self.level],
|
||||||
text=li_clean,
|
text=li_clean,
|
||||||
content_layer=self.content_layer,
|
content_layer=self.content_layer,
|
||||||
formatting=annotated_text.formatting,
|
formatting=annotated_text.formatting,
|
||||||
hyperlink=annotated_text.hyperlink,
|
hyperlink=annotated_text.hyperlink,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
doc.add_text(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
label=DocItemLabel.TEXT,
|
||||||
|
text=li_clean,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
formatting=annotated_text.formatting,
|
||||||
|
hyperlink=annotated_text.hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
# 4) recurse into any nested lists, attaching them to this <li> item
|
# 4) recurse into any nested lists, attaching them to this <li> item
|
||||||
for sublist in li({"ul", "ol"}, recursive=False):
|
for sublist in li({"ul", "ol"}, recursive=False):
|
||||||
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text_list = self._extract_text_and_hyperlink_recursively(
|
text_list = self._extract_text_and_hyperlink_recursively(
|
||||||
tag, find_parent_annotation=True
|
tag, find_parent_annotation=True
|
||||||
)
|
)
|
||||||
annotated_texts = text_list.simplify_text_elements()
|
annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
|
||||||
for part in annotated_texts.split_by_newline():
|
for part in annotated_texts.split_by_newline():
|
||||||
with self.use_inline_group(part, doc):
|
with self._use_inline_group(part, doc):
|
||||||
for annotated_text in part:
|
for annotated_text in part:
|
||||||
if seg := annotated_text.text.strip():
|
if seg := annotated_text.text.strip():
|
||||||
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
||||||
doc.add_text(
|
if annotated_text.code:
|
||||||
parent=self.parents[self.level],
|
doc.add_code(
|
||||||
label=DocItemLabel.TEXT,
|
parent=self.parents[self.level],
|
||||||
text=seg_clean,
|
text=seg_clean,
|
||||||
content_layer=self.content_layer,
|
content_layer=self.content_layer,
|
||||||
formatting=annotated_text.formatting,
|
formatting=annotated_text.formatting,
|
||||||
hyperlink=annotated_text.hyperlink,
|
hyperlink=annotated_text.hyperlink,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
doc.add_text(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
label=DocItemLabel.TEXT,
|
||||||
|
text=seg_clean,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
formatting=annotated_text.formatting,
|
||||||
|
hyperlink=annotated_text.hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
for img_tag in tag("img"):
|
for img_tag in tag("img"):
|
||||||
if isinstance(img_tag, Tag):
|
if isinstance(img_tag, Tag):
|
||||||
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
content_layer=self.content_layer,
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif tag_name in {"pre", "code"}:
|
elif tag_name in {"pre"}:
|
||||||
# handle monospace code snippets (pre).
|
# handle monospace code snippets (pre).
|
||||||
text_list = self._extract_text_and_hyperlink_recursively(
|
text_list = self._extract_text_and_hyperlink_recursively(
|
||||||
tag, find_parent_annotation=True
|
tag, find_parent_annotation=True, keep_newlines=True
|
||||||
)
|
)
|
||||||
annotated_texts = text_list.simplify_text_elements()
|
annotated_texts = text_list.simplify_text_elements()
|
||||||
with self.use_inline_group(annotated_texts, doc):
|
with self._use_inline_group(annotated_texts, doc):
|
||||||
for annotated_text in annotated_texts:
|
for annotated_text in annotated_texts:
|
||||||
text_clean = HTMLDocumentBackend._clean_unicode(
|
text_clean = HTMLDocumentBackend._clean_unicode(
|
||||||
annotated_text.text.strip()
|
annotated_text.text.strip()
|
||||||
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
hyperlink=annotated_text.hyperlink,
|
hyperlink=annotated_text.hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif tag_name in {"details", "footer"}:
|
elif tag_name == "footer":
|
||||||
if tag_name == "footer":
|
with self._use_footer(tag, doc):
|
||||||
current_layer = self.content_layer
|
self._walk(tag, doc)
|
||||||
self.content_layer = ContentLayer.FURNITURE
|
|
||||||
self.parents[self.level + 1] = doc.add_group(
|
elif tag_name == "details":
|
||||||
name=tag_name,
|
with self._use_details(tag, doc):
|
||||||
label=GroupLabel.SECTION,
|
self._walk(tag, doc)
|
||||||
parent=self.parents[self.level],
|
|
||||||
content_layer=self.content_layer,
|
|
||||||
)
|
|
||||||
self.level += 1
|
|
||||||
self._walk(tag, doc)
|
|
||||||
self.parents[self.level + 1] = None
|
|
||||||
self.level -= 1
|
|
||||||
if tag_name == "footer":
|
|
||||||
self.content_layer = current_layer
|
|
||||||
|
|
||||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
||||||
figure = img_tag.find_parent("figure")
|
figure = img_tag.find_parent("figure")
|
||||||
|
|||||||
39
tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
vendored
Normal file
39
tests/data/groundtruth/docling_v2/html_code_snippets.html.itxt
vendored
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: title: Code snippets
|
||||||
|
item-2 at level 2: inline: group group
|
||||||
|
item-3 at level 3: text: The Pythagorean theorem can be w ... tion relating the lengths of the sides
|
||||||
|
item-4 at level 3: text: a
|
||||||
|
item-5 at level 3: text: ,
|
||||||
|
item-6 at level 3: text: b
|
||||||
|
item-7 at level 3: text: and the hypotenuse
|
||||||
|
item-8 at level 3: text: c
|
||||||
|
item-9 at level 3: text: .
|
||||||
|
item-10 at level 2: inline: group group
|
||||||
|
item-11 at level 3: text: To use Docling, simply install
|
||||||
|
item-12 at level 3: code: docling
|
||||||
|
item-13 at level 3: text: from your package manager, e.g. pip:
|
||||||
|
item-14 at level 3: code: pip install docling
|
||||||
|
item-15 at level 2: inline: group group
|
||||||
|
item-16 at level 3: text: To convert individual documents with python, use
|
||||||
|
item-17 at level 3: code: convert()
|
||||||
|
item-18 at level 3: text: , for example:
|
||||||
|
item-19 at level 2: code: from docling.document_converter ... (result.document.export_to_markdown())
|
||||||
|
item-20 at level 2: inline: group group
|
||||||
|
item-21 at level 3: text: The program will output:
|
||||||
|
item-22 at level 3: code: ## Docling Technical Report[...]
|
||||||
|
item-23 at level 2: text: Prefetch the models:
|
||||||
|
item-24 at level 2: list: group list
|
||||||
|
item-25 at level 3: list_item:
|
||||||
|
item-26 at level 4: inline: group group
|
||||||
|
item-27 at level 5: text: Use the
|
||||||
|
item-28 at level 5: code: docling-tools models download
|
||||||
|
item-29 at level 5: text: utility:
|
||||||
|
item-30 at level 3: list_item:
|
||||||
|
item-31 at level 4: inline: group group
|
||||||
|
item-32 at level 5: text: Alternatively, models can be programmatically downloaded using
|
||||||
|
item-33 at level 5: code: docling.utils.model_downloader.download_models()
|
||||||
|
item-34 at level 5: text: .
|
||||||
|
item-35 at level 3: list_item:
|
||||||
|
item-36 at level 4: inline: group group
|
||||||
|
item-37 at level 5: text: Also, you can use download-hf-re ... rom HuggingFace by specifying repo id:
|
||||||
|
item-38 at level 5: code: $ docling-tools models download- ... 256M-preview model from HuggingFace...
|
||||||
674
tests/data/groundtruth/docling_v2/html_code_snippets.html.json
vendored
Normal file
674
tests/data/groundtruth/docling_v2/html_code_snippets.html.json
vendored
Normal file
@@ -0,0 +1,674 @@
|
|||||||
|
{
|
||||||
|
"schema_name": "DoclingDocument",
|
||||||
|
"version": "1.5.0",
|
||||||
|
"name": "html_code_snippets",
|
||||||
|
"origin": {
|
||||||
|
"mimetype": "text/html",
|
||||||
|
"binary_hash": 8468578485215893920,
|
||||||
|
"filename": "html_code_snippets.html"
|
||||||
|
},
|
||||||
|
"furniture": {
|
||||||
|
"self_ref": "#/furniture",
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"body": {
|
||||||
|
"self_ref": "#/body",
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/8"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/9"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/10"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/11"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/12"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/2",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/13"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/14"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/15"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/3",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/17"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/18"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/20"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/24"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/28"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "list",
|
||||||
|
"label": "list"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/20"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/21"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/22"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/23"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/24"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/25"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/26"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/27"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/7",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/28"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/29"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/30"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"texts": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "title",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Code snippets in HTML",
|
||||||
|
"text": "Code snippets in HTML"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/16"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/19"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "title",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Code snippets",
|
||||||
|
"text": "Code snippets"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/2",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "The Pythagorean theorem can be written as an equation relating the lengths of the sides",
|
||||||
|
"text": "The Pythagorean theorem can be written as an equation relating the lengths of the sides"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/3",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "a",
|
||||||
|
"text": "a",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": ",",
|
||||||
|
"text": ","
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "b",
|
||||||
|
"text": "b",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "and the hypotenuse",
|
||||||
|
"text": "and the hypotenuse"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/7",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "c",
|
||||||
|
"text": "c",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/8",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": ".",
|
||||||
|
"text": "."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/9",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "To use Docling, simply install",
|
||||||
|
"text": "To use Docling, simply install"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/10",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "docling",
|
||||||
|
"text": "docling",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/11",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "from your package manager, e.g. pip:",
|
||||||
|
"text": "from your package manager, e.g. pip:"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/12",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "pip install docling",
|
||||||
|
"text": "pip install docling",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/13",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "To convert individual documents with python, use",
|
||||||
|
"text": "To convert individual documents with python, use"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/14",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "convert()",
|
||||||
|
"text": "convert()",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/15",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": ", for example:",
|
||||||
|
"text": ", for example:"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/16",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
|
||||||
|
"text": "from docling.document_converter import DocumentConverter\n\nsource = \"https://arxiv.org/pdf/2408.09869\"\nconverter = DocumentConverter()\nresult = converter.convert(source)\nprint(result.document.export_to_markdown())",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/17",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "The program will output:",
|
||||||
|
"text": "The program will output:"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/18",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "## Docling Technical Report[...]",
|
||||||
|
"text": "## Docling Technical Report[...]",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/19",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Prefetch the models:",
|
||||||
|
"text": "Prefetch the models:"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/20",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/5"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": "",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/21",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/5"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Use the",
|
||||||
|
"text": "Use the"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/22",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/5"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "docling-tools models download",
|
||||||
|
"text": "docling-tools models download",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/23",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/5"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "utility:",
|
||||||
|
"text": "utility:"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/24",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/6"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": "",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/25",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/6"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Alternatively, models can be programmatically downloaded using",
|
||||||
|
"text": "Alternatively, models can be programmatically downloaded using"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/26",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/6"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "docling.utils.model_downloader.download_models()",
|
||||||
|
"text": "docling.utils.model_downloader.download_models()",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/27",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/6"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": ".",
|
||||||
|
"text": "."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/28",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/7"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": "",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/29",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/7"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:",
|
||||||
|
"text": "Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/30",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/7"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "code",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
|
||||||
|
"text": "$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...",
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"code_language": "unknown"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pictures": [],
|
||||||
|
"tables": [],
|
||||||
|
"key_value_items": [],
|
||||||
|
"form_items": [],
|
||||||
|
"pages": {}
|
||||||
|
}
|
||||||
24
tests/data/groundtruth/docling_v2/html_code_snippets.html.md
vendored
Normal file
24
tests/data/groundtruth/docling_v2/html_code_snippets.html.md
vendored
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Code snippets
|
||||||
|
|
||||||
|
The Pythagorean theorem can be written as an equation relating the lengths of the sides *a* , *b* and the hypotenuse *c* .
|
||||||
|
|
||||||
|
To use Docling, simply install `docling` from your package manager, e.g. pip: `pip install docling`
|
||||||
|
|
||||||
|
To convert individual documents with python, use `convert()` , for example:
|
||||||
|
|
||||||
|
```
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
source = "https://arxiv.org/pdf/2408.09869"
|
||||||
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(source)
|
||||||
|
print(result.document.export_to_markdown())
|
||||||
|
```
|
||||||
|
|
||||||
|
The program will output: `## Docling Technical Report[...]`
|
||||||
|
|
||||||
|
Prefetch the models:
|
||||||
|
|
||||||
|
- Use the `docling-tools models download` utility:
|
||||||
|
- Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()` .
|
||||||
|
- Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id: `$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...`
|
||||||
41
tests/data/html/html_code_snippets.html
vendored
Normal file
41
tests/data/html/html_code_snippets.html
vendored
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>Code snippets in HTML</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>Code snippets</h1>
|
||||||
|
|
||||||
|
<p>The Pythagorean theorem can be written as an equation relating the lengths of the sides <var>a</var>, <var>b</var> and the hypotenuse <var>c</var>.</p>
|
||||||
|
<p>To use Docling, simply install <code>docling</code>from your package manager, e.g. pip:
|
||||||
|
<kbd>pip install docling</kbd>
|
||||||
|
</p>
|
||||||
|
<p>To convert individual documents with python, use <code>convert()</code>, for example:</p>
|
||||||
|
<pre><code>
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
source = "https://arxiv.org/pdf/2408.09869"
|
||||||
|
converter = DocumentConverter()
|
||||||
|
result = converter.convert(source)
|
||||||
|
print(result.document.export_to_markdown())
|
||||||
|
</code></pre>
|
||||||
|
<p>The program will output:
|
||||||
|
<samp>## Docling Technical Report[...]</samp>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>Prefetch the models:</p>
|
||||||
|
<ul>
|
||||||
|
<li>Use the <code>docling-tools models download</code> utility:</li>
|
||||||
|
<li>Alternatively, models can be programmatically downloaded using <samp>docling.utils.model_downloader.download_models()</samp>.</li>
|
||||||
|
<li>Also, you can use download-hf-repo parameter to download arbitrary models from HuggingFace by specifying repo id:
|
||||||
|
<pre><code>
|
||||||
|
$ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
|
||||||
|
Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
|
||||||
|
</code></pre>
|
||||||
|
<pre hidden><code>$ docling-tools</code></pre>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
Reference in New Issue
Block a user