diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index b24df93e..2334c645 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,5 +1,6 @@
import logging
import re
+import traceback
from contextlib import contextmanager
from copy import deepcopy
from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
"h4",
"h5",
"h6",
+ "ol",
"p",
"pre",
- "code",
- "ul",
- "ol",
"summary",
"table",
+ "ul",
}
+_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
+
_FORMAT_TAG_MAP: Final = {
"b": {"bold": True},
"strong": {"bold": True},
"i": {"italic": True},
"em": {"italic": True},
+ "var": {"italic": True},
# "mark",
# "small",
"s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
"ins": {"underline": True},
"sub": {"script": Script.SUB},
"sup": {"script": Script.SUPER},
+ **{k: {} for k in _CODE_TAG_SET},
}
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
text: str
hyperlink: Union[AnyUrl, Path, None] = None
formatting: Union[Formatting, None] = None
+ code: bool = False
class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
current_h = None
current_text = ""
current_f = None
+ current_code = False
for at in self:
t = at.text
h = at.hyperlink
f = at.formatting
+ c = at.code
current_text += t.strip() + " "
if f is not None and current_f is None:
current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
_log.warning(
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
)
+ current_code = c if c else current_code
+
return AnnotatedText(
- text=current_text.strip(), hyperlink=current_h, formatting=current_f
+ text=current_text.strip(),
+ hyperlink=current_h,
+ formatting=current_f,
+ code=current_code,
)
def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
text = self[0].text
hyperlink = self[0].hyperlink
formatting = self[0].formatting
+ code = self[0].code
last_elm = text
for i in range(1, len(self)):
- if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
+ if (
+ hyperlink == self[i].hyperlink
+ and formatting == self[i].formatting
+ and code == self[i].code
+ ):
sep = " "
if not self[i].text.strip() or not last_elm.strip():
sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
last_elm = self[i].text
else:
simplified.append(
- AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+ AnnotatedText(
+ text=text, hyperlink=hyperlink, formatting=formatting, code=code
+ )
)
text = self[i].text
last_elm = text
hyperlink = self[i].hyperlink
formatting = self[i].formatting
+ code = self[i].code
if text:
simplified.append(
- AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+ AnnotatedText(
+ text=text, hyperlink=hyperlink, formatting=formatting, code=code
+ )
)
return simplified
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.ctx = _Context()
for i in range(self.max_levels):
self.parents[i] = None
- self.hyperlink = None
+ self.hyperlink: Union[AnyUrl, Path, None] = None
self.original_url = original_url
self.format_tags: list[str] = []
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
orig=title_text,
content_layer=ContentLayer.FURNITURE,
)
- # remove scripts/styles
+ # remove script and style tags
for tag in self.soup(["script", "style"]):
tag.decompose()
+ # remove any hidden tag
+ for tag in self.soup(hidden=True):
+ tag.decompose()
+
content = self.soup.body or self.soup
# normalize <br> tags
for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def flush_buffer():
if not buffer:
return
- annotated_text_list = buffer.simplify_text_elements()
+ annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
parts = annotated_text_list.split_by_newline()
buffer.clear()
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
for annotated_text_list in parts:
- with self.use_inline_group(annotated_text_list, doc):
+ with self._use_inline_group(annotated_text_list, doc):
for annotated_text in annotated_text_list:
if annotated_text.text.strip():
seg_clean = HTMLDocumentBackend._clean_unicode(
annotated_text.text.strip()
)
- doc.add_text(
- parent=self.parents[self.level],
- label=DocItemLabel.TEXT,
- text=seg_clean,
- content_layer=self.content_layer,
- formatting=annotated_text.formatting,
- hyperlink=annotated_text.hyperlink,
- )
+ if annotated_text.code:
+ doc.add_code(
+ parent=self.parents[self.level],
+ text=seg_clean,
+ content_layer=self.content_layer,
+ formatting=annotated_text.formatting,
+ hyperlink=annotated_text.hyperlink,
+ )
+ else:
+ doc.add_text(
+ parent=self.parents[self.level],
+ label=DocItemLabel.TEXT,
+ text=seg_clean,
+ content_layer=self.content_layer,
+ formatting=annotated_text.formatting,
+ hyperlink=annotated_text.hyperlink,
+ )
for node in element.contents:
if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
flush_buffer()
self._emit_image(node, doc)
elif name in _FORMAT_TAG_MAP:
- with self.use_format([name]):
+ with self._use_format([name]):
self._walk(node, doc)
elif name == "a":
- with self.use_hyperlink(node):
+ with self._use_hyperlink(node):
self._walk(node, doc)
elif name in _BLOCK_TAGS:
flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
this_parent = item.parent
while this_parent is not None:
if this_parent.name == "a" and this_parent.get("href"):
- with self.use_format(format_tags):
- with self.use_hyperlink(this_parent):
+ with self._use_format(format_tags):
+ with self._use_hyperlink(this_parent):
return self._extract_text_and_hyperlink_recursively(
item, ignore_list
)
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, NavigableString):
text = item.strip()
+ code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
if text:
return AnnotatedTextList(
[
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text=text,
hyperlink=self.hyperlink,
formatting=self._formatting,
+ code=code,
)
]
)
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text="\n",
hyperlink=self.hyperlink,
formatting=self._formatting,
+ code=code,
)
]
)
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if not ignore_list or (tag.name not in ["ul", "ol"]):
for child in tag:
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
- with self.use_format([child.name]):
+ with self._use_format([child.name]):
result.extend(
self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines
)
)
elif isinstance(child, Tag) and child.name == "a":
- with self.use_hyperlink(child):
+ with self._use_hyperlink(child):
result.extend(
self._extract_text_and_hyperlink_recursively(
child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return result
@contextmanager
-    def use_hyperlink(self, tag):
+    def _use_hyperlink(self, tag: Tag):
+        """Temporarily set the active hyperlink from a tag's ``href``.
+
+        While the context manager is active, ``self.hyperlink`` holds the
+        target built from the tag's ``href`` (joined with
+        ``self.original_url`` when that is set); the previous value is
+        restored on exit. A tag with no ``href``, or with an empty or
+        non-string one, leaves ``self.hyperlink`` untouched.
+
+        Args:
+            tag: The tag whose ``href`` attribute is inspected.
+        """
+        # Bound up front so the cleanup in ``finally`` never reads an unbound
+        # name when the href is empty or not a plain string.
+        old_hyperlink: Union[AnyUrl, Path, None] = None
+        new_hyperlink: Union[AnyUrl, Path, None] = None
         this_href = tag.get("href")
         if this_href is None:
             yield None
         else:
-            if this_href:
-                old_hyperlink = self.hyperlink
+            if isinstance(this_href, str) and this_href:
+                old_hyperlink = self.hyperlink
                 if self.original_url is not None:
-                    this_href = urljoin(self.original_url, this_href)
+                    this_href = urljoin(str(self.original_url), str(this_href))
                 # ugly fix for relative links since pydantic does not support them.
                 try:
-                    AnyUrl(this_href)
+                    new_hyperlink = AnyUrl(this_href)
                 except ValidationError:
-                    this_href = Path(this_href)
-                self.hyperlink = this_href
+                    new_hyperlink = Path(this_href)
+                self.hyperlink = new_hyperlink
             try:
                 yield None
             finally:
-                if this_href:
+                # Only undo the assignment if this call actually made one.
+                if new_hyperlink is not None:
                     self.hyperlink = old_hyperlink
@contextmanager
- def use_format(self, tags: list[str]):
+ def _use_format(self, tags: list[str]):
if not tags:
yield None
else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.format_tags = self.format_tags[: -len(tags)]
@contextmanager
- def use_inline_group(
+ def _use_inline_group(
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
):
"""Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
Args:
annotated_text_list (AnnotatedTextList): Annotated text
doc (DoclingDocument): Currently used document
-
- Yields:
- None: _description_
"""
if len(annotated_text_list) > 1:
inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
else:
yield None
+    @contextmanager
+    def _use_details(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with the content of a details tag.
+
+        While the context manager is active, the hierarchy level is set one
+        level higher than the current parent, and the new group is the
+        parent at that level.
+
+        Args:
+            tag: The details tag.
+            doc: Currently used document.
+        """
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            # self.level still points at the group created above, so clear
+            # that slot (not the one above it) before stepping back down.
+            self.parents[self.level] = None
+            self.level -= 1
+
+    @contextmanager
+    def _use_footer(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with a footer.
+
+        Create a group with the content of a footer tag. While the context
+        manager is active, the hierarchy level is set one level higher than
+        the current parent and the content layer is switched to
+        ``ContentLayer.FURNITURE``; the previous content layer is restored
+        on exit.
+
+        Args:
+            tag: The footer tag.
+            doc: Currently used document.
+        """
+        current_layer = self.content_layer
+        self.content_layer = ContentLayer.FURNITURE
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            # self.level still points at the group created above, so clear
+            # that slot (not the one above it) before stepping back down.
+            self.parents[self.level] = None
+            self.level -= 1
+            self.content_layer = current_layer
+
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
tag_name = tag.name.lower()
# set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
content_layer=self.content_layer,
)
self.level += 1
- with self.use_inline_group(min_parts, doc):
+ with self._use_inline_group(min_parts, doc):
for annotated_text in min_parts:
li_text = re.sub(
r"\s+|\n+", " ", annotated_text.text
).strip()
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
- doc.add_text(
- parent=self.parents[self.level],
- label=DocItemLabel.TEXT,
- text=li_clean,
- content_layer=self.content_layer,
- formatting=annotated_text.formatting,
- hyperlink=annotated_text.hyperlink,
- )
+ if annotated_text.code:
+ doc.add_code(
+ parent=self.parents[self.level],
+ text=li_clean,
+ content_layer=self.content_layer,
+ formatting=annotated_text.formatting,
+ hyperlink=annotated_text.hyperlink,
+ )
+ else:
+ doc.add_text(
+ parent=self.parents[self.level],
+ label=DocItemLabel.TEXT,
+ text=li_clean,
+ content_layer=self.content_layer,
+ formatting=annotated_text.formatting,
+ hyperlink=annotated_text.hyperlink,
+ )
# 4) recurse into any nested lists, attaching them to this
The Pythagorean theorem can be written as an equation relating the lengths of the sides a, b and the hypotenuse c.
+To use Docling, simply install doclingfrom your package manager, e.g. pip:
+ pip install docling
+
To convert individual documents with python, use convert(), for example:
+from docling.document_converter import DocumentConverter
+
+source = "https://arxiv.org/pdf/2408.09869"
+converter = DocumentConverter()
+result = converter.convert(source)
+print(result.document.export_to_markdown())
+
+The program will output: + ## Docling Technical Report[...] +
+ +Prefetch the models:
+docling-tools models download utility:
+ $ docling-tools models download-hf-repo ds4sd/SmolDocling-256M-preview
+ Downloading ds4sd/SmolDocling-256M-preview model from HuggingFace...
+
+ $ docling-tools
+