fix minor bugs, mark helper methods internal

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas 2025-04-03 14:21:34 +02:00
parent c4f9916fbb
commit a1cb0dd344
6 changed files with 752 additions and 92 deletions

View File

@ -26,6 +26,7 @@ from docx.text.run import Run
from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -121,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
def update_history(
def _update_history(
self,
name: str,
level: Optional[int],
@ -141,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.history["numids"].append(numid)
self.history["indents"].append(ilevel)
def prev_name(self) -> Optional[str]:
def _prev_name(self) -> Optional[str]:
    """Return the last element of ``self.history["names"]``.

    Presumably this is the style name recorded for the previously
    processed paragraph -- confirm against ``_update_history``.
    """
    names = self.history["names"]
    return names[-1]
def prev_level(self) -> Optional[int]:
def _prev_level(self) -> Optional[int]:
    """Return the last element of ``self.history["levels"]``.

    Presumably this is the heading level recorded for the previously
    processed paragraph -- confirm against ``_update_history``.
    """
    levels = self.history["levels"]
    return levels[-1]
def prev_numid(self) -> Optional[int]:
def _prev_numid(self) -> Optional[int]:
    """Return the last element of ``self.history["numids"]``.

    ``_update_history`` appends the ``numid`` of each processed paragraph
    to this list, so the result is the most recently recorded one.
    """
    numids = self.history["numids"]
    return numids[-1]
def prev_indent(self) -> Optional[int]:
def _prev_indent(self) -> Optional[int]:
    """Return the last element of ``self.history["indents"]``.

    ``_update_history`` appends the ``ilevel`` of each processed paragraph
    to this list, so the result is the most recently recorded one.
    """
    indents = self.history["indents"]
    return indents[-1]
def get_level(self) -> int:
def _get_level(self) -> int:
    """Return the first non-negative key of ``self.parents`` mapped to None.

    Keys are visited in the mapping's iteration order and negative keys
    are skipped. Returns 0 when no such key exists.
    """
    for k, v in self.parents.items():
        # Identity check: a stored value with a permissive ``__eq__`` must
        # not be mistaken for an empty (None) slot.
        if k >= 0 and v is None:
            return k
    return 0
def walk_linear(
def _walk_linear(
self,
body: BaseOxmlElement,
docx_obj: DocxDocument,
@ -180,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Check for Tables
if element.tag.endswith("tbl"):
try:
self.handle_tables(element, docx_obj, doc)
self._handle_tables(element, docx_obj, doc)
except Exception:
_log.debug("could not parse a table, broken docx table")
elif drawing_blip:
self.handle_pictures(docx_obj, drawing_blip, doc)
self._handle_pictures(docx_obj, drawing_blip, doc)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -193,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
for p in paragraphs:
self.handle_text_elements(p, docx_obj, doc)
self._handle_text_elements(p, docx_obj, doc)
# Check for Text
elif tag_name in ["p"]:
# "tcPr", "sectPr"
self.handle_text_elements(element, docx_obj, doc)
self._handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
def _str_to_int(
self, s: Optional[str], default: Optional[int] = 0
) -> Optional[int]:
if s is None:
return None
try:
@ -210,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
except ValueError:
return default
def split_text_and_number(self, input_string: str) -> list[str]:
def _split_text_and_number(self, input_string: str) -> list[str]:
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
if match:
parts = list(filter(None, match.groups()))
@ -218,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
return [input_string]
def get_numId_and_ilvl(
def _get_numId_and_ilvl(
self, paragraph: Paragraph
) -> tuple[Optional[int], Optional[int]]:
# Access the XML element of the paragraph
@ -233,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
return None, None # If the paragraph is not part of a list
def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
parts = self.split_text_and_number(style_label)
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
parts = self._split_text_and_number(style_label)
if len(parts) == 2:
parts.sort()
@ -246,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label_level: Optional[int] = 0
if parts[0].strip().lower() == "heading":
label_str = "Heading"
label_level = self.str_to_int(parts[1], None)
label_level = self._str_to_int(parts[1], None)
if parts[1].strip().lower() == "heading":
label_str = "Heading"
label_level = self.str_to_int(parts[0], None)
label_level = self._str_to_int(parts[0], None)
return label_str, label_level
return style_label, None
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
if paragraph.style is None:
return "Normal", None
@ -267,21 +270,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if ":" in label:
parts = label.split(":")
if len(parts) == 2:
return parts[0], self.str_to_int(parts[1], None)
return parts[0], self._str_to_int(parts[1], None)
if "heading" in label.lower():
return self.get_heading_and_level(label)
return self._get_heading_and_level(label)
if "heading" in name.lower():
return self.get_heading_and_level(name)
return self._get_heading_and_level(name)
return label, None
@classmethod
def _get_format_from_run(cls, run: Run) -> Formatting:
return Formatting(
bold=run.bold if run.bold is not None else False,
italic=run.italic if run.italic is not None else False,
underline=run.underline if run.underline is not None else False,
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
    """Build a ``Formatting`` from a run's bold/italic/underline attributes.

    Args:
        run: The run whose ``bold``, ``italic`` and ``underline``
            attributes are inspected.

    Returns:
        A ``Formatting`` with three boolean flags when at least one of the
        attributes is truthy, otherwise None.
    """
    # Each attribute may be None (inherited). ``underline`` may also be a
    # non-bool underline-style value (e.g. a double underline), so coerce
    # everything to a plain bool before handing it to ``Formatting``.
    is_bold = bool(run.bold)
    is_italic = bool(run.italic)
    is_underline = bool(run.underline)
    if not (is_bold or is_italic or is_underline):
        return None
    return Formatting(bold=is_bold, italic=is_italic, underline=is_underline)
def _get_paragraph_elements(self, paragraph: Paragraph):
@ -289,7 +297,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
Extract paragraph elements along with their formatting and hyperlink
"""
paragraph_elements: list[tuple[str, Formatting, Path | None]] = []
# for now retain empty paragraphs for backwards compatibility:
if paragraph.text.strip() == "":
return [("", None, None)]
paragraph_elements: list[
tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
] = []
group_text = ""
previous_format = None
@ -306,13 +320,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
continue
# Initialize previous_format with the first format
previous_format = previous_format or format
if (len(text.strip()) and (format != previous_format)) or (
if (len(text.strip()) and format != previous_format) or (
hyperlink is not None
):
# If the style changes for a non empty text, add the previous group
if len(group_text.strip()) > 0:
paragraph_elements.append(
@ -335,7 +345,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return paragraph_elements
def handle_equations_in_text(self, element, text):
def _handle_equations_in_text(self, element, text):
only_texts = []
only_equations = []
texts_and_equations = []
@ -381,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return output_text, only_equations
def handle_text_elements(
def _create_or_reuse_parent(
    self,
    *,
    doc: DoclingDocument,
    prev_parent: Optional[NodeItem],
    paragraph_elements: list,
) -> Optional[NodeItem]:
    """Pick the parent node under which a paragraph's elements are added.

    Args:
        doc: Document that receives a new inline group when one is needed.
        prev_parent: The parent that would be used without grouping.
        paragraph_elements: The elements about to be added.

    Returns:
        ``prev_parent`` unchanged when there are zero or one elements;
        otherwise a new ``GroupLabel.INLINE`` group created under
        ``prev_parent``.
    """
    # With at most one element there is nothing to group together.
    if len(paragraph_elements) <= 1:
        return prev_parent
    return doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
def _handle_text_elements(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
@ -390,19 +413,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = Paragraph(element, docx_obj)
raw_text = paragraph.text
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
if text is None:
return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip()
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_id, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph)
p_style_id, p_level = self._get_label_and_level(paragraph)
numid, ilevel = self._get_numId_and_ilvl(paragraph)
if numid == 0:
numid = None
@ -413,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and ilevel is not None
and p_style_id not in ["Title", "Heading"]
):
self._add_listitem(
doc,
numid,
ilevel,
paragraph_elements,
is_numbered,
self._add_list_item(
doc=doc,
numid=numid,
ilevel=ilevel,
elements=paragraph_elements,
is_numbered=is_numbered,
)
self.update_history(p_style_id, p_level, numid, ilevel)
self._update_history(p_style_id, p_level, numid, ilevel)
return
elif (
numid is None
and self.prev_numid() is not None
and self._prev_numid() is not None
and p_style_id not in ["Title", "Heading"]
): # Close list
if self.level_at_new_list:
@ -452,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
else:
is_numbered_style = False
self.add_header(doc, p_level, text, is_numbered_style)
self._add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
# Standalone equation
level = self.get_level()
level = self._get_level()
doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
@ -465,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
else:
# Inline equation
level = self.get_level()
level = self._get_level()
inline_equation = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
)
@ -504,14 +528,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"ListBullet",
"Quote",
]:
level = self.get_level()
inline_fmt = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
level = self._get_level()
parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents.get(level - 1),
paragraph_elements=paragraph_elements,
)
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=inline_fmt,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
@ -520,30 +546,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
# Text style names can, and will have, not only default values but user values too
# hence we treat all other labels as pure text
level = self.get_level()
inline_fmt = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
level = self._get_level()
parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents.get(level - 1),
paragraph_elements=paragraph_elements,
)
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=inline_fmt,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.update_history(p_style_id, p_level, numid, ilevel)
self._update_history(p_style_id, p_level, numid, ilevel)
return
def add_header(
def _add_header(
self,
doc: DoclingDocument,
curr_level: Optional[int],
text: str,
is_numbered_style: bool = False,
) -> None:
level = self.get_level()
level = self._get_level()
if isinstance(curr_level, int):
if curr_level > level:
# add invisible group
@ -599,8 +627,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def _add_listitem(
def _add_list_item(
self,
*,
doc: DoclingDocument,
numid: int,
ilevel: int,
@ -609,9 +638,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) -> None:
enum_marker = ""
level = self.get_level()
prev_indent = self.prev_indent()
if self.prev_numid() is None: # Open new list
level = self._get_level()
prev_indent = self._prev_indent()
if self._prev_numid() is None: # Open new list
self.level_at_new_list = level
self.parents[level] = doc.add_group(
@ -623,22 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
inline_fmt = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level]
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=inline_fmt,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self.prev_numid() == numid
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and prev_indent < ilevel
@ -667,21 +697,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
enum_marker = str(self.listIter) + "."
is_numbered = True
inline_fmt = doc.add_group(
label=GroupLabel.INLINE,
parent=self.parents[self.level_at_new_list + ilevel],
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=inline_fmt,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self.prev_numid() == numid
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
@ -695,43 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
inline_fmt = doc.add_group(
label=GroupLabel.INLINE,
parent=self.parents[self.level_at_new_list + ilevel],
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=inline_fmt,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.listIter = 0
elif self.prev_numid() == numid or prev_indent == ilevel:
elif self._prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
inline_fmt = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level - 1],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=inline_fmt,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
return
def handle_tables(
def _handle_tables(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
@ -746,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
cell_element = table.rows[0].cells[0]
# In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body
self.walk_linear(cell_element._element, docx_obj, doc)
self._walk_linear(cell_element._element, docx_obj, doc)
return
data = TableData(num_rows=num_rows, num_cols=num_cols)
@ -791,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
data.table_cells.append(table_cell)
col_idx += cell.grid_span
level = self.get_level()
level = self._get_level()
doc.add_table(data=data, parent=self.parents[level - 1])
return
def handle_pictures(
def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None:
def get_docx_image(drawing_blip):
@ -808,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data
return image_data
level = self.get_level()
level = self._get_level()
# Open the BytesIO object with PIL to create an Image
try:
image_data = get_docx_image(drawing_blip)

View File

@ -0,0 +1,30 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: italic
item-2 at level 1: paragraph: bold
item-3 at level 1: paragraph: underline
item-4 at level 1: paragraph: hyperlink
item-5 at level 1: paragraph: italic and bold hyperlink
item-6 at level 1: inline: group group
item-7 at level 2: paragraph: Normal
item-8 at level 2: paragraph: italic
item-9 at level 2: paragraph: bold
item-10 at level 2: paragraph: underline
item-11 at level 2: paragraph: and
item-12 at level 2: paragraph: hyperlink
item-13 at level 2: paragraph: on the same line
item-14 at level 1: paragraph:
item-15 at level 1: list: group list
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
item-19 at level 2: inline: group group
item-20 at level 3: list_item: Some
item-21 at level 3: list_item: italic
item-22 at level 3: list_item: bold
item-23 at level 3: list_item: underline
item-24 at level 2: list: group list
item-25 at level 3: inline: group group
item-26 at level 4: list_item: Nested
item-27 at level 4: list_item: italic
item-28 at level 4: list_item: bold
item-29 at level 1: paragraph:

View File

@ -0,0 +1,577 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 16380079676357958448,
"filename": "unit_test_formatting.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/12"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/23"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/17"
},
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
"hyperlink": "https:/github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic and bold hyperlink",
"text": "italic and bold hyperlink",
"formatting": {
"bold": true,
"italic": true,
"underline": false,
"strikethrough": false
},
"hyperlink": "https:/github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Normal",
"text": "Normal"
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "and",
"text": "and"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
"hyperlink": "https:/github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "on the same line",
"text": "on the same line"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Italic bullet 1",
"text": "Italic bullet 1",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Bold bullet 2",
"text": "Bold bullet 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Underline bullet 3",
"text": "Underline bullet 3",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Some",
"text": "Some",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,17 @@
*italic*
**bold**
underline
[hyperlink](https:/github.com/DS4SD/docling)
[***italic and bold hyperlink***](https:/github.com/DS4SD/docling)
Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docling) on the same line
- *Italic bullet 1*
- **Bold bullet 2**
- Underline bullet 3
- Some *italic* **bold** underline
- Nested *italic* **bold**

View File

@ -76,17 +76,19 @@ def test_e2e_docx_conversions():
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md"
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
), "export to indented-text"
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
doc, str(gt_path) + ".json", generate=GENERATE
), "document document"
if docx_path.name == "word_tables.docx":