Update all test cases again

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-20 14:56:46 +02:00
commit 6158a2e784
13 changed files with 710 additions and 456 deletions

View File

@ -14,7 +14,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import Formatting
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._handle_tables(element, docx_obj, doc)
except Exception:
_log.debug("could not parse a table, broken docx table")
# Check for Image
elif drawing_blip:
self._handle_pictures(docx_obj, drawing_blip, doc)
# Check for Text after the Image
if (
tag_name in ["p"]
and element.find(".//w:t", namespaces=namespaces) is not None
):
self._handle_text_elements(element, docx_obj, doc)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def _str_to_int(
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
all_paragraphs = []
# Sort paragraphs within each container, then process containers
for container_id, paragraphs in container_paragraphs.items():
for paragraphs in container_paragraphs.values():
# Sort by vertical position within each container
sorted_container_paragraphs = sorted(
paragraphs,
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
) -> None:
paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
element=element, text=paragraph.text
)
if text is None:
return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip()
# Common styles for bullet and numbered lists.
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def _add_formatted_list_item(
    self,
    doc: DoclingDocument,
    elements: list,
    marker: str,
    enumerated: bool,
    level: int,
) -> None:
    """Add one list item, built from formatted runs, to the list at ``level``.

    A single element becomes a list item that carries its text, formatting
    and hyperlink directly. Several elements become one list item with empty
    text that wraps an inline group holding one text item per element.

    Args:
        doc: Document that receives the new items.
        elements: ``(text, formatting, hyperlink)`` triples, one per run.
        marker: Marker passed through to ``doc.add_list_item``.
        enumerated: Enumerated flag passed through to ``doc.add_list_item``.
        level: Key into ``self.parents`` selecting the list to append to.
    """
    list_parent = self.parents[level]

    # This should not happen by construction. Leave a trace instead of
    # dropping the item silently, so lost content can be diagnosed.
    if not isinstance(list_parent, (OrderedList, UnorderedList)):
        _log.debug(
            "Skipping list item at level %s: parent is not a list group", level
        )
        return

    if len(elements) == 1:
        # `formatting` rather than `format`, to avoid shadowing the builtin.
        text, formatting, hyperlink = elements[0]
        doc.add_list_item(
            marker=marker,
            enumerated=enumerated,
            parent=list_parent,
            text=text,
            formatting=formatting,
            hyperlink=hyperlink,
        )
        return

    # Several runs: keep them as separate text items inside an inline group
    # so each run retains its own formatting and hyperlink.
    new_item = doc.add_list_item(
        marker=marker,
        enumerated=enumerated,
        parent=list_parent,
        text="",
    )
    inline_group = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
    for text, formatting, hyperlink in elements:
        doc.add_text(
            label=DocItemLabel.TEXT,
            parent=inline_group,
            text=text,
            formatting=formatting,
            hyperlink=hyperlink,
        )
def _add_list_item(
self,
*,
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elements: list,
is_numbered: bool = False,
) -> None:
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return None
enum_marker = ""
level = self._get_level()
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
): # Close list
for k, v in self.parents.items():
for k in self.parents:
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.listIter = 0
elif self._prev_numid() == numid or prev_indent == ilevel:
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level - 1],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
)
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
return
def _handle_tables(

BIN
tests/data/docx/word_image_anchors.docx vendored Normal file

Binary file not shown.

View File

@ -12,9 +12,7 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` .
4. Push to the branch ( `git push origin feature/AmazingFeature` )
5. Open a Pull Request
##
*Second* section
## *Second* section
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.

View File

@ -11,84 +11,82 @@ item-0 at level 0: unspecified: group _root_
Blisters
Headache
Sore throat
item-9 at level 1: list: group group
item-10 at level 2: list_item:
item-11 at level 1: paragraph:
item-12 at level 1: paragraph:
item-13 at level 1: section: group textbox
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-9 at level 1: paragraph:
item-10 at level 1: paragraph:
item-11 at level 1: section: group textbox
item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-13 at level 1: paragraph:
item-14 at level 1: paragraph:
item-15 at level 1: paragraph:
item-16 at level 1: paragraph:
item-17 at level 1: paragraph:
item-18 at level 1: paragraph:
item-19 at level 1: section: group textbox
item-20 at level 2: paragraph: Yes
item-21 at level 1: paragraph:
item-22 at level 1: paragraph:
item-23 at level 1: section: group textbox
item-24 at level 2: list: group list
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-27 at level 2: paragraph:
item-28 at level 1: list: group list
item-29 at level 2: list_item:
item-17 at level 1: section: group textbox
item-18 at level 2: paragraph: Yes
item-19 at level 1: paragraph:
item-20 at level 1: paragraph:
item-21 at level 1: section: group textbox
item-22 at level 2: list: group list
item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-25 at level 2: paragraph:
item-26 at level 1: list: group list
item-27 at level 2: list_item:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: paragraph:
item-34 at level 1: paragraph:
item-35 at level 1: section: group textbox
item-36 at level 2: paragraph: Health Bureau:
item-37 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-38 at level 2: list: group list
item-39 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-40 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-41 at level 2: paragraph:
item-42 at level 1: list: group list
item-43 at level 2: list_item:
item-44 at level 1: paragraph:
item-45 at level 1: section: group textbox
item-46 at level 2: paragraph: Department of Education:
item-33 at level 1: section: group textbox
item-34 at level 2: paragraph: Health Bureau:
item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-36 at level 2: list: group list
item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-39 at level 2: paragraph:
item-40 at level 1: list: group list
item-41 at level 2: list_item:
item-42 at level 1: paragraph:
item-43 at level 1: section: group textbox
item-44 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-45 at level 1: paragraph:
item-46 at level 1: paragraph:
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: paragraph:
item-53 at level 1: paragraph:
item-54 at level 1: section: group textbox
item-55 at level 2: inline: group group
item-56 at level 3: paragraph: The Health Bureau will handle
item-57 at level 3: paragraph: reporting and specimen collection
item-58 at level 3: paragraph: .
item-59 at level 2: paragraph:
item-52 at level 1: section: group textbox
item-53 at level 2: inline: group group
item-54 at level 3: paragraph: The Health Bureau will handle
item-55 at level 3: paragraph: reporting and specimen collection
item-56 at level 3: paragraph: .
item-57 at level 2: paragraph:
item-58 at level 1: paragraph:
item-59 at level 1: paragraph:
item-60 at level 1: paragraph:
item-61 at level 1: paragraph:
item-62 at level 1: paragraph:
item-63 at level 1: section: group textbox
item-64 at level 2: paragraph: Whether the epidemic has eased.
item-65 at level 2: paragraph:
item-66 at level 1: paragraph:
item-67 at level 1: section: group textbox
item-68 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-69 at level 2: paragraph: No
item-70 at level 1: paragraph:
item-71 at level 1: paragraph:
item-72 at level 1: section: group textbox
item-73 at level 2: paragraph: Yes
item-74 at level 1: paragraph:
item-75 at level 1: section: group textbox
item-76 at level 2: paragraph: Yes
item-77 at level 1: paragraph:
item-78 at level 1: paragraph:
item-79 at level 1: section: group textbox
item-80 at level 2: paragraph: Case closed.
item-81 at level 2: paragraph:
item-82 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-83 at level 1: paragraph:
item-84 at level 1: section: group textbox
item-85 at level 2: paragraph: No
item-86 at level 1: paragraph:
item-87 at level 1: paragraph:
item-88 at level 1: paragraph:
item-61 at level 1: section: group textbox
item-62 at level 2: paragraph: Whether the epidemic has eased.
item-63 at level 2: paragraph:
item-64 at level 1: paragraph:
item-65 at level 1: section: group textbox
item-66 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-67 at level 2: paragraph: No
item-68 at level 1: paragraph:
item-69 at level 1: paragraph:
item-70 at level 1: section: group textbox
item-71 at level 2: paragraph: Yes
item-72 at level 1: paragraph:
item-73 at level 1: section: group textbox
item-74 at level 2: paragraph: Yes
item-75 at level 1: paragraph:
item-76 at level 1: paragraph:
item-77 at level 1: section: group textbox
item-78 at level 2: paragraph: Case closed.
item-79 at level 2: paragraph:
item-80 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-81 at level 1: paragraph:
item-82 at level 1: section: group textbox
item-83 at level 2: paragraph: No
item-84 at level 1: paragraph:
item-85 at level 1: paragraph:
item-86 at level 1: paragraph:

View File

@ -29,9 +29,6 @@
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/19"
},
{
"$ref": "#/texts/6"
},
@ -492,20 +489,6 @@
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/19",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/67"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
}
],
"texts": [
@ -1494,20 +1477,6 @@
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/67",
"parent": {
"$ref": "#/groups/19"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
}
],
"pictures": [],

View File

@ -17,21 +17,16 @@ item-0 at level 0: unspecified: group _root_
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
item-19 at level 2: inline: group group
item-20 at level 3: list: group group
item-21 at level 4: list_item: Some
item-22 at level 3: list: group group
item-23 at level 4: list_item: italic
item-24 at level 3: list: group group
item-25 at level 4: list_item: bold
item-26 at level 3: list: group group
item-27 at level 4: list_item: underline
item-28 at level 2: list: group list
item-29 at level 3: inline: group group
item-30 at level 4: list: group group
item-31 at level 5: list_item: Nested
item-32 at level 4: list: group group
item-33 at level 5: list_item: italic
item-34 at level 4: list: group group
item-35 at level 5: list_item: bold
item-36 at level 1: paragraph:
item-19 at level 2: list_item:
item-20 at level 3: inline: group group
item-21 at level 4: text: Some
item-22 at level 4: text: italic
item-23 at level 4: text: bold
item-24 at level 4: text: underline
item-25 at level 2: list: group list
item-26 at level 3: list_item:
item-27 at level 4: inline: group group
item-28 at level 5: text: Nested
item-29 at level 5: text: italic
item-30 at level 5: text: bold
item-31 at level 1: paragraph:

View File

@ -42,7 +42,7 @@
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/16"
"$ref": "#/texts/25"
}
],
"content_layer": "body",
@ -98,7 +98,7 @@
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/2"
"$ref": "#/texts/16"
},
{
"$ref": "#/groups/3"
@ -111,20 +111,20 @@
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/groups/1"
"$ref": "#/texts/16"
},
"children": [
{
"$ref": "#/groups/11"
"$ref": "#/texts/17"
},
{
"$ref": "#/groups/10"
"$ref": "#/texts/18"
},
{
"$ref": "#/groups/9"
"$ref": "#/texts/19"
},
{
"$ref": "#/groups/8"
"$ref": "#/texts/20"
}
],
"content_layer": "body",
@ -138,7 +138,7 @@
},
"children": [
{
"$ref": "#/groups/4"
"$ref": "#/texts/21"
}
],
"content_layer": "body",
@ -148,120 +148,22 @@
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/groups/3"
"$ref": "#/texts/21"
},
"children": [
{
"$ref": "#/groups/7"
"$ref": "#/texts/22"
},
{
"$ref": "#/groups/6"
"$ref": "#/texts/23"
},
{
"$ref": "#/groups/5"
"$ref": "#/texts/24"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/17"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/18"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
},
{
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/20"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
},
{
"self_ref": "#/groups/9",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/21"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
},
{
"self_ref": "#/groups/10",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
},
{
"self_ref": "#/groups/11",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/23"
}
],
"content_layer": "body",
"name": "group",
"label": "list"
}
],
"texts": [
@ -574,149 +476,29 @@
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/1"
},
"children": [],
"children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "",
"text": ""
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/10"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "Some",
"text": "Some",
@ -726,9 +508,151 @@
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],

View File

@ -13,5 +13,5 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
- *Italic bullet 1*
- **Bold bullet 2**
- Underline bullet 3
- Some - *italic* - **bold** - underline
- Nested - *italic* - **bold**
- Some *italic* **bold** underline
- Nested *italic* **bold**

View File

@ -0,0 +1,16 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transcript
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
item-3 at level 1: picture
item-4 at level 1: inline: group group
item-5 at level 2: paragraph: This is test 1
item-6 at level 2: paragraph: 0:08
Correct, he is not.
item-7 at level 1: paragraph:
item-8 at level 1: picture
item-9 at level 1: inline: group group
item-10 at level 2: paragraph: This is test 2
item-11 at level 2: paragraph: 0:16
Yeah, exactly.
item-12 at level 1: paragraph:
item-13 at level 1: paragraph:

View File

@ -0,0 +1,292 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"name": "word_image_anchors",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 2428692234257307633,
"filename": "word_image_anchors.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/pictures/1"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Transcript",
"text": "Transcript",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "February 20, 2025, 8:32PM",
"text": "February 20, 2025, 8:32PM",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 1",
"text": "This is test 1",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:08\nCorrect, he is not.",
"text": "0:08\nCorrect, he is not.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 2",
"text": "This is test 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:16\nYeah, exactly.",
"text": "0:16\nYeah, exactly.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": ""
},
"annotations": []
},
{
"self_ref": "#/pictures/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": ""
},
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,13 @@
**Transcript**
February 20, 2025, 8:32PM
<!-- image -->
**This is test 1** 0:08
Correct, he is not.
<!-- image -->
**This is test 2** 0:16
Yeah, exactly.

View File

@ -9,6 +9,7 @@ from docling.datamodel.document import (
DoclingDocument,
InputDocument,
SectionHeaderItem,
TextItem,
)
from docling.document_converter import DocumentConverter
@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
"export to md"
f"export to markdown failed on {docx_path}"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
"export to indented-text"
f"export to indented-text failed on {docx_path}"
)
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
"document document"
f"DoclingDocument verification failed on {docx_path}"
)
if docx_path.name == "word_tables.docx":
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
pred_text=pred_html,
gtfile=str(gt_path) + ".html",
generate=GENERATE,
), "export to html"
), f"export to html failed on {docx_path}"
flaky_path = Path("tests/data/docx/textbox.docx")
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
@pytest.mark.xfail(strict=False)
def test_textbox_conversion():
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
def test_text_after_image_anchors():
    """Check that text following image anchors in a DOCX file gets parsed.

    Converts ``word_image_anchors.docx`` with the MS Word backend and verifies
    that each expected text appears as the text of some ``TextItem`` in the
    converted document.
    """
    in_path = Path("tests/data/docx/word_image_anchors.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    expected_texts = {
        "This is test 1",
        "0:08\nCorrect, he is not.",
        "This is test 2",
        "0:16\nYeah, exactly.",
    }
    found_texts = {
        item.text for item, _ in doc.iterate_items() if isinstance(item, TextItem)
    }

    # Report exactly which texts are missing rather than a bare failure.
    missing_texts = expected_texts - found_texts
    assert not missing_texts, (
        f"text after image anchors not parsed in {in_path}: {sorted(missing_texts)}"
    )

50
uv.lock generated
View File

@ -983,7 +983,7 @@ examples = [
[[package]]
name = "docling-core"
version = "2.38.0"
version = "2.38.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jsonref" },
@ -997,9 +997,9 @@ dependencies = [
{ name = "typer" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/01/3d/02b4926567735c252b4750074f9dfc96d06078566f067eb47c13713952a2/docling_core-2.38.0.tar.gz", hash = "sha256:3bad4c476cc798e29d01b02ea383b5582d7031e9595b177be0a9450f2eb7bef6", size = 145997, upload-time = "2025-06-18T12:35:23.81Z" }
sdist = { url = "https://files.pythonhosted.org/packages/38/f7/33bb17aa13e73722bf18ecfb7f13d6fbfb384c22003209bd72708123b33f/docling_core-2.38.1.tar.gz", hash = "sha256:a0566df2316eec4d22953ca7dac839b926dd57549b4c07ac810e87dbbaf91a10", size = 146276, upload-time = "2025-06-20T12:28:48.422Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3c/52/e65521ec8ae7ecbce2f9dd95dbf4164b4d4c58c29136e1489a038ce9a2fc/docling_core-2.38.0-py3-none-any.whl", hash = "sha256:8f27d7074a99913f2ba73bde363bbed3416852014eda136bb8880d37805c6950", size = 151276, upload-time = "2025-06-18T12:35:22.25Z" },
{ url = "https://files.pythonhosted.org/packages/f0/c5/fb2e24602db94ec02cc3ac8eb7b9665f2a5f61ff81866beb67aff95a353a/docling_core-2.38.1-py3-none-any.whl", hash = "sha256:6859313561030503e8b53aec535aa5edb765a679af76ce2e2c60722d78c6c613", size = 151570, upload-time = "2025-06-20T12:28:46.764Z" },
]
[package.optional-dependencies]
@ -3387,10 +3387,10 @@ name = "ocrmac"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" },
{ name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'darwin'" },
{ name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
{ name = "pillow" },
{ name = "pyobjc-framework-vision" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" }
wheels = [
@ -4414,7 +4414,7 @@ name = "pyobjc-framework-cocoa"
version = "11.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-core" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" }
wheels = [
@ -4433,8 +4433,8 @@ name = "pyobjc-framework-coreml"
version = "11.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" }
wheels = [
@ -4453,8 +4453,8 @@ name = "pyobjc-framework-quartz"
version = "11.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" }
wheels = [
@ -4473,10 +4473,10 @@ name = "pyobjc-framework-vision"
version = "11.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-framework-coreml" },
{ name = "pyobjc-framework-quartz" },
]
sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" }
wheels = [
@ -4957,17 +4957,17 @@ source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
{ name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.13') or (python_full_version >= '3.11' and platform_machine != 'arm64') or (python_full_version >= '3.11' and sys_platform != 'darwin')" },
{ name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
{ name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
{ name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
{ name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
{ name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
{ name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
{ name = "opencv-python" },
{ name = "pillow" },
{ name = "pyclipper" },
{ name = "pyyaml" },
{ name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
{ name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
{ name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
{ name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
{ name = "six" },
{ name = "tqdm" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" },