mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Update all test cases again
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
commit
6158a2e784
@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from docx import Document
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.oxml.table import CT_Tc
|
||||
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_tables(element, docx_obj, doc)
|
||||
except Exception:
|
||||
_log.debug("could not parse a table, broken docx table")
|
||||
|
||||
# Check for Image
|
||||
elif drawing_blip:
|
||||
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||
# Check for Text after the Image
|
||||
if (
|
||||
tag_name in ["p"]
|
||||
and element.find(".//w:t", namespaces=namespaces) is not None
|
||||
):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
|
||||
return doc
|
||||
|
||||
def _str_to_int(
|
||||
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
all_paragraphs = []
|
||||
|
||||
# Sort paragraphs within each container, then process containers
|
||||
for container_id, paragraphs in container_paragraphs.items():
|
||||
for paragraphs in container_paragraphs.values():
|
||||
# Sort by vertical position within each container
|
||||
sorted_container_paragraphs = sorted(
|
||||
paragraphs,
|
||||
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
element=element, text=paragraph.text
|
||||
)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def _add_formatted_list_item(
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
elements: list,
|
||||
marker: str,
|
||||
enumerated: bool,
|
||||
level: int,
|
||||
) -> None:
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
||||
return
|
||||
if len(elements) == 1:
|
||||
text, format, hyperlink = elements[0]
|
||||
doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
new_item = doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text="",
|
||||
)
|
||||
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
*,
|
||||
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> None:
|
||||
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return None
|
||||
enum_marker = ""
|
||||
|
||||
level = self._get_level()
|
||||
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
@ -981,20 +1016,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
@ -1002,7 +1029,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
and prev_indent is not None
|
||||
and ilevel < prev_indent
|
||||
): # Close list
|
||||
for k, v in self.parents.items():
|
||||
for k in self.parents:
|
||||
if k > self.level_at_new_list + ilevel:
|
||||
self.parents[k] = None
|
||||
|
||||
@ -1011,19 +1038,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level - 1],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
# Add the list item to the parent group
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level - 1
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
def _handle_tables(
|
||||
|
BIN
tests/data/docx/word_image_anchors.docx
vendored
Normal file
BIN
tests/data/docx/word_image_anchors.docx
vendored
Normal file
Binary file not shown.
@ -12,9 +12,7 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` .
|
||||
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
||||
5. Open a Pull Request
|
||||
|
||||
##
|
||||
|
||||
*Second* section
|
||||
## *Second* section
|
||||
|
||||
- **First** : Lorem ipsum.
|
||||
- **Second** : Dolor `sit` amet.
|
||||
|
134
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
134
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
@ -11,84 +11,82 @@ item-0 at level 0: unspecified: group _root_
|
||||
* Blisters
|
||||
* Headache
|
||||
* Sore throat
|
||||
item-9 at level 1: list: group group
|
||||
item-10 at level 2: list_item:
|
||||
item-11 at level 1: paragraph:
|
||||
item-12 at level 1: paragraph:
|
||||
item-13 at level 1: section: group textbox
|
||||
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||
item-9 at level 1: paragraph:
|
||||
item-10 at level 1: paragraph:
|
||||
item-11 at level 1: section: group textbox
|
||||
item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||
item-13 at level 1: paragraph:
|
||||
item-14 at level 1: paragraph:
|
||||
item-15 at level 1: paragraph:
|
||||
item-16 at level 1: paragraph:
|
||||
item-17 at level 1: paragraph:
|
||||
item-18 at level 1: paragraph:
|
||||
item-19 at level 1: section: group textbox
|
||||
item-20 at level 2: paragraph: Yes
|
||||
item-21 at level 1: paragraph:
|
||||
item-22 at level 1: paragraph:
|
||||
item-23 at level 1: section: group textbox
|
||||
item-24 at level 2: list: group list
|
||||
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
|
||||
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
|
||||
item-27 at level 2: paragraph:
|
||||
item-28 at level 1: list: group list
|
||||
item-29 at level 2: list_item:
|
||||
item-17 at level 1: section: group textbox
|
||||
item-18 at level 2: paragraph: Yes
|
||||
item-19 at level 1: paragraph:
|
||||
item-20 at level 1: paragraph:
|
||||
item-21 at level 1: section: group textbox
|
||||
item-22 at level 2: list: group list
|
||||
item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
|
||||
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
|
||||
item-25 at level 2: paragraph:
|
||||
item-26 at level 1: list: group list
|
||||
item-27 at level 2: list_item:
|
||||
item-28 at level 1: paragraph:
|
||||
item-29 at level 1: paragraph:
|
||||
item-30 at level 1: paragraph:
|
||||
item-31 at level 1: paragraph:
|
||||
item-32 at level 1: paragraph:
|
||||
item-33 at level 1: paragraph:
|
||||
item-34 at level 1: paragraph:
|
||||
item-35 at level 1: section: group textbox
|
||||
item-36 at level 2: paragraph: Health Bureau:
|
||||
item-37 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||
item-38 at level 2: list: group list
|
||||
item-39 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||
item-40 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||
item-41 at level 2: paragraph:
|
||||
item-42 at level 1: list: group list
|
||||
item-43 at level 2: list_item:
|
||||
item-44 at level 1: paragraph:
|
||||
item-45 at level 1: section: group textbox
|
||||
item-46 at level 2: paragraph: Department of Education:
|
||||
item-33 at level 1: section: group textbox
|
||||
item-34 at level 2: paragraph: Health Bureau:
|
||||
item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||
item-36 at level 2: list: group list
|
||||
item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||
item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||
item-39 at level 2: paragraph:
|
||||
item-40 at level 1: list: group list
|
||||
item-41 at level 2: list_item:
|
||||
item-42 at level 1: paragraph:
|
||||
item-43 at level 1: section: group textbox
|
||||
item-44 at level 2: paragraph: Department of Education:
|
||||
Collabo ... vention measures at all school levels.
|
||||
item-45 at level 1: paragraph:
|
||||
item-46 at level 1: paragraph:
|
||||
item-47 at level 1: paragraph:
|
||||
item-48 at level 1: paragraph:
|
||||
item-49 at level 1: paragraph:
|
||||
item-50 at level 1: paragraph:
|
||||
item-51 at level 1: paragraph:
|
||||
item-52 at level 1: paragraph:
|
||||
item-53 at level 1: paragraph:
|
||||
item-54 at level 1: section: group textbox
|
||||
item-55 at level 2: inline: group group
|
||||
item-56 at level 3: paragraph: The Health Bureau will handle
|
||||
item-57 at level 3: paragraph: reporting and specimen collection
|
||||
item-58 at level 3: paragraph: .
|
||||
item-59 at level 2: paragraph:
|
||||
item-52 at level 1: section: group textbox
|
||||
item-53 at level 2: inline: group group
|
||||
item-54 at level 3: paragraph: The Health Bureau will handle
|
||||
item-55 at level 3: paragraph: reporting and specimen collection
|
||||
item-56 at level 3: paragraph: .
|
||||
item-57 at level 2: paragraph:
|
||||
item-58 at level 1: paragraph:
|
||||
item-59 at level 1: paragraph:
|
||||
item-60 at level 1: paragraph:
|
||||
item-61 at level 1: paragraph:
|
||||
item-62 at level 1: paragraph:
|
||||
item-63 at level 1: section: group textbox
|
||||
item-64 at level 2: paragraph: Whether the epidemic has eased.
|
||||
item-65 at level 2: paragraph:
|
||||
item-66 at level 1: paragraph:
|
||||
item-67 at level 1: section: group textbox
|
||||
item-68 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||
item-69 at level 2: paragraph: No
|
||||
item-70 at level 1: paragraph:
|
||||
item-71 at level 1: paragraph:
|
||||
item-72 at level 1: section: group textbox
|
||||
item-73 at level 2: paragraph: Yes
|
||||
item-74 at level 1: paragraph:
|
||||
item-75 at level 1: section: group textbox
|
||||
item-76 at level 2: paragraph: Yes
|
||||
item-77 at level 1: paragraph:
|
||||
item-78 at level 1: paragraph:
|
||||
item-79 at level 1: section: group textbox
|
||||
item-80 at level 2: paragraph: Case closed.
|
||||
item-81 at level 2: paragraph:
|
||||
item-82 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||
item-83 at level 1: paragraph:
|
||||
item-84 at level 1: section: group textbox
|
||||
item-85 at level 2: paragraph: No
|
||||
item-61 at level 1: section: group textbox
|
||||
item-62 at level 2: paragraph: Whether the epidemic has eased.
|
||||
item-63 at level 2: paragraph:
|
||||
item-64 at level 1: paragraph:
|
||||
item-65 at level 1: section: group textbox
|
||||
item-66 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||
item-67 at level 2: paragraph: No
|
||||
item-68 at level 1: paragraph:
|
||||
item-69 at level 1: paragraph:
|
||||
item-70 at level 1: section: group textbox
|
||||
item-71 at level 2: paragraph: Yes
|
||||
item-72 at level 1: paragraph:
|
||||
item-73 at level 1: section: group textbox
|
||||
item-74 at level 2: paragraph: Yes
|
||||
item-75 at level 1: paragraph:
|
||||
item-76 at level 1: paragraph:
|
||||
item-77 at level 1: section: group textbox
|
||||
item-78 at level 2: paragraph: Case closed.
|
||||
item-79 at level 2: paragraph:
|
||||
item-80 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||
item-81 at level 1: paragraph:
|
||||
item-82 at level 1: section: group textbox
|
||||
item-83 at level 2: paragraph: No
|
||||
item-84 at level 1: paragraph:
|
||||
item-85 at level 1: paragraph:
|
||||
item-86 at level 1: paragraph:
|
||||
item-87 at level 1: paragraph:
|
||||
item-88 at level 1: paragraph:
|
@ -29,9 +29,6 @@
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/19"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
@ -492,20 +489,6 @@
|
||||
"content_layer": "body",
|
||||
"name": "textbox",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/19",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/67"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
@ -1494,20 +1477,6 @@
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/67",
|
||||
"parent": {
|
||||
"$ref": "#/groups/19"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
|
@ -17,21 +17,16 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-16 at level 2: list_item: Italic bullet 1
|
||||
item-17 at level 2: list_item: Bold bullet 2
|
||||
item-18 at level 2: list_item: Underline bullet 3
|
||||
item-19 at level 2: inline: group group
|
||||
item-20 at level 3: list: group group
|
||||
item-21 at level 4: list_item: Some
|
||||
item-22 at level 3: list: group group
|
||||
item-23 at level 4: list_item: italic
|
||||
item-24 at level 3: list: group group
|
||||
item-25 at level 4: list_item: bold
|
||||
item-26 at level 3: list: group group
|
||||
item-27 at level 4: list_item: underline
|
||||
item-28 at level 2: list: group list
|
||||
item-29 at level 3: inline: group group
|
||||
item-30 at level 4: list: group group
|
||||
item-31 at level 5: list_item: Nested
|
||||
item-32 at level 4: list: group group
|
||||
item-33 at level 5: list_item: italic
|
||||
item-34 at level 4: list: group group
|
||||
item-35 at level 5: list_item: bold
|
||||
item-36 at level 1: paragraph:
|
||||
item-19 at level 2: list_item:
|
||||
item-20 at level 3: inline: group group
|
||||
item-21 at level 4: text: Some
|
||||
item-22 at level 4: text: italic
|
||||
item-23 at level 4: text: bold
|
||||
item-24 at level 4: text: underline
|
||||
item-25 at level 2: list: group list
|
||||
item-26 at level 3: list_item:
|
||||
item-27 at level 4: inline: group group
|
||||
item-28 at level 5: text: Nested
|
||||
item-29 at level 5: text: italic
|
||||
item-30 at level 5: text: bold
|
||||
item-31 at level 1: paragraph:
|
@ -42,7 +42,7 @@
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
"$ref": "#/texts/25"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -98,7 +98,7 @@
|
||||
"$ref": "#/texts/15"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
@ -111,20 +111,20 @@
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/11"
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/10"
|
||||
"$ref": "#/texts/18"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/9"
|
||||
"$ref": "#/texts/19"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/8"
|
||||
"$ref": "#/texts/20"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -138,7 +138,7 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
"$ref": "#/texts/21"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -148,120 +148,22 @@
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/7"
|
||||
"$ref": "#/texts/22"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/6"
|
||||
"$ref": "#/texts/23"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/5"
|
||||
"$ref": "#/texts/24"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/18"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/10",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/22"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/11",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "list"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
@ -574,149 +476,29 @@
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/17",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/8"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/9"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/22",
|
||||
"parent": {
|
||||
"$ref": "#/groups/10"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/groups/11"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Some",
|
||||
"text": "Some",
|
||||
@ -726,9 +508,151 @@
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/22",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/24",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/25",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
|
@ -13,5 +13,5 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
|
||||
- *Italic bullet 1*
|
||||
- **Bold bullet 2**
|
||||
- Underline bullet 3
|
||||
- Some - *italic* - **bold** - underline
|
||||
- Nested - *italic* - **bold**
|
||||
- Some *italic* **bold** underline
|
||||
- Nested *italic* **bold**
|
16
tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
vendored
Normal file
16
tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Transcript
|
||||
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
|
||||
item-3 at level 1: picture
|
||||
item-4 at level 1: inline: group group
|
||||
item-5 at level 2: paragraph: This is test 1
|
||||
item-6 at level 2: paragraph: 0:08
|
||||
Correct, he is not.
|
||||
item-7 at level 1: paragraph:
|
||||
item-8 at level 1: picture
|
||||
item-9 at level 1: inline: group group
|
||||
item-10 at level 2: paragraph: This is test 2
|
||||
item-11 at level 2: paragraph: 0:16
|
||||
Yeah, exactly.
|
||||
item-12 at level 1: paragraph:
|
||||
item-13 at level 1: paragraph:
|
292
tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
vendored
Normal file
292
tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
vendored
Normal file
@ -0,0 +1,292 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.4.0",
|
||||
"name": "word_image_anchors",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"binary_hash": 2428692234257307633,
|
||||
"filename": "word_image_anchors.docx"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Transcript",
|
||||
"text": "Transcript",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "February 20, 2025, 8:32PM",
|
||||
"text": "February 20, 2025, 8:32PM",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is test 1",
|
||||
"text": "This is test 1",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "0:08\nCorrect, he is not.",
|
||||
"text": "0:08\nCorrect, he is not.",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is test 2",
|
||||
"text": "This is test 2",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "0:16\nYeah, exactly.",
|
||||
"text": "0:16\nYeah, exactly.",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
}
|
||||
],
|
||||
"pictures": [
|
||||
{
|
||||
"self_ref": "#/pictures/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"image": {
|
||||
"mimetype": "image/png",
|
||||
"dpi": 72,
|
||||
"size": {
|
||||
"width": 100.0,
|
||||
"height": 100.0
|
||||
},
|
||||
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
|
||||
},
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/pictures/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"image": {
|
||||
"mimetype": "image/png",
|
||||
"dpi": 72,
|
||||
"size": {
|
||||
"width": 100.0,
|
||||
"height": 100.0
|
||||
},
|
||||
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxodtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
13
tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
vendored
Normal file
13
tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
**Transcript**
|
||||
|
||||
February 20, 2025, 8:32PM
|
||||
|
||||
<!-- image -->
|
||||
|
||||
**This is test 1** 0:08
|
||||
Correct, he is not.
|
||||
|
||||
<!-- image -->
|
||||
|
||||
**This is test 2** 0:16
|
||||
Yeah, exactly.
|
@ -9,6 +9,7 @@ from docling.datamodel.document import (
|
||||
DoclingDocument,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
||||
"export to md"
|
||||
f"export to markdown failed on {docx_path}"
|
||||
)
|
||||
|
||||
pred_itxt: str = doc._export_to_indented_text(
|
||||
max_text_len=70, explicit_tables=False
|
||||
)
|
||||
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
||||
"export to indented-text"
|
||||
f"export to indented-text failed on {docx_path}"
|
||||
)
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
||||
"document document"
|
||||
f"DoclingDocument verification failed on {docx_path}"
|
||||
)
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
pred_text=pred_html,
|
||||
gtfile=str(gt_path) + ".html",
|
||||
generate=GENERATE,
|
||||
), "export to html"
|
||||
), f"export to html failed on {docx_path}"
|
||||
|
||||
|
||||
flaky_path = Path("tests/data/docx/textbox.docx")
|
||||
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
|
||||
@pytest.mark.xfail(strict=False)
|
||||
def test_textbox_conversion():
|
||||
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
|
||||
|
||||
|
||||
def test_text_after_image_anchors():
|
||||
"""
|
||||
Test to analyse whether text gets parsed after image anchors.
|
||||
"""
|
||||
|
||||
in_path = Path("tests/data/docx/word_image_anchors.docx")
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=in_path,
|
||||
format=InputFormat.DOCX,
|
||||
backend=MsWordDocumentBackend,
|
||||
)
|
||||
backend = MsWordDocumentBackend(
|
||||
in_doc=in_doc,
|
||||
path_or_stream=in_path,
|
||||
)
|
||||
doc = backend.convert()
|
||||
|
||||
found_text_after_anchor_1 = found_text_after_anchor_2 = (
|
||||
found_text_after_anchor_3
|
||||
) = found_text_after_anchor_4 = False
|
||||
for item, _ in doc.iterate_items():
|
||||
if isinstance(item, TextItem):
|
||||
if item.text == "This is test 1":
|
||||
found_text_after_anchor_1 = True
|
||||
elif item.text == "0:08\nCorrect, he is not.":
|
||||
found_text_after_anchor_2 = True
|
||||
elif item.text == "This is test 2":
|
||||
found_text_after_anchor_3 = True
|
||||
elif item.text == "0:16\nYeah, exactly.":
|
||||
found_text_after_anchor_4 = True
|
||||
|
||||
assert (
|
||||
found_text_after_anchor_1
|
||||
and found_text_after_anchor_2
|
||||
and found_text_after_anchor_3
|
||||
and found_text_after_anchor_4
|
||||
)
|
||||
|
50
uv.lock
generated
50
uv.lock
generated
@ -983,7 +983,7 @@ examples = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "2.38.0"
|
||||
version = "2.38.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "jsonref" },
|
||||
@ -997,9 +997,9 @@ dependencies = [
|
||||
{ name = "typer" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/01/3d/02b4926567735c252b4750074f9dfc96d06078566f067eb47c13713952a2/docling_core-2.38.0.tar.gz", hash = "sha256:3bad4c476cc798e29d01b02ea383b5582d7031e9595b177be0a9450f2eb7bef6", size = 145997, upload-time = "2025-06-18T12:35:23.81Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/38/f7/33bb17aa13e73722bf18ecfb7f13d6fbfb384c22003209bd72708123b33f/docling_core-2.38.1.tar.gz", hash = "sha256:a0566df2316eec4d22953ca7dac839b926dd57549b4c07ac810e87dbbaf91a10", size = 146276, upload-time = "2025-06-20T12:28:48.422Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/3c/52/e65521ec8ae7ecbce2f9dd95dbf4164b4d4c58c29136e1489a038ce9a2fc/docling_core-2.38.0-py3-none-any.whl", hash = "sha256:8f27d7074a99913f2ba73bde363bbed3416852014eda136bb8880d37805c6950", size = 151276, upload-time = "2025-06-18T12:35:22.25Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/c5/fb2e24602db94ec02cc3ac8eb7b9665f2a5f61ff81866beb67aff95a353a/docling_core-2.38.1-py3-none-any.whl", hash = "sha256:6859313561030503e8b53aec535aa5edb765a679af76ce2e2c60722d78c6c613", size = 151570, upload-time = "2025-06-20T12:28:46.764Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
@ -3387,10 +3387,10 @@ name = "ocrmac"
|
||||
version = "1.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" },
|
||||
{ name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'darwin'" },
|
||||
{ name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||
{ name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pyobjc-framework-vision" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" }
|
||||
wheels = [
|
||||
@ -4414,7 +4414,7 @@ name = "pyobjc-framework-cocoa"
|
||||
version = "11.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-core" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" }
|
||||
wheels = [
|
||||
@ -4433,8 +4433,8 @@ name = "pyobjc-framework-coreml"
|
||||
version = "11.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-framework-cocoa" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" }
|
||||
wheels = [
|
||||
@ -4453,8 +4453,8 @@ name = "pyobjc-framework-quartz"
|
||||
version = "11.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-framework-cocoa" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" }
|
||||
wheels = [
|
||||
@ -4473,10 +4473,10 @@ name = "pyobjc-framework-vision"
|
||||
version = "11.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-framework-cocoa" },
|
||||
{ name = "pyobjc-framework-coreml" },
|
||||
{ name = "pyobjc-framework-quartz" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" }
|
||||
wheels = [
|
||||
@ -4957,17 +4957,17 @@ source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
|
||||
{ name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.13') or (python_full_version >= '3.11' and platform_machine != 'arm64') or (python_full_version >= '3.11' and sys_platform != 'darwin')" },
|
||||
{ name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
|
||||
{ name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||
{ name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
|
||||
{ name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
||||
{ name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
||||
{ name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
||||
{ name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
||||
{ name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
||||
{ name = "opencv-python" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pyclipper" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||
{ name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
|
||||
{ name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
||||
{ name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
||||
{ name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
||||
{ name = "six" },
|
||||
{ name = "tqdm" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" },
|
||||
|
Loading…
Reference in New Issue
Block a user