mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Update all test cases again
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
commit
6158a2e784
@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import Formatting
|
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.document import Document as DocxDocument
|
from docx.document import Document as DocxDocument
|
||||||
from docx.oxml.table import CT_Tc
|
from docx.oxml.table import CT_Tc
|
||||||
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.valid = True
|
self.valid = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
@override
|
@override
|
||||||
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self._handle_tables(element, docx_obj, doc)
|
self._handle_tables(element, docx_obj, doc)
|
||||||
except Exception:
|
except Exception:
|
||||||
_log.debug("could not parse a table, broken docx table")
|
_log.debug("could not parse a table, broken docx table")
|
||||||
|
# Check for Image
|
||||||
elif drawing_blip:
|
elif drawing_blip:
|
||||||
self._handle_pictures(docx_obj, drawing_blip, doc)
|
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||||
|
# Check for Text after the Image
|
||||||
|
if (
|
||||||
|
tag_name in ["p"]
|
||||||
|
and element.find(".//w:t", namespaces=namespaces) is not None
|
||||||
|
):
|
||||||
|
self._handle_text_elements(element, docx_obj, doc)
|
||||||
# Check for the sdt containers, like table of contents
|
# Check for the sdt containers, like table of contents
|
||||||
elif tag_name in ["sdt"]:
|
elif tag_name in ["sdt"]:
|
||||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||||
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self._handle_text_elements(element, docx_obj, doc)
|
self._handle_text_elements(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _str_to_int(
|
def _str_to_int(
|
||||||
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
all_paragraphs = []
|
all_paragraphs = []
|
||||||
|
|
||||||
# Sort paragraphs within each container, then process containers
|
# Sort paragraphs within each container, then process containers
|
||||||
for container_id, paragraphs in container_paragraphs.items():
|
for paragraphs in container_paragraphs.values():
|
||||||
# Sort by vertical position within each container
|
# Sort by vertical position within each container
|
||||||
sorted_container_paragraphs = sorted(
|
sorted_container_paragraphs = sorted(
|
||||||
paragraphs,
|
paragraphs,
|
||||||
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
) -> None:
|
) -> None:
|
||||||
paragraph = Paragraph(element, docx_obj)
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||||
text, equations = self._handle_equations_in_text(
|
text, equations = self._handle_equations_in_text(
|
||||||
element=element, text=paragraph.text
|
element=element, text=paragraph.text
|
||||||
)
|
)
|
||||||
|
|
||||||
if text is None:
|
if text is None:
|
||||||
return
|
return
|
||||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _add_formatted_list_item(
|
||||||
|
self,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
elements: list,
|
||||||
|
marker: str,
|
||||||
|
enumerated: bool,
|
||||||
|
level: int,
|
||||||
|
) -> None:
|
||||||
|
# This should not happen by construction
|
||||||
|
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
||||||
|
return
|
||||||
|
if len(elements) == 1:
|
||||||
|
text, format, hyperlink = elements[0]
|
||||||
|
doc.add_list_item(
|
||||||
|
marker=marker,
|
||||||
|
enumerated=enumerated,
|
||||||
|
parent=self.parents[level],
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
new_item = doc.add_list_item(
|
||||||
|
marker=marker,
|
||||||
|
enumerated=enumerated,
|
||||||
|
parent=self.parents[level],
|
||||||
|
text="",
|
||||||
|
)
|
||||||
|
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
||||||
|
for text, format, hyperlink in elements:
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.TEXT,
|
||||||
|
parent=new_parent,
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
def _add_list_item(
|
def _add_list_item(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elements: list,
|
elements: list,
|
||||||
is_numbered: bool = False,
|
is_numbered: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||||
|
if not elements:
|
||||||
|
return None
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
|
|
||||||
level = self._get_level()
|
level = self._get_level()
|
||||||
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
new_parent = self._create_or_reuse_parent(
|
self._add_formatted_list_item(
|
||||||
doc=doc,
|
doc, elements, enum_marker, is_numbered, level
|
||||||
prev_parent=self.parents[level],
|
|
||||||
paragraph_elements=elements,
|
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self._prev_numid() == numid
|
self._prev_numid() == numid
|
||||||
and self.level_at_new_list is not None
|
and self.level_at_new_list is not None
|
||||||
@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
self._add_formatted_list_item(
|
||||||
new_parent = self._create_or_reuse_parent(
|
doc,
|
||||||
doc=doc,
|
elements,
|
||||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
enum_marker,
|
||||||
paragraph_elements=elements,
|
is_numbered,
|
||||||
|
self.level_at_new_list + ilevel,
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
elif (
|
elif (
|
||||||
self._prev_numid() == numid
|
self._prev_numid() == numid
|
||||||
and self.level_at_new_list is not None
|
and self.level_at_new_list is not None
|
||||||
and prev_indent is not None
|
and prev_indent is not None
|
||||||
and ilevel < prev_indent
|
and ilevel < prev_indent
|
||||||
): # Close list
|
): # Close list
|
||||||
for k, v in self.parents.items():
|
for k in self.parents:
|
||||||
if k > self.level_at_new_list + ilevel:
|
if k > self.level_at_new_list + ilevel:
|
||||||
self.parents[k] = None
|
self.parents[k] = None
|
||||||
|
|
||||||
@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
new_parent = self._create_or_reuse_parent(
|
self._add_formatted_list_item(
|
||||||
doc=doc,
|
doc,
|
||||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
elements,
|
||||||
paragraph_elements=elements,
|
enum_marker,
|
||||||
|
is_numbered,
|
||||||
|
self.level_at_new_list + ilevel,
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||||
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
new_parent = self._create_or_reuse_parent(
|
self._add_formatted_list_item(
|
||||||
doc=doc,
|
doc, elements, enum_marker, is_numbered, level - 1
|
||||||
prev_parent=self.parents[level - 1],
|
|
||||||
paragraph_elements=elements,
|
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
# Add the list item to the parent group
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def _handle_tables(
|
def _handle_tables(
|
||||||
|
BIN
tests/data/docx/word_image_anchors.docx
vendored
Normal file
BIN
tests/data/docx/word_image_anchors.docx
vendored
Normal file
Binary file not shown.
@ -12,9 +12,7 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` .
|
|||||||
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
||||||
5. Open a Pull Request
|
5. Open a Pull Request
|
||||||
|
|
||||||
##
|
## *Second* section
|
||||||
|
|
||||||
*Second* section
|
|
||||||
|
|
||||||
- **First** : Lorem ipsum.
|
- **First** : Lorem ipsum.
|
||||||
- **Second** : Dolor `sit` amet.
|
- **Second** : Dolor `sit` amet.
|
||||||
|
134
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
134
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
@ -11,84 +11,82 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
* Blisters
|
* Blisters
|
||||||
* Headache
|
* Headache
|
||||||
* Sore throat
|
* Sore throat
|
||||||
item-9 at level 1: list: group group
|
item-9 at level 1: paragraph:
|
||||||
item-10 at level 2: list_item:
|
item-10 at level 1: paragraph:
|
||||||
item-11 at level 1: paragraph:
|
item-11 at level 1: section: group textbox
|
||||||
item-12 at level 1: paragraph:
|
item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||||
item-13 at level 1: section: group textbox
|
item-13 at level 1: paragraph:
|
||||||
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
item-14 at level 1: paragraph:
|
||||||
item-15 at level 1: paragraph:
|
item-15 at level 1: paragraph:
|
||||||
item-16 at level 1: paragraph:
|
item-16 at level 1: paragraph:
|
||||||
item-17 at level 1: paragraph:
|
item-17 at level 1: section: group textbox
|
||||||
item-18 at level 1: paragraph:
|
item-18 at level 2: paragraph: Yes
|
||||||
item-19 at level 1: section: group textbox
|
item-19 at level 1: paragraph:
|
||||||
item-20 at level 2: paragraph: Yes
|
item-20 at level 1: paragraph:
|
||||||
item-21 at level 1: paragraph:
|
item-21 at level 1: section: group textbox
|
||||||
item-22 at level 1: paragraph:
|
item-22 at level 2: list: group list
|
||||||
item-23 at level 1: section: group textbox
|
item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
|
||||||
item-24 at level 2: list: group list
|
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
|
||||||
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
|
item-25 at level 2: paragraph:
|
||||||
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
|
item-26 at level 1: list: group list
|
||||||
item-27 at level 2: paragraph:
|
item-27 at level 2: list_item:
|
||||||
item-28 at level 1: list: group list
|
item-28 at level 1: paragraph:
|
||||||
item-29 at level 2: list_item:
|
item-29 at level 1: paragraph:
|
||||||
item-30 at level 1: paragraph:
|
item-30 at level 1: paragraph:
|
||||||
item-31 at level 1: paragraph:
|
item-31 at level 1: paragraph:
|
||||||
item-32 at level 1: paragraph:
|
item-32 at level 1: paragraph:
|
||||||
item-33 at level 1: paragraph:
|
item-33 at level 1: section: group textbox
|
||||||
item-34 at level 1: paragraph:
|
item-34 at level 2: paragraph: Health Bureau:
|
||||||
item-35 at level 1: section: group textbox
|
item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||||
item-36 at level 2: paragraph: Health Bureau:
|
item-36 at level 2: list: group list
|
||||||
item-37 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||||
item-38 at level 2: list: group list
|
item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||||
item-39 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
item-39 at level 2: paragraph:
|
||||||
item-40 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
item-40 at level 1: list: group list
|
||||||
item-41 at level 2: paragraph:
|
item-41 at level 2: list_item:
|
||||||
item-42 at level 1: list: group list
|
item-42 at level 1: paragraph:
|
||||||
item-43 at level 2: list_item:
|
item-43 at level 1: section: group textbox
|
||||||
item-44 at level 1: paragraph:
|
item-44 at level 2: paragraph: Department of Education:
|
||||||
item-45 at level 1: section: group textbox
|
|
||||||
item-46 at level 2: paragraph: Department of Education:
|
|
||||||
Collabo ... vention measures at all school levels.
|
Collabo ... vention measures at all school levels.
|
||||||
|
item-45 at level 1: paragraph:
|
||||||
|
item-46 at level 1: paragraph:
|
||||||
item-47 at level 1: paragraph:
|
item-47 at level 1: paragraph:
|
||||||
item-48 at level 1: paragraph:
|
item-48 at level 1: paragraph:
|
||||||
item-49 at level 1: paragraph:
|
item-49 at level 1: paragraph:
|
||||||
item-50 at level 1: paragraph:
|
item-50 at level 1: paragraph:
|
||||||
item-51 at level 1: paragraph:
|
item-51 at level 1: paragraph:
|
||||||
item-52 at level 1: paragraph:
|
item-52 at level 1: section: group textbox
|
||||||
item-53 at level 1: paragraph:
|
item-53 at level 2: inline: group group
|
||||||
item-54 at level 1: section: group textbox
|
item-54 at level 3: paragraph: The Health Bureau will handle
|
||||||
item-55 at level 2: inline: group group
|
item-55 at level 3: paragraph: reporting and specimen collection
|
||||||
item-56 at level 3: paragraph: The Health Bureau will handle
|
item-56 at level 3: paragraph: .
|
||||||
item-57 at level 3: paragraph: reporting and specimen collection
|
item-57 at level 2: paragraph:
|
||||||
item-58 at level 3: paragraph: .
|
item-58 at level 1: paragraph:
|
||||||
item-59 at level 2: paragraph:
|
item-59 at level 1: paragraph:
|
||||||
item-60 at level 1: paragraph:
|
item-60 at level 1: paragraph:
|
||||||
item-61 at level 1: paragraph:
|
item-61 at level 1: section: group textbox
|
||||||
item-62 at level 1: paragraph:
|
item-62 at level 2: paragraph: Whether the epidemic has eased.
|
||||||
item-63 at level 1: section: group textbox
|
item-63 at level 2: paragraph:
|
||||||
item-64 at level 2: paragraph: Whether the epidemic has eased.
|
item-64 at level 1: paragraph:
|
||||||
item-65 at level 2: paragraph:
|
item-65 at level 1: section: group textbox
|
||||||
item-66 at level 1: paragraph:
|
item-66 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||||
item-67 at level 1: section: group textbox
|
item-67 at level 2: paragraph: No
|
||||||
item-68 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
item-68 at level 1: paragraph:
|
||||||
item-69 at level 2: paragraph: No
|
item-69 at level 1: paragraph:
|
||||||
item-70 at level 1: paragraph:
|
item-70 at level 1: section: group textbox
|
||||||
item-71 at level 1: paragraph:
|
item-71 at level 2: paragraph: Yes
|
||||||
item-72 at level 1: section: group textbox
|
item-72 at level 1: paragraph:
|
||||||
item-73 at level 2: paragraph: Yes
|
item-73 at level 1: section: group textbox
|
||||||
item-74 at level 1: paragraph:
|
item-74 at level 2: paragraph: Yes
|
||||||
item-75 at level 1: section: group textbox
|
item-75 at level 1: paragraph:
|
||||||
item-76 at level 2: paragraph: Yes
|
item-76 at level 1: paragraph:
|
||||||
item-77 at level 1: paragraph:
|
item-77 at level 1: section: group textbox
|
||||||
item-78 at level 1: paragraph:
|
item-78 at level 2: paragraph: Case closed.
|
||||||
item-79 at level 1: section: group textbox
|
item-79 at level 2: paragraph:
|
||||||
item-80 at level 2: paragraph: Case closed.
|
item-80 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||||
item-81 at level 2: paragraph:
|
item-81 at level 1: paragraph:
|
||||||
item-82 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
item-82 at level 1: section: group textbox
|
||||||
item-83 at level 1: paragraph:
|
item-83 at level 2: paragraph: No
|
||||||
item-84 at level 1: section: group textbox
|
item-84 at level 1: paragraph:
|
||||||
item-85 at level 2: paragraph: No
|
item-85 at level 1: paragraph:
|
||||||
item-86 at level 1: paragraph:
|
item-86 at level 1: paragraph:
|
||||||
item-87 at level 1: paragraph:
|
|
||||||
item-88 at level 1: paragraph:
|
|
@ -29,9 +29,6 @@
|
|||||||
{
|
{
|
||||||
"$ref": "#/groups/0"
|
"$ref": "#/groups/0"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"$ref": "#/groups/19"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/6"
|
"$ref": "#/texts/6"
|
||||||
},
|
},
|
||||||
@ -492,20 +489,6 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"name": "textbox",
|
"name": "textbox",
|
||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/19",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/body"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/67"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"texts": [
|
"texts": [
|
||||||
@ -1494,20 +1477,6 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "",
|
"orig": "",
|
||||||
"text": ""
|
"text": ""
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/67",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/19"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "",
|
|
||||||
"text": "",
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"pictures": [],
|
"pictures": [],
|
||||||
|
@ -17,21 +17,16 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-16 at level 2: list_item: Italic bullet 1
|
item-16 at level 2: list_item: Italic bullet 1
|
||||||
item-17 at level 2: list_item: Bold bullet 2
|
item-17 at level 2: list_item: Bold bullet 2
|
||||||
item-18 at level 2: list_item: Underline bullet 3
|
item-18 at level 2: list_item: Underline bullet 3
|
||||||
item-19 at level 2: inline: group group
|
item-19 at level 2: list_item:
|
||||||
item-20 at level 3: list: group group
|
item-20 at level 3: inline: group group
|
||||||
item-21 at level 4: list_item: Some
|
item-21 at level 4: text: Some
|
||||||
item-22 at level 3: list: group group
|
item-22 at level 4: text: italic
|
||||||
item-23 at level 4: list_item: italic
|
item-23 at level 4: text: bold
|
||||||
item-24 at level 3: list: group group
|
item-24 at level 4: text: underline
|
||||||
item-25 at level 4: list_item: bold
|
item-25 at level 2: list: group list
|
||||||
item-26 at level 3: list: group group
|
item-26 at level 3: list_item:
|
||||||
item-27 at level 4: list_item: underline
|
item-27 at level 4: inline: group group
|
||||||
item-28 at level 2: list: group list
|
item-28 at level 5: text: Nested
|
||||||
item-29 at level 3: inline: group group
|
item-29 at level 5: text: italic
|
||||||
item-30 at level 4: list: group group
|
item-30 at level 5: text: bold
|
||||||
item-31 at level 5: list_item: Nested
|
item-31 at level 1: paragraph:
|
||||||
item-32 at level 4: list: group group
|
|
||||||
item-33 at level 5: list_item: italic
|
|
||||||
item-34 at level 4: list: group group
|
|
||||||
item-35 at level 5: list_item: bold
|
|
||||||
item-36 at level 1: paragraph:
|
|
@ -42,7 +42,7 @@
|
|||||||
"$ref": "#/groups/1"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/16"
|
"$ref": "#/texts/25"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -98,7 +98,7 @@
|
|||||||
"$ref": "#/texts/15"
|
"$ref": "#/texts/15"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/2"
|
"$ref": "#/texts/16"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/3"
|
"$ref": "#/groups/3"
|
||||||
@ -111,20 +111,20 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/groups/2",
|
"self_ref": "#/groups/2",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/1"
|
"$ref": "#/texts/16"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/11"
|
"$ref": "#/texts/17"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/10"
|
"$ref": "#/texts/18"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/9"
|
"$ref": "#/texts/19"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/8"
|
"$ref": "#/texts/20"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -138,7 +138,7 @@
|
|||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/4"
|
"$ref": "#/texts/21"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -148,120 +148,22 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/groups/4",
|
"self_ref": "#/groups/4",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/3"
|
"$ref": "#/texts/21"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/7"
|
"$ref": "#/texts/22"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/6"
|
"$ref": "#/texts/23"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/texts/24"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"name": "group",
|
"name": "group",
|
||||||
"label": "inline"
|
"label": "inline"
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/5",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/4"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/17"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/6",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/4"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/18"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/7",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/4"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/19"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/8",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/2"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/20"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/9",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/2"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/21"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/10",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/2"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/22"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/11",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/2"
|
|
||||||
},
|
|
||||||
"children": [
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/23"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "group",
|
|
||||||
"label": "list"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"texts": [
|
"texts": [
|
||||||
@ -574,149 +476,29 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/16",
|
"self_ref": "#/texts/16",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
}
|
||||||
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "",
|
"orig": "",
|
||||||
"text": ""
|
"text": "",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/17",
|
"self_ref": "#/texts/17",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/2"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "list_item",
|
"label": "text",
|
||||||
"prov": [],
|
|
||||||
"orig": "bold",
|
|
||||||
"text": "bold",
|
|
||||||
"formatting": {
|
|
||||||
"bold": true,
|
|
||||||
"italic": false,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false,
|
|
||||||
"script": "baseline"
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/18",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/6"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "italic",
|
|
||||||
"text": "italic",
|
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": true,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false,
|
|
||||||
"script": "baseline"
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/19",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/7"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "Nested",
|
|
||||||
"text": "Nested",
|
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": false,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false,
|
|
||||||
"script": "baseline"
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/20",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/8"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "underline",
|
|
||||||
"text": "underline",
|
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": false,
|
|
||||||
"underline": true,
|
|
||||||
"strikethrough": false,
|
|
||||||
"script": "baseline"
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/21",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/9"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "bold",
|
|
||||||
"text": "bold",
|
|
||||||
"formatting": {
|
|
||||||
"bold": true,
|
|
||||||
"italic": false,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false,
|
|
||||||
"script": "baseline"
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/22",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/10"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "italic",
|
|
||||||
"text": "italic",
|
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": true,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false,
|
|
||||||
"script": "baseline"
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/23",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/11"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Some",
|
"orig": "Some",
|
||||||
"text": "Some",
|
"text": "Some",
|
||||||
@ -726,9 +508,151 @@
|
|||||||
"underline": false,
|
"underline": false,
|
||||||
"strikethrough": false,
|
"strikethrough": false,
|
||||||
"script": "baseline"
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/18",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
},
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/19",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/20",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "underline",
|
||||||
|
"text": "underline",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": true,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/21",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": "",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/22",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Nested",
|
||||||
|
"text": "Nested",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/23",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/24",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/25",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"pictures": [],
|
"pictures": [],
|
||||||
|
@ -13,5 +13,5 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
|
|||||||
- *Italic bullet 1*
|
- *Italic bullet 1*
|
||||||
- **Bold bullet 2**
|
- **Bold bullet 2**
|
||||||
- Underline bullet 3
|
- Underline bullet 3
|
||||||
- Some - *italic* - **bold** - underline
|
- Some *italic* **bold** underline
|
||||||
- Nested - *italic* - **bold**
|
- Nested *italic* **bold**
|
16
tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
vendored
Normal file
16
tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: paragraph: Transcript
|
||||||
|
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
|
||||||
|
item-3 at level 1: picture
|
||||||
|
item-4 at level 1: inline: group group
|
||||||
|
item-5 at level 2: paragraph: This is test 1
|
||||||
|
item-6 at level 2: paragraph: 0:08
|
||||||
|
Correct, he is not.
|
||||||
|
item-7 at level 1: paragraph:
|
||||||
|
item-8 at level 1: picture
|
||||||
|
item-9 at level 1: inline: group group
|
||||||
|
item-10 at level 2: paragraph: This is test 2
|
||||||
|
item-11 at level 2: paragraph: 0:16
|
||||||
|
Yeah, exactly.
|
||||||
|
item-12 at level 1: paragraph:
|
||||||
|
item-13 at level 1: paragraph:
|
292
tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
vendored
Normal file
292
tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
vendored
Normal file
@ -0,0 +1,292 @@
|
|||||||
|
{
|
||||||
|
"schema_name": "DoclingDocument",
|
||||||
|
"version": "1.4.0",
|
||||||
|
"name": "word_image_anchors",
|
||||||
|
"origin": {
|
||||||
|
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"binary_hash": 2428692234257307633,
|
||||||
|
"filename": "word_image_anchors.docx"
|
||||||
|
},
|
||||||
|
"furniture": {
|
||||||
|
"self_ref": "#/furniture",
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"body": {
|
||||||
|
"self_ref": "#/body",
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/pictures/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/pictures/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/8"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/3"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/6"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"texts": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Transcript",
|
||||||
|
"text": "Transcript",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "February 20, 2025, 8:32PM",
|
||||||
|
"text": "February 20, 2025, 8:32PM",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/2",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "This is test 1",
|
||||||
|
"text": "This is test 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/3",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "0:08\nCorrect, he is not.",
|
||||||
|
"text": "0:08\nCorrect, he is not.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "This is test 2",
|
||||||
|
"text": "This is test 2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "0:16\nYeah, exactly.",
|
||||||
|
"text": "0:16\nYeah, exactly.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/7",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/8",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pictures": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/pictures/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "picture",
|
||||||
|
"prov": [],
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"image": {
|
||||||
|
"mimetype": "image/png",
|
||||||
|
"dpi": 72,
|
||||||
|
"size": {
|
||||||
|
"width": 100.0,
|
||||||
|
"height": 100.0
|
||||||
|
},
|
||||||
|
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
|
||||||
|
},
|
||||||
|
"annotations": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/pictures/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "picture",
|
||||||
|
"prov": [],
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"image": {
|
||||||
|
"mimetype": "image/png",
|
||||||
|
"dpi": 72,
|
||||||
|
"size": {
|
||||||
|
"width": 100.0,
|
||||||
|
"height": 100.0
|
||||||
|
},
|
||||||
|
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxodtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
|
||||||
|
},
|
||||||
|
"annotations": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"tables": [],
|
||||||
|
"key_value_items": [],
|
||||||
|
"form_items": [],
|
||||||
|
"pages": {}
|
||||||
|
}
|
13
tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
vendored
Normal file
13
tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
**Transcript**
|
||||||
|
|
||||||
|
February 20, 2025, 8:32PM
|
||||||
|
|
||||||
|
<!-- image -->
|
||||||
|
|
||||||
|
**This is test 1** 0:08
|
||||||
|
Correct, he is not.
|
||||||
|
|
||||||
|
<!-- image -->
|
||||||
|
|
||||||
|
**This is test 2** 0:16
|
||||||
|
Yeah, exactly.
|
@ -9,6 +9,7 @@ from docling.datamodel.document import (
|
|||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
|
TextItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
|||||||
|
|
||||||
pred_md: str = doc.export_to_markdown()
|
pred_md: str = doc.export_to_markdown()
|
||||||
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
||||||
"export to md"
|
f"export to markdown failed on {docx_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
pred_itxt: str = doc._export_to_indented_text(
|
pred_itxt: str = doc._export_to_indented_text(
|
||||||
max_text_len=70, explicit_tables=False
|
max_text_len=70, explicit_tables=False
|
||||||
)
|
)
|
||||||
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
||||||
"export to indented-text"
|
f"export to indented-text failed on {docx_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
||||||
"document document"
|
f"DoclingDocument verification failed on {docx_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if docx_path.name == "word_tables.docx":
|
if docx_path.name == "word_tables.docx":
|
||||||
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
|||||||
pred_text=pred_html,
|
pred_text=pred_html,
|
||||||
gtfile=str(gt_path) + ".html",
|
gtfile=str(gt_path) + ".html",
|
||||||
generate=GENERATE,
|
generate=GENERATE,
|
||||||
), "export to html"
|
), f"export to html failed on {docx_path}"
|
||||||
|
|
||||||
|
|
||||||
flaky_path = Path("tests/data/docx/textbox.docx")
|
flaky_path = Path("tests/data/docx/textbox.docx")
|
||||||
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
|
|||||||
@pytest.mark.xfail(strict=False)
|
@pytest.mark.xfail(strict=False)
|
||||||
def test_textbox_conversion():
|
def test_textbox_conversion():
|
||||||
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
|
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_after_image_anchors():
|
||||||
|
"""
|
||||||
|
Test to analyse whether text gets parsed after image anchors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
in_path = Path("tests/data/docx/word_image_anchors.docx")
|
||||||
|
in_doc = InputDocument(
|
||||||
|
path_or_stream=in_path,
|
||||||
|
format=InputFormat.DOCX,
|
||||||
|
backend=MsWordDocumentBackend,
|
||||||
|
)
|
||||||
|
backend = MsWordDocumentBackend(
|
||||||
|
in_doc=in_doc,
|
||||||
|
path_or_stream=in_path,
|
||||||
|
)
|
||||||
|
doc = backend.convert()
|
||||||
|
|
||||||
|
found_text_after_anchor_1 = found_text_after_anchor_2 = (
|
||||||
|
found_text_after_anchor_3
|
||||||
|
) = found_text_after_anchor_4 = False
|
||||||
|
for item, _ in doc.iterate_items():
|
||||||
|
if isinstance(item, TextItem):
|
||||||
|
if item.text == "This is test 1":
|
||||||
|
found_text_after_anchor_1 = True
|
||||||
|
elif item.text == "0:08\nCorrect, he is not.":
|
||||||
|
found_text_after_anchor_2 = True
|
||||||
|
elif item.text == "This is test 2":
|
||||||
|
found_text_after_anchor_3 = True
|
||||||
|
elif item.text == "0:16\nYeah, exactly.":
|
||||||
|
found_text_after_anchor_4 = True
|
||||||
|
|
||||||
|
assert (
|
||||||
|
found_text_after_anchor_1
|
||||||
|
and found_text_after_anchor_2
|
||||||
|
and found_text_after_anchor_3
|
||||||
|
and found_text_after_anchor_4
|
||||||
|
)
|
||||||
|
50
uv.lock
generated
50
uv.lock
generated
@ -983,7 +983,7 @@ examples = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-core"
|
name = "docling-core"
|
||||||
version = "2.38.0"
|
version = "2.38.1"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "jsonref" },
|
{ name = "jsonref" },
|
||||||
@ -997,9 +997,9 @@ dependencies = [
|
|||||||
{ name = "typer" },
|
{ name = "typer" },
|
||||||
{ name = "typing-extensions" },
|
{ name = "typing-extensions" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/01/3d/02b4926567735c252b4750074f9dfc96d06078566f067eb47c13713952a2/docling_core-2.38.0.tar.gz", hash = "sha256:3bad4c476cc798e29d01b02ea383b5582d7031e9595b177be0a9450f2eb7bef6", size = 145997, upload-time = "2025-06-18T12:35:23.81Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/38/f7/33bb17aa13e73722bf18ecfb7f13d6fbfb384c22003209bd72708123b33f/docling_core-2.38.1.tar.gz", hash = "sha256:a0566df2316eec4d22953ca7dac839b926dd57549b4c07ac810e87dbbaf91a10", size = 146276, upload-time = "2025-06-20T12:28:48.422Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/3c/52/e65521ec8ae7ecbce2f9dd95dbf4164b4d4c58c29136e1489a038ce9a2fc/docling_core-2.38.0-py3-none-any.whl", hash = "sha256:8f27d7074a99913f2ba73bde363bbed3416852014eda136bb8880d37805c6950", size = 151276, upload-time = "2025-06-18T12:35:22.25Z" },
|
{ url = "https://files.pythonhosted.org/packages/f0/c5/fb2e24602db94ec02cc3ac8eb7b9665f2a5f61ff81866beb67aff95a353a/docling_core-2.38.1-py3-none-any.whl", hash = "sha256:6859313561030503e8b53aec535aa5edb765a679af76ce2e2c60722d78c6c613", size = 151570, upload-time = "2025-06-20T12:28:46.764Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
@ -3387,10 +3387,10 @@ name = "ocrmac"
|
|||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" },
|
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||||
{ name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'darwin'" },
|
{ name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
||||||
{ name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pillow" },
|
||||||
{ name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-framework-vision" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
@ -4414,7 +4414,7 @@ name = "pyobjc-framework-cocoa"
|
|||||||
version = "11.1"
|
version = "11.1"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-core" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
@ -4433,8 +4433,8 @@ name = "pyobjc-framework-coreml"
|
|||||||
version = "11.1"
|
version = "11.1"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-core" },
|
||||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-framework-cocoa" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
@ -4453,8 +4453,8 @@ name = "pyobjc-framework-quartz"
|
|||||||
version = "11.1"
|
version = "11.1"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-core" },
|
||||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-framework-cocoa" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
@ -4473,10 +4473,10 @@ name = "pyobjc-framework-vision"
|
|||||||
version = "11.1"
|
version = "11.1"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-core" },
|
||||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-framework-cocoa" },
|
||||||
{ name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-framework-coreml" },
|
||||||
{ name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" },
|
{ name = "pyobjc-framework-quartz" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
@ -4957,17 +4957,17 @@ source = { registry = "https://pypi.org/simple" }
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||||
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
|
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
|
||||||
{ name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.13') or (python_full_version >= '3.11' and platform_machine != 'arm64') or (python_full_version >= '3.11' and sys_platform != 'darwin')" },
|
{ name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
|
||||||
{ name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
{ name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||||
{ name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
|
{ name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
||||||
{ name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
{ name = "opencv-python" },
|
||||||
{ name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
{ name = "pillow" },
|
||||||
{ name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
{ name = "pyclipper" },
|
||||||
{ name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
{ name = "pyyaml" },
|
||||||
{ name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
{ name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
||||||
{ name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" },
|
{ name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
||||||
{ name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
{ name = "six" },
|
||||||
{ name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" },
|
{ name = "tqdm" },
|
||||||
]
|
]
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" },
|
{ url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" },
|
||||||
|
Loading…
Reference in New Issue
Block a user