diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 44a0f2cf..8386082a 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -14,7 +14,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.valid = True except Exception as e: raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" + f"MsWordDocumentBackend could not load document with hash {self.document_hash}" ) from e @override @@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") - + # Check for Image elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + # Check for Text after the Image + if ( + tag_name in ["p"] + and element.find(".//w:t", namespaces=namespaces) is not None + ): + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") + return doc def _str_to_int( @@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): all_paragraphs = [] # Sort paragraphs within each container, then process containers - for container_id, paragraphs in container_paragraphs.items(): + for paragraphs in container_paragraphs.values(): # Sort by vertical position within each container sorted_container_paragraphs = sorted( paragraphs, @@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, ) -> None: paragraph = Paragraph(element, docx_obj) - + paragraph_elements = self._get_paragraph_elements(paragraph) text, equations = self._handle_equations_in_text( element=element, text=paragraph.text ) if text is None: return - paragraph_elements = self._get_paragraph_elements(paragraph) text = text.strip() # Common styles for bullet and numbered lists. @@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return + def _add_formatted_list_item( + self, + doc: DoclingDocument, + elements: list, + marker: str, + enumerated: bool, + level: int, + ) -> None: + # This should not happen by construction + if not isinstance(self.parents[level], (OrderedList, UnorderedList)): + return + if len(elements) == 1: + text, format, hyperlink = elements[0] + doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text=text, + formatting=format, + hyperlink=hyperlink, + ) + else: + new_item = doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text="", + ) + new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item) + for text, format, hyperlink in elements: + doc.add_text( + label=DocItemLabel.TEXT, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) + def _add_list_item( self, *, @@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elements: list, is_numbered: bool = False, ) -> None: + # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed. + if not elements: + return None enum_marker = "" level = self._get_level() @@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) - elif ( self._prev_numid() == numid and self.level_at_new_list is not None @@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) elif ( self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and ilevel < prev_indent ): # Close list - for k, v in self.parents.items(): + for k in self.parents: if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) self.listIter = 0 elif self._prev_numid() == numid or prev_indent == ilevel: @@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level - 1], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level - 1 ) - for text, format, hyperlink in elements: - # Add the list item to the parent group - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) + return def _handle_tables( diff --git a/tests/data/docx/word_image_anchors.docx b/tests/data/docx/word_image_anchors.docx new file mode 100644 index 00000000..c0b030c3 Binary files /dev/null and b/tests/data/docx/word_image_anchors.docx differ diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md index 31c3f3be..28fa0d45 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md @@ -12,9 +12,7 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` . 4. Push to the branch ( `git push origin feature/AmazingFeature` ) 5. Open a Pull Request -## - -*Second* section +## *Second* section - **First** : Lorem ipsum. - **Second** : Dolor `sit` amet. diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index 1372608d..fa4ae97a 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -11,84 +11,82 @@ item-0 at level 0: unspecified: group _root_ * Blisters * Headache * Sore throat - item-9 at level 1: list: group group - item-10 at level 2: list_item: - item-11 at level 1: paragraph: - item-12 at level 1: paragraph: - item-13 at level 1: section: group textbox - item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-9 at level 1: paragraph: + item-10 at level 1: paragraph: + item-11 at level 1: section: group textbox + item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-13 at level 1: paragraph: + item-14 at level 1: paragraph: item-15 at level 1: paragraph: item-16 at level 1: paragraph: - item-17 at level 1: paragraph: - item-18 at level 1: paragraph: - item-19 at level 1: section: group textbox - item-20 at level 2: paragraph: Yes - item-21 at level 1: paragraph: - item-22 at level 1: paragraph: - item-23 at level 1: section: group textbox - item-24 at level 2: list: group list - item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. - item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. - item-27 at level 2: paragraph: - item-28 at level 1: list: group list - item-29 at level 2: list_item: + item-17 at level 1: section: group textbox + item-18 at level 2: paragraph: Yes + item-19 at level 1: paragraph: + item-20 at level 1: paragraph: + item-21 at level 1: section: group textbox + item-22 at level 2: list: group list + item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. + item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. + item-25 at level 2: paragraph: + item-26 at level 1: list: group list + item-27 at level 2: list_item: + item-28 at level 1: paragraph: + item-29 at level 1: paragraph: item-30 at level 1: paragraph: item-31 at level 1: paragraph: item-32 at level 1: paragraph: - item-33 at level 1: paragraph: - item-34 at level 1: paragraph: - item-35 at level 1: section: group textbox - item-36 at level 2: paragraph: Health Bureau: - item-37 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-38 at level 2: list: group list - item-39 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-40 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-41 at level 2: paragraph: - item-42 at level 1: list: group list - item-43 at level 2: list_item: - item-44 at level 1: paragraph: - item-45 at level 1: section: group textbox - item-46 at level 2: paragraph: Department of Education: + item-33 at level 1: section: group textbox + item-34 at level 2: paragraph: Health Bureau: + item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-36 at level 2: list: group list + item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-39 at level 2: paragraph: + item-40 at level 1: list: group list + item-41 at level 2: list_item: + item-42 at level 1: paragraph: + item-43 at level 1: section: group textbox + item-44 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-45 at level 1: paragraph: + item-46 at level 1: paragraph: item-47 at level 1: paragraph: item-48 at level 1: paragraph: item-49 at level 1: paragraph: item-50 at level 1: paragraph: item-51 at level 1: paragraph: - item-52 at level 1: paragraph: - item-53 at level 1: paragraph: - item-54 at level 1: section: group textbox - item-55 at level 2: inline: group group - item-56 at level 3: paragraph: The Health Bureau will handle - item-57 at level 3: paragraph: reporting and specimen collection - item-58 at level 3: paragraph: . - item-59 at level 2: paragraph: + item-52 at level 1: section: group textbox + item-53 at level 2: inline: group group + item-54 at level 3: paragraph: The Health Bureau will handle + item-55 at level 3: paragraph: reporting and specimen collection + item-56 at level 3: paragraph: . + item-57 at level 2: paragraph: + item-58 at level 1: paragraph: + item-59 at level 1: paragraph: item-60 at level 1: paragraph: - item-61 at level 1: paragraph: - item-62 at level 1: paragraph: - item-63 at level 1: section: group textbox - item-64 at level 2: paragraph: Whether the epidemic has eased. - item-65 at level 2: paragraph: - item-66 at level 1: paragraph: - item-67 at level 1: section: group textbox - item-68 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-69 at level 2: paragraph: No - item-70 at level 1: paragraph: - item-71 at level 1: paragraph: - item-72 at level 1: section: group textbox - item-73 at level 2: paragraph: Yes - item-74 at level 1: paragraph: - item-75 at level 1: section: group textbox - item-76 at level 2: paragraph: Yes - item-77 at level 1: paragraph: - item-78 at level 1: paragraph: - item-79 at level 1: section: group textbox - item-80 at level 2: paragraph: Case closed. - item-81 at level 2: paragraph: - item-82 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. - item-83 at level 1: paragraph: - item-84 at level 1: section: group textbox - item-85 at level 2: paragraph: No - item-86 at level 1: paragraph: - item-87 at level 1: paragraph: - item-88 at level 1: paragraph: \ No newline at end of file + item-61 at level 1: section: group textbox + item-62 at level 2: paragraph: Whether the epidemic has eased. + item-63 at level 2: paragraph: + item-64 at level 1: paragraph: + item-65 at level 1: section: group textbox + item-66 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-67 at level 2: paragraph: No + item-68 at level 1: paragraph: + item-69 at level 1: paragraph: + item-70 at level 1: section: group textbox + item-71 at level 2: paragraph: Yes + item-72 at level 1: paragraph: + item-73 at level 1: section: group textbox + item-74 at level 2: paragraph: Yes + item-75 at level 1: paragraph: + item-76 at level 1: paragraph: + item-77 at level 1: section: group textbox + item-78 at level 2: paragraph: Case closed. + item-79 at level 2: paragraph: + item-80 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-81 at level 1: paragraph: + item-82 at level 1: section: group textbox + item-83 at level 2: paragraph: No + item-84 at level 1: paragraph: + item-85 at level 1: paragraph: + item-86 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 9b1771f2..1e91f060 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -29,9 +29,6 @@ { "$ref": "#/groups/0" }, - { - "$ref": "#/groups/19" - }, { "$ref": "#/texts/6" }, @@ -492,20 +489,6 @@ "content_layer": "body", "name": "textbox", "label": "section" - }, - { - "self_ref": "#/groups/19", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/67" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" } ], "texts": [ @@ -1494,20 +1477,6 @@ "prov": [], "orig": "", "text": "" - }, - { - "self_ref": "#/texts/67", - "parent": { - "$ref": "#/groups/19" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "", - "text": "", - "enumerated": false, - "marker": "-" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index bc923c1d..fccb44c6 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -17,21 +17,16 @@ item-0 at level 0: unspecified: group _root_ item-16 at level 2: list_item: Italic bullet 1 item-17 at level 2: list_item: Bold bullet 2 item-18 at level 2: list_item: Underline bullet 3 - item-19 at level 2: inline: group group - item-20 at level 3: list: group group - item-21 at level 4: list_item: Some - item-22 at level 3: list: group group - item-23 at level 4: list_item: italic - item-24 at level 3: list: group group - item-25 at level 4: list_item: bold - item-26 at level 3: list: group group - item-27 at level 4: list_item: underline - item-28 at level 2: list: group list - item-29 at level 3: inline: group group - item-30 at level 4: list: group group - item-31 at level 5: list_item: Nested - item-32 at level 4: list: group group - item-33 at level 5: list_item: italic - item-34 at level 4: list: group group - item-35 at level 5: list_item: bold - item-36 at level 1: paragraph: \ No newline at end of file + item-19 at level 2: list_item: + item-20 at level 3: inline: group group + item-21 at level 4: text: Some + item-22 at level 4: text: italic + item-23 at level 4: text: bold + item-24 at level 4: text: underline + item-25 at level 2: list: group list + item-26 at level 3: list_item: + item-27 at level 4: inline: group group + item-28 at level 5: text: Nested + item-29 at level 5: text: italic + item-30 at level 5: text: bold + item-31 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index a1c3c13f..4173fc62 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -42,7 +42,7 @@ "$ref": "#/groups/1" }, { - "$ref": "#/texts/16" + "$ref": "#/texts/25" } ], "content_layer": "body", @@ -98,7 +98,7 @@ "$ref": "#/texts/15" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/16" }, { "$ref": "#/groups/3" @@ -111,20 +111,20 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/groups/1" + "$ref": "#/texts/16" }, "children": [ { - "$ref": "#/groups/11" + "$ref": "#/texts/17" }, { - "$ref": "#/groups/10" + "$ref": "#/texts/18" }, { - "$ref": "#/groups/9" + "$ref": "#/texts/19" }, { - "$ref": "#/groups/8" + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -138,7 +138,7 @@ }, "children": [ { - "$ref": "#/groups/4" + "$ref": "#/texts/21" } ], "content_layer": "body", @@ -148,120 +148,22 @@ { "self_ref": "#/groups/4", "parent": { - "$ref": "#/groups/3" + "$ref": "#/texts/21" }, "children": [ { - "$ref": "#/groups/7" + "$ref": "#/texts/22" }, { - "$ref": "#/groups/6" + "$ref": "#/texts/23" }, { - "$ref": "#/groups/5" + "$ref": "#/texts/24" } ], "content_layer": "body", "name": "group", "label": "inline" - }, - { - "self_ref": "#/groups/5", - "parent": { - "$ref": "#/groups/4" - }, - "children": [ - { - "$ref": "#/texts/17" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" - }, - { - "self_ref": "#/groups/6", - "parent": { - "$ref": "#/groups/4" - }, - "children": [ - { - "$ref": "#/texts/18" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" - }, - { - "self_ref": "#/groups/7", - "parent": { - "$ref": "#/groups/4" - }, - "children": [ - { - "$ref": "#/texts/19" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" - }, - { - "self_ref": "#/groups/8", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ - { - "$ref": "#/texts/20" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" - }, - { - "self_ref": "#/groups/9", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ - { - "$ref": "#/texts/21" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" - }, - { - "self_ref": "#/groups/10", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ - { - "$ref": "#/texts/22" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" - }, - { - "self_ref": "#/groups/11", - "parent": { - "$ref": "#/groups/2" - }, - "children": [ - { - "$ref": "#/texts/23" - } - ], - "content_layer": "body", - "name": "group", - "label": "list" } ], "texts": [ @@ -574,149 +476,29 @@ { "self_ref": "#/texts/16", "parent": { - "$ref": "#/body" + "$ref": "#/groups/1" }, - "children": [], + "children": [ + { + "$ref": "#/groups/2" + } + ], "content_layer": "body", - "label": "paragraph", + "label": "list_item", "prov": [], "orig": "", - "text": "" + "text": "", + "enumerated": false, + "marker": "-" }, { "self_ref": "#/texts/17", "parent": { - "$ref": "#/groups/5" + "$ref": "#/groups/2" }, "children": [], "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "bold", - "text": "bold", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/18", - "parent": { - "$ref": "#/groups/6" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "italic", - "text": "italic", - "formatting": { - "bold": false, - "italic": true, - "underline": false, - "strikethrough": false, - "script": "baseline" - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "Nested", - "text": "Nested", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/8" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "underline", - "text": "underline", - "formatting": { - "bold": false, - "italic": false, - "underline": true, - "strikethrough": false, - "script": "baseline" - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "bold", - "text": "bold", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/22", - "parent": { - "$ref": "#/groups/10" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "italic", - "text": "italic", - "formatting": { - "bold": false, - "italic": true, - "underline": false, - "strikethrough": false, - "script": "baseline" - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/23", - "parent": { - "$ref": "#/groups/11" - }, - "children": [], - "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], "orig": "Some", "text": "Some", @@ -726,9 +508,151 @@ "underline": false, "strikethrough": false, "script": "baseline" + } + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/2" }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", "enumerated": false, "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Nested", + "text": "Nested", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md index 05ee80fc..918e89e2 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md @@ -13,5 +13,5 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli - *Italic bullet 1* - **Bold bullet 2** - Underline bullet 3 - - Some - *italic* - **bold** - underline - - Nested - *italic* - **bold** \ No newline at end of file +- Some *italic* **bold** underline + - Nested *italic* **bold** \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt new file mode 100644 index 00000000..ebc5cebf --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt @@ -0,0 +1,16 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Transcript + item-2 at level 1: paragraph: February 20, 2025, 8:32PM + item-3 at level 1: picture + item-4 at level 1: inline: group group + item-5 at level 2: paragraph: This is test 1 + item-6 at level 2: paragraph: 0:08 +Correct, he is not. + item-7 at level 1: paragraph: + item-8 at level 1: picture + item-9 at level 1: inline: group group + item-10 at level 2: paragraph: This is test 2 + item-11 at level 2: paragraph: 0:16 +Yeah, exactly. + item-12 at level 1: paragraph: + item-13 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json new file mode 100644 index 00000000..4b75d8d3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json @@ -0,0 +1,292 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.4.0", + "name": "word_image_anchors", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 2428692234257307633, + "filename": "word_image_anchors.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Transcript", + "text": "Transcript", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "February 20, 2025, 8:32PM", + "text": "February 20, 2025, 8:32PM", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is test 1", + "text": "This is test 1", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:08\nCorrect, he is not.", + "text": "0:08\nCorrect, he is not.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is test 2", + "text": "This is test 2", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "0:16\nYeah, exactly.", + "text": "0:16\nYeah, exactly.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC" + }, + "annotations": [] + }, + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 100.0, + "height": 100.0 + }, + "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxodtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII=" + }, + "annotations": [] + } + ], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md new file mode 100644 index 00000000..a3ce2fc7 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md @@ -0,0 +1,13 @@ +**Transcript** + +February 20, 2025, 8:32PM + + + +**This is test 1** 0:08 +Correct, he is not. + + + +**This is test 2** 0:16 +Yeah, exactly. \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f37b4874..9da0ea25 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -9,6 +9,7 @@ from docling.datamodel.document import ( DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( - "export to md" + f"export to markdown failed on {docx_path}" ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( - "export to indented-text" + f"export to indented-text failed on {docx_path}" ) assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( - "document document" + f"DoclingDocument verification failed on {docx_path}" ) if docx_path.name == "word_tables.docx": @@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_text=pred_html, gtfile=str(gt_path) + ".html", generate=GENERATE, - ), "export to html" + ), f"export to html failed on {docx_path}" flaky_path = Path("tests/data/docx/textbox.docx") @@ -131,3 +132,42 @@ def test_e2e_docx_conversions(): @pytest.mark.xfail(strict=False) def test_textbox_conversion(): _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) + + +def test_text_after_image_anchors(): + """ + Test to analyse whether text gets parsed after image anchors. + """ + + in_path = Path("tests/data/docx/word_image_anchors.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_text_after_anchor_1 = found_text_after_anchor_2 = ( + found_text_after_anchor_3 + ) = found_text_after_anchor_4 = False + for item, _ in doc.iterate_items(): + if isinstance(item, TextItem): + if item.text == "This is test 1": + found_text_after_anchor_1 = True + elif item.text == "0:08\nCorrect, he is not.": + found_text_after_anchor_2 = True + elif item.text == "This is test 2": + found_text_after_anchor_3 = True + elif item.text == "0:16\nYeah, exactly.": + found_text_after_anchor_4 = True + + assert ( + found_text_after_anchor_1 + and found_text_after_anchor_2 + and found_text_after_anchor_3 + and found_text_after_anchor_4 + ) diff --git a/uv.lock b/uv.lock index ebc216f7..eb5a3629 100644 --- a/uv.lock +++ b/uv.lock @@ -983,7 +983,7 @@ examples = [ [[package]] name = "docling-core" -version = "2.38.0" +version = "2.38.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jsonref" }, @@ -997,9 +997,9 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/01/3d/02b4926567735c252b4750074f9dfc96d06078566f067eb47c13713952a2/docling_core-2.38.0.tar.gz", hash = "sha256:3bad4c476cc798e29d01b02ea383b5582d7031e9595b177be0a9450f2eb7bef6", size = 145997, upload-time = "2025-06-18T12:35:23.81Z" } +sdist = { url = "https://files.pythonhosted.org/packages/38/f7/33bb17aa13e73722bf18ecfb7f13d6fbfb384c22003209bd72708123b33f/docling_core-2.38.1.tar.gz", hash = "sha256:a0566df2316eec4d22953ca7dac839b926dd57549b4c07ac810e87dbbaf91a10", size = 146276, upload-time = "2025-06-20T12:28:48.422Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/52/e65521ec8ae7ecbce2f9dd95dbf4164b4d4c58c29136e1489a038ce9a2fc/docling_core-2.38.0-py3-none-any.whl", hash = "sha256:8f27d7074a99913f2ba73bde363bbed3416852014eda136bb8880d37805c6950", size = 151276, upload-time = "2025-06-18T12:35:22.25Z" }, + { url = "https://files.pythonhosted.org/packages/f0/c5/fb2e24602db94ec02cc3ac8eb7b9665f2a5f61ff81866beb67aff95a353a/docling_core-2.38.1-py3-none-any.whl", hash = "sha256:6859313561030503e8b53aec535aa5edb765a679af76ce2e2c60722d78c6c613", size = 151570, upload-time = "2025-06-20T12:28:46.764Z" }, ] [package.optional-dependencies] @@ -3387,10 +3387,10 @@ name = "ocrmac" version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" }, - { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'darwin'" }, - { name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, - { name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "pillow" }, + { name = "pyobjc-framework-vision" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" } wheels = [ @@ -4414,7 +4414,7 @@ name = "pyobjc-framework-cocoa" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-core" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" } wheels = [ @@ -4433,8 +4433,8 @@ name = "pyobjc-framework-coreml" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, - { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-core" }, + { name = "pyobjc-framework-cocoa" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" } wheels = [ @@ -4453,8 +4453,8 @@ name = "pyobjc-framework-quartz" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, - { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-core" }, + { name = "pyobjc-framework-cocoa" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" } wheels = [ @@ -4473,10 +4473,10 @@ name = "pyobjc-framework-vision" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, - { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, - { name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, - { name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-core" }, + { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-framework-coreml" }, + { name = "pyobjc-framework-quartz" }, ] sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" } wheels = [ @@ -4957,17 +4957,17 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, - { name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.13') or (python_full_version >= '3.11' and platform_machine != 'arm64') or (python_full_version >= '3.11' and sys_platform != 'darwin')" }, + { name = "numpy", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, - { name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "opencv-python" }, + { name = "pillow" }, + { name = "pyclipper" }, + { name = "pyyaml" }, { name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, - { name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "six" }, + { name = "tqdm" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" },