diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 2455389a..d50287f5 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -94,7 +94,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB ) doc = DoclingDocument( description=DescriptionItem(), name="name_without_extension", origin=origin - ) # TODO must add origin information + ) # must add origin information doc = self.walk_linear(self.pptx_obj, doc) return doc @@ -183,7 +183,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB e_is_a_list_item = False if e_is_a_list_item: - # TODO: Set marker and enumerated arguments if this is an enumeration element. + # Set marker and enumerated arguments if this is an enumeration element. enum_marker = str(enum_list_item_value) + "." doc.add_list_item( marker=enum_marker, diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 7bf0946b..cc0e2613 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -44,6 +44,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.parents[i] = None self.level = 0 + self.listIter = 0 self.history = { "names": [None], @@ -124,6 +125,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for element in body: tag_name = etree.QName(element).localname + # Check for Inline Images (drawings or blip elements) found_drawing = etree.ElementBase.xpath( element, ".//w:drawing", namespaces=self.xml_namespaces ) @@ -137,8 +139,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.handle_tables(element, docx_obj, doc) except Exception: _log.error("could not parse a table, broken docx table") - # Check for Inline Images (drawings or blip elements) - # elif element.xpath(".//w:drawing", namespaces = self.xml_namespaces) or element.xpath(".//w:pict", namespaces = self.xml_namespaces): + elif 
found_drawing or found_pict: self.handle_pictures(element, docx_obj, doc) # Check for Text @@ -166,7 +167,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Get the numId element and extract the value numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap) ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap) - numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None @@ -215,15 +215,31 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text = paragraph.text.strip() # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists! + # Common styles for bullet and numbered lists. + # "List Bullet", "List Number", "List Paragraph" + # TODO: reliably identify whether list is a numbered list or not + # is_numbered = "List Bullet" not in paragraph.style.name + is_numbered = False + p_style_name, p_level = self.get_label_and_level(paragraph) numid, ilevel = self.get_numId_and_ilvl(paragraph) + # print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text)) - # print("paragraph.text: {} | numid: {} | ilevel: {}".format(paragraph.text, numid, ilevel)) + if numid == 0: + numid = None # Handle lists if numid is not None and ilevel is not None: self.add_listitem( - element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text + element, + docx_obj, + doc, + p_style_name, + p_level, + numid, + ilevel, + text, + is_numbered, ) self.update_history(p_style_name, p_level, numid, ilevel) return @@ -233,14 +249,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.parents[key] = None self.level = self.level_at_new_list - 1 self.level_at_new_list = None - if p_style_name in ["Title"]: for key, val in self.parents.items(): self.parents[key] = None self.parents[0] = doc.add_text( parent=None, label=DocItemLabel.TITLE, text=text ) - elif "Heading" in p_style_name: self.add_header(element, docx_obj, doc, p_style_name, 
p_level, text) @@ -312,8 +326,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return def add_listitem( - self, element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text: str + self, + element, + docx_obj, + doc, + p_style_name, + p_level, + numid, + ilevel, + text: str, + is_numbered=False, ): + # is_numbered = is_numbered + enum_marker = "" + level = self.get_level() if self.prev_numid() is None: # Open new list self.level_at_new_list = level # type: ignore @@ -323,7 +349,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) # TODO: Set marker and enumerated arguments if this is an enumeration element. - doc.add_list_item(parent=self.parents[level], text=text) + self.listIter += 1 + if is_numbered: + enum_marker = str(self.listIter) + "." + is_numbered = True + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=self.parents[level], + text=text, + ) elif ( self.prev_numid() == numid and self.prev_indent() < ilevel @@ -334,12 +369,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ): # TODO: determine if this is an unordered list or an ordered list. # Set GroupLabel.ORDERED_LIST when it fits. - self.parents[i] = doc.add_group( - label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] - ) + self.listIter = 0 + if is_numbered: + self.parents[i] = doc.add_group( + label=GroupLabel.ORDERED_LIST, + name="list", + parent=self.parents[i - 1], + ) + else: + self.parents[i] = doc.add_group( + label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] + ) # TODO: Set marker and enumerated arguments if this is an enumeration element. + self.listIter += 1 + if is_numbered: + enum_marker = str(self.listIter) + "." 
+ is_numbered = True doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, parent=self.parents[self.level_at_new_list + ilevel], text=text, ) @@ -350,14 +399,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.parents[k] = None # TODO: Set marker and enumerated arguments if this is an enumeration element. + self.listIter += 1 + if is_numbered: + enum_marker = str(self.listIter) + "." + is_numbered = True doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, parent=self.parents[self.level_at_new_list + ilevel], text=text, ) + self.listIter = 0 elif self.prev_numid() == numid or self.prev_indent() == ilevel: # TODO: Set marker and enumerated arguments if this is an enumeration element. - doc.add_list_item(parent=self.parents[level - 1], text=text) + self.listIter += 1 + if is_numbered: + enum_marker = str(self.listIter) + "." + is_numbered = True + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=self.parents[level - 1], + text=text, + ) return def handle_tables(self, element, docx_obj, doc): diff --git a/poetry.lock b/poetry.lock index cefef5e6..4a868911 100644 --- a/poetry.lock +++ b/poetry.lock @@ -885,7 +885,7 @@ files = [] develop = false [package.dependencies] -docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "6fee533a101ca08f62e88826218c33e0aab2f417"} +docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"} docutils = "!=0.21" matplotlib = "^3.7.1" networkx = "^3.1" @@ -909,8 +909,8 @@ toolkit = ["deepsearch-toolkit (>=0.31.0)"] [package.source] type = "git" url = "https://github.com/DS4SD/deepsearch-glm.git" -reference = "c13a6cdda25206911d63a5a28e990217ad823068" -resolved_reference = "c13a6cdda25206911d63a5a28e990217ad823068" +reference = "c185c4f985ccd29a470a1cddd3bec43880b739ee" +resolved_reference = "c185c4f985ccd29a470a1cddd3bec43880b739ee" [[package]] name = "dill" @@ -952,14 +952,15 @@ 
json-schema-for-humans = "^1.0.0" jsonref = "^1.1.0" jsonschema = "^4.16.0" pandas = "^2.1.4" +pillow = "^10.3.0" pydantic = "^2.6.0" tabulate = "^0.9.0" [package.source] type = "git" url = "https://github.com/DS4SD/docling-core.git" -reference = "6fee533a101ca08f62e88826218c33e0aab2f417" -resolved_reference = "6fee533a101ca08f62e88826218c33e0aab2f417" +reference = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9" +resolved_reference = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9" [[package]] name = "docling-ibm-models" @@ -3441,9 +3442,9 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] [[package]] @@ -3577,8 +3578,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -7114,4 +7115,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "46f6c1eb76034223f7d65760f6ebe0989ba9e8aff46fcdbce82c147030fcb8be" +content-hash = "14143d6cc79f4c2c8a4d021711198697e91ca01ecf290dd270b483984461c3d1" diff --git a/pyproject.toml b/pyproject.toml index 12e485d8..70a20826 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,9 @@ torchvision = [ ###################### python = "^3.10" pydantic = "^2.0.0" -docling-core = {git = 
"https://github.com/DS4SD/docling-core.git", rev = "6fee533a101ca08f62e88826218c33e0aab2f417"} +docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"} docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"} -deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "c13a6cdda25206911d63a5a28e990217ad823068"} +deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "c185c4f985ccd29a470a1cddd3bec43880b739ee"} docling-parse = "^1.5.1" filetype = "^1.2.0" diff --git a/tests/data/word_sample.docx b/tests/data/word_sample.docx index b1889405..69b09f5a 100644 Binary files a/tests/data/word_sample.docx and b/tests/data/word_sample.docx differ