diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 56c025fb..48d32809 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Any, Callable, Final, Optional, Union from docling_core.types.doc import ( + ContentLayer, DocItemLabel, DoclingDocument, DocumentOrigin, @@ -95,6 +96,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.listIter = 0 # Track list counters per numId and ilvl self.list_counters: dict[tuple[int, int], int] = {} + # Set starting content layer + self.content_layer = ContentLayer.BODY self.history: dict[str, Any] = { "names": [None], @@ -148,6 +151,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if self.is_valid(): assert self.docx_obj is not None doc, _ = self._walk_linear(self.docx_obj.element.body, doc) + self._add_header_footer(self.docx_obj, doc) return doc else: @@ -258,12 +262,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=GroupLabel.SECTION, parent=self.parents[level - 1], name="shape-text", + content_layer=self.content_layer, ) added_elements.append(shape_group.get_ref()) doc.add_text( label=DocItemLabel.TEXT, parent=shape_group, text=text_content, + content_layer=self.content_layer, ) if textbox_elements: @@ -750,7 +756,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self._get_level() # Create a textbox group to contain all text from the textbox textbox_group = doc.add_group( - label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox" + label=GroupLabel.SECTION, + parent=self.parents[level - 1], + name="textbox", + content_layer=self.content_layer, ) elem_ref.append(textbox_group.get_ref()) # Set this as the current parent to ensure textbox content @@ -864,7 +873,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph_elements: list, ) -> Optional[NodeItem]: return ( - doc.add_inline_group(parent=prev_parent) 
+ doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer) if len(paragraph_elements) > 1 else prev_parent ) @@ -932,7 +941,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if p_style_id in ["Title"]: for key in range(len(self.parents)): self.parents[key] = None - te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text) + te = doc.add_text( + parent=None, + label=DocItemLabel.TITLE, + text=text, + content_layer=self.content_layer, + ) self.parents[0] = te elem_ref.append(te.get_ref()) elif "Heading" in p_style_id: @@ -943,7 +957,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) else: is_numbered_style = False - h1 = self._add_header(doc, p_level, text, is_numbered_style) + h1 = self._add_heading(doc, p_level, text, is_numbered_style) elem_ref.extend(h1) elif len(equations) > 0: @@ -956,12 +970,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text.replace("", "").replace("", ""), + content_layer=self.content_layer, ) elem_ref.append(t1.get_ref()) else: # Inline equation level = self._get_level() - inline_equation = doc.add_inline_group(parent=self.parents[level - 1]) + inline_equation = doc.add_inline_group( + parent=self.parents[level - 1], content_layer=self.content_layer + ) elem_ref.append(inline_equation.get_ref()) text_tmp = text for eq in equations: @@ -978,12 +995,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.TEXT, parent=inline_equation, text=pre_eq_text, + content_layer=self.content_layer, ) elem_ref.append(e1.get_ref()) e2 = doc.add_text( label=DocItemLabel.FORMULA, parent=inline_equation, text=eq.replace("", "").replace("", ""), + content_layer=self.content_layer, ) elem_ref.append(e2.get_ref()) @@ -992,6 +1011,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.TEXT, parent=inline_equation, text=text_tmp.strip(), + content_layer=self.content_layer, ) 
elem_ref.append(e3.get_ref()) @@ -1018,6 +1038,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, formatting=format, hyperlink=hyperlink, + content_layer=self.content_layer, ) elem_ref.append(t2.get_ref()) @@ -1037,13 +1058,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, formatting=format, hyperlink=hyperlink, + content_layer=self.content_layer, ) elem_ref.append(t3.get_ref()) self._update_history(p_style_id, p_level, numid, ilevel) return elem_ref - def _add_header( + def _add_heading( self, doc: DoclingDocument, curr_level: Optional[int], @@ -1154,6 +1176,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, formatting=format, hyperlink=hyperlink, + content_layer=self.content_layer, ) return elem_ref @@ -1180,7 +1203,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Reset counters for the new numbering sequence self._reset_list_counters_for_new_sequence(numid) - list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1]) + list_gr = doc.add_list_group( + name="list", + parent=self.parents[level - 1], + content_layer=self.content_layer, + ) self.parents[level] = list_gr elem_ref.append(list_gr.get_ref()) @@ -1203,7 +1230,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): - list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1]) + list_gr1 = doc.add_list_group( + name="list", + parent=self.parents[i - 1], + content_layer=self.content_layer, + ) self.parents[i] = list_gr1 elem_ref.append(list_gr1.get_ref()) @@ -1262,11 +1293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, provs_in_cell: list[RefItem], docling_table: TableItem, + content_layer: ContentLayer = ContentLayer.BODY, ) -> RefItem: group_element = doc.add_group( label=GroupLabel.UNSPECIFIED, name=group_name, parent=docling_table, + content_layer=content_layer, ) for 
prov in provs_in_cell: group_element.children.append(prov) @@ -1298,7 +1331,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): data = TableData(num_rows=num_rows, num_cols=num_cols) level = self._get_level() - docling_table = doc.add_table(data=data, parent=self.parents[level - 1]) + docling_table = doc.add_table( + data=data, parent=self.parents[level - 1], content_layer=self.content_layer + ) elem_ref.append(docling_table.get_ref()) cell_set: set[CT_Tc] = set() @@ -1349,7 +1384,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): rich_table_cell = True group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}" ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements( - group_name, doc, provs_in_cell, docling_table + group_name, + doc, + provs_in_cell, + docling_table, + content_layer=self.content_layer, ) if rich_table_cell: @@ -1383,6 +1422,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): col_idx += cell.grid_span return elem_ref + def _has_blip(self, element: BaseOxmlElement) -> bool: + """Check if a docx element holds any BLIP or drawing below its children. + + Args: + element: a docx element + + Returns: + Whether any child of the element matches the BLIP XPath or contains a w:drawing descendant. + """ + + for item in element: + if self.blip_xpath_expr(item): + return True + if item.findall( + ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ): + return True + + return False + def _is_rich_table_cell(self, cell: _Cell) -> bool: """Determine whether a docx cell should be parsed as a Docling RichTableCell. 
@@ -1420,13 +1479,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): tag = child.tag.split("}")[-1] if tag not in allowed_tags: return True - for elem in tc: - if self.blip_xpath_expr(elem): - return True - if elem.findall( - ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES - ): - return True + if self._has_blip(tc): + return True # paragraph must contain runs with no run-properties for para in paragraphs: @@ -1468,6 +1522,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): p1 = doc.add_picture( parent=self.parents[level - 1], caption=None, + content_layer=self.content_layer, ) elem_ref.append(p1.get_ref()) else: @@ -1478,6 +1533,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[level - 1], image=ImageRef.from_pil(image=pil_image, dpi=72), caption=None, + content_layer=self.content_layer, ) elem_ref.append(p2.get_ref()) except (UnidentifiedImageError, OSError): @@ -1485,6 +1541,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): p3 = doc.add_picture( parent=self.parents[level - 1], caption=None, + content_layer=self.content_layer, ) elem_ref.append(p3.get_ref()) return elem_ref @@ -1515,12 +1572,68 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[level - 1], image=ImageRef.from_pil(image=pil_image, dpi=72), caption=None, + content_layer=self.content_layer, ) except (UnidentifiedImageError, OSError): _log.warning("Warning: DrawingML image cannot be loaded by Pillow") doc.add_picture( parent=self.parents[level - 1], caption=None, + content_layer=self.content_layer, ) return + + def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None: + """Add section headers and footers. + + Headers and footers are added to the furniture content layer. A header or footer with text, + tables or pictures is walked and its content is attached to a single group item for the header or the + footer. 
If the document has a section with new header and footer, they will be parsed + in new group items. + + Args: + docx_obj: A docx Document object to be parsed. + doc: A DoclingDocument object to add the header and footer from docx_obj. + """ + current_layer = self.content_layer + base_parent = self.parents[0] + self.content_layer = ContentLayer.FURNITURE + for sec_idx, section in enumerate(docx_obj.sections): + if sec_idx > 0 and not section.different_first_page_header_footer: + continue + + hdr = ( + section.first_page_header + if section.different_first_page_header_footer + else section.header + ) + par = [txt for txt in (par.text.strip() for par in hdr.paragraphs) if txt] + tables = hdr.tables + has_blip = self._has_blip(hdr._element) + if par or tables or has_blip: + self.parents[0] = doc.add_group( + label=GroupLabel.SECTION, + name="page header", + content_layer=self.content_layer, + ) + self._walk_linear(hdr._element, doc) + + ftr = ( + section.first_page_footer + if section.different_first_page_header_footer + else section.footer + ) + par = [txt for txt in (par.text.strip() for par in ftr.paragraphs) if txt] + tables = ftr.tables + has_blip = self._has_blip(ftr._element) + if par or tables or has_blip: + self.parents[0] = doc.add_group( + label=GroupLabel.SECTION, + name="page footer", + content_layer=self.content_layer, + ) + self._walk_linear(ftr._element, doc) + + self.content_layer = current_layer + self.parents[0] = base_parent diff --git a/tests/data/docx/unit_test_formatting.docx b/tests/data/docx/unit_test_formatting.docx index 5d08668e..d82edbb2 100644 Binary files a/tests/data/docx/unit_test_formatting.docx and b/tests/data/docx/unit_test_formatting.docx differ diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index f968494e..f54f15c9 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ 
b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_ item-28 at level 5: text: Nested item-29 at level 5: text: italic item-30 at level 5: text: bold - item-31 at level 1: text: \ No newline at end of file + item-31 at level 1: text: + item-32 at level 1: text: The second page of the document with same header and footer + item-33 at level 1: text: + item-34 at level 1: text: The third page of the document with different header and footer \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index c0a63738..9e1ba463 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -1,10 +1,10 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "unit_test_formatting", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "binary_hash": 16380079676357958448, + "binary_hash": 4350524979083842953, "filename": "unit_test_formatting.docx" }, "furniture": { @@ -43,6 +43,27 @@ }, { "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/groups/6" + }, + { + "$ref": "#/groups/7" + }, + { + "$ref": "#/groups/9" } ], "content_layer": "body", @@ -164,6 +185,94 @@ "content_layer": "body", "name": "group", "label": "inline" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/29" + } + ], + "content_layer": "furniture", + "name": "page header", + "label": "section" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/30" + } + ], + "content_layer": "furniture", + "name": "page footer", 
+ "label": "section" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/texts/34" + } + ], + "content_layer": "furniture", + "name": "page header", + "label": "section" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/groups/7" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + } + ], + "content_layer": "furniture", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/texts/37" + } + ], + "content_layer": "furniture", + "name": "page footer", + "label": "section" } ], "texts": [ @@ -653,9 +762,245 @@ "prov": [], "orig": "", "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The second page of the document with same header and footer", + "text": "The second page of the document with same header and footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The third page of the document with different header and footer", + "text": "The third page of the document with different header and footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/29", + "parent": { + 
"$ref": "#/groups/5" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "This is a header", + "text": "This is a header", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "This is a footer", + "text": "This is a footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "Another", + "text": "Another", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "header", + "text": "header", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "in bold", + "text": "in bold", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "With 2 paragraphs", + "text": "With 2 paragraphs", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + 
}, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "Another footer", + "text": "Another footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "With", + "text": "With", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "3 paragraphs and a picture", + "text": "3 paragraphs and a picture", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] } ], - "pictures": [], "tables": [], "key_value_items": [], "form_items": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md index 918e89e2..59b2429d 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md @@ -14,4 +14,8 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli - **Bold bullet 2** - Underline bullet 3 - Some *italic* **bold** underline - - Nested *italic* **bold** \ No newline at end of file + - Nested *italic* **bold** + +The second page of the document with 
same header and footer + +The third page of the document with different header and footer \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 385884a5..5af52842 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -3,6 +3,7 @@ import os from pathlib import Path import pytest +from docling_core.types.doc import GroupItem from docling.backend.docx.drawingml.utils import get_libreoffice_cmd from docling.backend.msword_backend import MsWordDocumentBackend @@ -208,3 +209,31 @@ def test_is_rich_table_cell(docx_paths): f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} " f"with text: {cell.text}" ) + + +def test_add_header_footer(documents): + """Test the function _add_header_footer.""" + + name = "unit_test_formatting.docx" + doc = next(item[1] for item in documents if item[0].name == name) + + headers: list[GroupItem] = [] + footers: list[GroupItem] = [] + for group in doc.groups: + if not isinstance(group, GroupItem): + continue + if group.name == "page header": + headers.append(group) + elif group.name == "page footer": + footers.append(group) + + assert len(headers) == 2, "Expected 2 different headers" + assert len(footers) == 2, "Expected 2 different footers" + + assert len(headers[0].children) == 1, "First page header should have 1 paragraph" + assert len(headers[1].children) == 2, "Second page header should have 2 paragraphs" + + assert len(footers[0].children) == 1, "First page footer should have 1 paragraph" + assert len(footers[1].children) == 4, ( + "Second page footer should have 3 paragraphs and 1 picture" + )