diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 56c025fb..48d32809 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -6,6 +6,7 @@ from pathlib import Path
from typing import Any, Callable, Final, Optional, Union
from docling_core.types.doc import (
+ ContentLayer,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
@@ -95,6 +96,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.listIter = 0
# Track list counters per numId and ilvl
self.list_counters: dict[tuple[int, int], int] = {}
+ # Set starting content layer
+ self.content_layer = ContentLayer.BODY
self.history: dict[str, Any] = {
"names": [None],
@@ -148,6 +151,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if self.is_valid():
assert self.docx_obj is not None
doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
+ self._add_header_footer(self.docx_obj, doc)
return doc
else:
@@ -258,12 +262,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.SECTION,
parent=self.parents[level - 1],
name="shape-text",
+ content_layer=self.content_layer,
)
added_elements.append(shape_group.get_ref())
doc.add_text(
label=DocItemLabel.TEXT,
parent=shape_group,
text=text_content,
+ content_layer=self.content_layer,
)
if textbox_elements:
@@ -750,7 +756,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level = self._get_level()
# Create a textbox group to contain all text from the textbox
textbox_group = doc.add_group(
- label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
+ label=GroupLabel.SECTION,
+ parent=self.parents[level - 1],
+ name="textbox",
+ content_layer=self.content_layer,
)
elem_ref.append(textbox_group.get_ref())
# Set this as the current parent to ensure textbox content
@@ -864,7 +873,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements: list,
) -> Optional[NodeItem]:
return (
- doc.add_inline_group(parent=prev_parent)
+ doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer)
if len(paragraph_elements) > 1
else prev_parent
)
@@ -932,7 +941,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if p_style_id in ["Title"]:
for key in range(len(self.parents)):
self.parents[key] = None
- te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
+ te = doc.add_text(
+ parent=None,
+ label=DocItemLabel.TITLE,
+ text=text,
+ content_layer=self.content_layer,
+ )
self.parents[0] = te
elem_ref.append(te.get_ref())
elif "Heading" in p_style_id:
@@ -943,7 +957,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
else:
is_numbered_style = False
- h1 = self._add_header(doc, p_level, text, is_numbered_style)
+ h1 = self._add_heading(doc, p_level, text, is_numbered_style)
elem_ref.extend(h1)
elif len(equations) > 0:
@@ -956,12 +970,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
text=text.replace("", "").replace("", ""),
+ content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
else:
# Inline equation
level = self._get_level()
- inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
+ inline_equation = doc.add_inline_group(
+ parent=self.parents[level - 1], content_layer=self.content_layer
+ )
elem_ref.append(inline_equation.get_ref())
text_tmp = text
for eq in equations:
@@ -978,12 +995,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.TEXT,
parent=inline_equation,
text=pre_eq_text,
+ content_layer=self.content_layer,
)
elem_ref.append(e1.get_ref())
e2 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=inline_equation,
text=eq.replace("", "").replace("", ""),
+ content_layer=self.content_layer,
)
elem_ref.append(e2.get_ref())
@@ -992,6 +1011,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.TEXT,
parent=inline_equation,
text=text_tmp.strip(),
+ content_layer=self.content_layer,
)
elem_ref.append(e3.get_ref())
@@ -1018,6 +1038,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text=text,
formatting=format,
hyperlink=hyperlink,
+ content_layer=self.content_layer,
)
elem_ref.append(t2.get_ref())
@@ -1037,13 +1058,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text=text,
formatting=format,
hyperlink=hyperlink,
+ content_layer=self.content_layer,
)
elem_ref.append(t3.get_ref())
self._update_history(p_style_id, p_level, numid, ilevel)
return elem_ref
- def _add_header(
+ def _add_heading(
self,
doc: DoclingDocument,
curr_level: Optional[int],
@@ -1154,6 +1176,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text=text,
formatting=format,
hyperlink=hyperlink,
+ content_layer=self.content_layer,
)
return elem_ref
@@ -1180,7 +1203,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Reset counters for the new numbering sequence
self._reset_list_counters_for_new_sequence(numid)
- list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
+ list_gr = doc.add_list_group(
+ name="list",
+ parent=self.parents[level - 1],
+ content_layer=self.content_layer,
+ )
self.parents[level] = list_gr
elem_ref.append(list_gr.get_ref())
@@ -1203,7 +1230,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1,
):
- list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
+ list_gr1 = doc.add_list_group(
+ name="list",
+ parent=self.parents[i - 1],
+ content_layer=self.content_layer,
+ )
self.parents[i] = list_gr1
elem_ref.append(list_gr1.get_ref())
@@ -1262,11 +1293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
provs_in_cell: list[RefItem],
docling_table: TableItem,
+ content_layer: ContentLayer = ContentLayer.BODY,
) -> RefItem:
group_element = doc.add_group(
label=GroupLabel.UNSPECIFIED,
name=group_name,
parent=docling_table,
+ content_layer=content_layer,
)
for prov in provs_in_cell:
group_element.children.append(prov)
@@ -1298,7 +1331,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
data = TableData(num_rows=num_rows, num_cols=num_cols)
level = self._get_level()
- docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
+ docling_table = doc.add_table(
+ data=data, parent=self.parents[level - 1], content_layer=self.content_layer
+ )
elem_ref.append(docling_table.get_ref())
cell_set: set[CT_Tc] = set()
@@ -1349,7 +1384,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
- group_name, doc, provs_in_cell, docling_table
+ group_name,
+ doc,
+ provs_in_cell,
+ docling_table,
+ content_layer=self.content_layer,
)
if rich_table_cell:
@@ -1383,6 +1422,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
col_idx += cell.grid_span
return elem_ref
+    def _has_blip(self, element: BaseOxmlElement) -> bool:
+        """Report whether any child of a docx element carries a picture or drawing.
+
+        Every direct child of ``element`` is probed in turn, first with
+        ``self.blip_xpath_expr`` and then with a ``.//w:drawing`` search. That
+        search is recursive, so a drawing nested anywhere below a child counts.
+
+        Args:
+            element: The docx element whose children are inspected.
+
+        Returns:
+            True if at least one child matches either probe, False otherwise.
+        """
+        drawing_path = ".//w:drawing"
+        namespaces = MsWordDocumentBackend._BLIP_NAMESPACES
+        return any(
+            self.blip_xpath_expr(child)
+            or child.findall(drawing_path, namespaces=namespaces)
+            for child in element
+        )
+
def _is_rich_table_cell(self, cell: _Cell) -> bool:
"""Determine whether a docx cell should be parsed as a Docling RichTableCell.
@@ -1420,13 +1479,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
tag = child.tag.split("}")[-1]
if tag not in allowed_tags:
return True
- for elem in tc:
- if self.blip_xpath_expr(elem):
- return True
- if elem.findall(
- ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
- ):
- return True
+ if self._has_blip(tc):
+ return True
# paragraph must contain runs with no run-properties
for para in paragraphs:
@@ -1468,6 +1522,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
p1 = doc.add_picture(
parent=self.parents[level - 1],
caption=None,
+ content_layer=self.content_layer,
)
elem_ref.append(p1.get_ref())
else:
@@ -1478,6 +1533,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[level - 1],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
+ content_layer=self.content_layer,
)
elem_ref.append(p2.get_ref())
except (UnidentifiedImageError, OSError):
@@ -1485,6 +1541,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
p3 = doc.add_picture(
parent=self.parents[level - 1],
caption=None,
+ content_layer=self.content_layer,
)
elem_ref.append(p3.get_ref())
return elem_ref
@@ -1515,12 +1572,68 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[level - 1],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
+ content_layer=self.content_layer,
)
except (UnidentifiedImageError, OSError):
_log.warning("Warning: DrawingML image cannot be loaded by Pillow")
doc.add_picture(
parent=self.parents[level - 1],
caption=None,
+ content_layer=self.content_layer,
)
return
+
+    def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None:
+        """Add section headers and footers to the furniture content layer.
+
+        Every non-empty header or footer gets its own "page header" or
+        "page footer" group, and its element is walked with `_walk_linear`, the
+        same routine used for the document body. A header or footer counts as
+        non-empty when it has non-blank paragraph text, a table, or a picture.
+
+        The first section is always inspected; later sections are inspected only
+        when they declare a different first-page header/footer. When a section
+        has that flag set, its first-page header/footer is used instead of the
+        default one.
+
+        The current content layer and root parent are restored before returning,
+        including when parsing a header or footer raises.
+
+        Args:
+            docx_obj: A docx Document object to be parsed.
+            doc: A DoclingDocument object to add the header and footer from docx_obj.
+        """
+        saved_layer = self.content_layer
+        saved_parent = self.parents[0]
+        self.content_layer = ContentLayer.FURNITURE
+        # Restore the parsing state even if walking a header or footer fails, so
+        # the backend is never left tagging new items as furniture.
+        try:
+            for sec_idx, section in enumerate(docx_obj.sections):
+                use_first_page = section.different_first_page_header_footer
+                # NOTE(review): a later section that defines its own default
+                # header/footer but no distinct first page is skipped here;
+                # confirm this is intended.
+                if sec_idx > 0 and not use_first_page:
+                    continue
+
+                header = (
+                    section.first_page_header if use_first_page else section.header
+                )
+                self._add_furniture_part(header, "page header", doc)
+
+                footer = (
+                    section.first_page_footer if use_first_page else section.footer
+                )
+                self._add_furniture_part(footer, "page footer", doc)
+        finally:
+            self.content_layer = saved_layer
+            self.parents[0] = saved_parent
+
+    def _add_furniture_part(self, part: Any, name: str, doc: DoclingDocument) -> None:
+        """Add one header or footer as a named group, unless it is empty.
+
+        Args:
+            part: A docx header or footer object exposing ``paragraphs``,
+                ``tables`` and ``_element``.
+            name: Name given to the created group item.
+            doc: The DoclingDocument receiving the group and its content.
+        """
+        has_text = any(paragraph.text.strip() for paragraph in part.paragraphs)
+        if not (has_text or part.tables or self._has_blip(part._element)):
+            return
+
+        # The new group becomes the root parent so that everything found while
+        # walking the element is attached below it.
+        self.parents[0] = doc.add_group(
+            label=GroupLabel.SECTION,
+            name=name,
+            content_layer=self.content_layer,
+        )
+        self._walk_linear(part._element, doc)
diff --git a/tests/data/docx/unit_test_formatting.docx b/tests/data/docx/unit_test_formatting.docx
index 5d08668e..d82edbb2 100644
Binary files a/tests/data/docx/unit_test_formatting.docx and b/tests/data/docx/unit_test_formatting.docx differ
diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
index f968494e..f54f15c9 100644
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
@@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_
item-28 at level 5: text: Nested
item-29 at level 5: text: italic
item-30 at level 5: text: bold
- item-31 at level 1: text:
\ No newline at end of file
+ item-31 at level 1: text:
+ item-32 at level 1: text: The second page of the document with same header and footer
+ item-33 at level 1: text:
+ item-34 at level 1: text: The third page of the document with different header and footer
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
index c0a63738..9e1ba463 100644
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
@@ -1,10 +1,10 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- "binary_hash": 16380079676357958448,
+ "binary_hash": 4350524979083842953,
"filename": "unit_test_formatting.docx"
},
"furniture": {
@@ -43,6 +43,27 @@
},
{
"$ref": "#/texts/25"
+ },
+ {
+ "$ref": "#/texts/26"
+ },
+ {
+ "$ref": "#/texts/27"
+ },
+ {
+ "$ref": "#/texts/28"
+ },
+ {
+ "$ref": "#/groups/5"
+ },
+ {
+ "$ref": "#/groups/6"
+ },
+ {
+ "$ref": "#/groups/7"
+ },
+ {
+ "$ref": "#/groups/9"
}
],
"content_layer": "body",
@@ -164,6 +185,94 @@
"content_layer": "body",
"name": "group",
"label": "inline"
+ },
+ {
+ "self_ref": "#/groups/5",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/29"
+ }
+ ],
+ "content_layer": "furniture",
+ "name": "page header",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/6",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/30"
+ }
+ ],
+ "content_layer": "furniture",
+ "name": "page footer",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/7",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/groups/8"
+ },
+ {
+ "$ref": "#/texts/34"
+ }
+ ],
+ "content_layer": "furniture",
+ "name": "page header",
+ "label": "section"
+ },
+ {
+ "self_ref": "#/groups/8",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/31"
+ },
+ {
+ "$ref": "#/texts/32"
+ },
+ {
+ "$ref": "#/texts/33"
+ }
+ ],
+ "content_layer": "furniture",
+ "name": "group",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/9",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/35"
+ },
+ {
+ "$ref": "#/texts/36"
+ },
+ {
+ "$ref": "#/pictures/0"
+ },
+ {
+ "$ref": "#/texts/37"
+ }
+ ],
+ "content_layer": "furniture",
+ "name": "page footer",
+ "label": "section"
}
],
"texts": [
@@ -653,9 +762,245 @@
"prov": [],
"orig": "",
"text": ""
+ },
+ {
+ "self_ref": "#/texts/26",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "The second page of the document with same header and footer",
+ "text": "The second page of the document with same header and footer",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/27",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "",
+ "text": ""
+ },
+ {
+ "self_ref": "#/texts/28",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "The third page of the document with different header and footer",
+ "text": "The third page of the document with different header and footer",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/29",
+ "parent": {
+ "$ref": "#/groups/5"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "This is a header",
+ "text": "This is a header",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/30",
+ "parent": {
+ "$ref": "#/groups/6"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "This is a footer",
+ "text": "This is a footer",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/31",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "Another",
+ "text": "Another",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/32",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "header",
+ "text": "header",
+ "formatting": {
+ "bold": true,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/33",
+ "parent": {
+ "$ref": "#/groups/8"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "in bold",
+ "text": "in bold",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/34",
+ "parent": {
+ "$ref": "#/groups/7"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "With 2 paragraphs",
+ "text": "With 2 paragraphs",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/35",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "Another footer",
+ "text": "Another footer",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/36",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "With",
+ "text": "With",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/37",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "text",
+ "prov": [],
+ "orig": "3 paragraphs and a picture",
+ "text": "3 paragraphs and a picture",
+ "formatting": {
+ "bold": false,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ }
+ ],
+ "pictures": [
+ {
+ "self_ref": "#/pictures/0",
+ "parent": {
+ "$ref": "#/groups/9"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "picture",
+ "prov": [],
+ "captions": [],
+ "references": [],
+ "footnotes": [],
+ "annotations": []
}
],
- "pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md
index 918e89e2..59b2429d 100644
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md
@@ -14,4 +14,8 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
- **Bold bullet 2**
- Underline bullet 3
- Some *italic* **bold** underline
- - Nested *italic* **bold**
\ No newline at end of file
+ - Nested *italic* **bold**
+
+The second page of the document with same header and footer
+
+The third page of the document with different header and footer
\ No newline at end of file
diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
index 385884a5..5af52842 100644
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -3,6 +3,7 @@ import os
from pathlib import Path
import pytest
+from docling_core.types.doc import GroupItem
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
from docling.backend.msword_backend import MsWordDocumentBackend
@@ -208,3 +209,31 @@ def test_is_rich_table_cell(docx_paths):
f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
f"with text: {cell.text}"
)
+
+
+def test_add_header_footer(documents):
+    """Test the function _add_header_footer.
+
+    Checks the number of "page header" and "page footer" groups produced for
+    unit_test_formatting.docx and the number of children attached to each.
+    """
+
+    target = "unit_test_formatting.docx"
+    doc = next(entry[1] for entry in documents if entry[0].name == target)
+
+    groups = [grp for grp in doc.groups if isinstance(grp, GroupItem)]
+    headers: list[GroupItem] = [grp for grp in groups if grp.name == "page header"]
+    footers: list[GroupItem] = [grp for grp in groups if grp.name == "page footer"]
+
+    assert len(headers) == 2, "Expected 2 different headers"
+    assert len(footers) == 2, "Expected 2 different footers"
+
+    first_header, second_header = headers
+    first_footer, second_footer = footers
+
+    assert len(first_header.children) == 1, "First page header should have 1 paragraph"
+    assert len(second_header.children) == 2, (
+        "Second page header should have 2 paragraphs"
+    )
+
+    assert len(first_footer.children) == 1, "First page footer should have 1 paragraph"
+    assert len(second_footer.children) == 4, (
+        "Second page footer should have 3 paragraphs and 1 picture"
+    )