fix(docx): parse page headers and footers (#2599)

* fix(docx): parse page headers and footers

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): rename _add_header to _add_heading

To avoid confusion, rename the _add_header function to _add_heading,
since the function is about adding section headings.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): extend the page header and footer parsing to any content type

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): fix _add_header_footer function

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-11-10 16:10:12 +01:00
committed by GitHub
parent 463051b852
commit 054c4a634d
6 changed files with 516 additions and 22 deletions

View File

@@ -3,6 +3,7 @@ import os
from pathlib import Path
import pytest
from docling_core.types.doc import GroupItem
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
from docling.backend.msword_backend import MsWordDocumentBackend
@@ -208,3 +209,31 @@ def test_is_rich_table_cell(docx_paths):
f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
f"with text: {cell.text}"
)
def test_add_header_footer(documents):
    """Test the function _add_header_footer.

    Picks the entry of ``documents`` whose first element is named
    ``unit_test_formatting.docx``, gathers the groups of its second element
    that are named "page header" or "page footer", and checks both how many
    of each were found and how many children each one holds.
    """
    target = "unit_test_formatting.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target)

    # Only GroupItem instances take part; anything else in doc.groups is ignored.
    group_items = [grp for grp in doc.groups if isinstance(grp, GroupItem)]
    headers: list[GroupItem] = [g for g in group_items if g.name == "page header"]
    footers: list[GroupItem] = [g for g in group_items if g.name == "page footer"]

    assert len(headers) == 2, "Expected 2 different headers"
    assert len(footers) == 2, "Expected 2 different footers"

    first_header, second_header = headers[0], headers[1]
    first_footer, second_footer = footers[0], footers[1]

    assert len(first_header.children) == 1, "First page header should have 1 paragraph"
    assert len(second_header.children) == 2, "Second page header should have 2 paragraphs"
    assert len(first_footer.children) == 1, "First page footer should have 1 paragraph"
    assert len(second_footer.children) == 4, (
        "Second page footer should have 3 paragraphs and 1 picture"
    )