fix(docx): parse page headers and footers (#2599)

* fix(docx): parse page headers and footers

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): rename _add_header to _add_heading

To avoid confusion, rename the _add_header function to _add_heading,
since the function is about adding section headings.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): extend the page header and footer parsing to any content type

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): fix _add_header_footer function

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-11-10 16:10:12 +01:00
committed by GitHub
parent 463051b852
commit 054c4a634d
6 changed files with 516 additions and 22 deletions

Binary file not shown.

View File

@@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_
item-28 at level 5: text: Nested
item-29 at level 5: text: italic
item-30 at level 5: text: bold
item-31 at level 1: text:
item-31 at level 1: text:
item-32 at level 1: text: The second page of the document with same header and footer
item-33 at level 1: text:
item-34 at level 1: text: The third page of the document with different header and footer

View File

@@ -1,10 +1,10 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"version": "1.8.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 16380079676357958448,
"binary_hash": 4350524979083842953,
"filename": "unit_test_formatting.docx"
},
"furniture": {
@@ -43,6 +43,27 @@
},
{
"$ref": "#/texts/25"
},
{
"$ref": "#/texts/26"
},
{
"$ref": "#/texts/27"
},
{
"$ref": "#/texts/28"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/groups/6"
},
{
"$ref": "#/groups/7"
},
{
"$ref": "#/groups/9"
}
],
"content_layer": "body",
@@ -164,6 +185,94 @@
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/29"
}
],
"content_layer": "furniture",
"name": "page header",
"label": "section"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/30"
}
],
"content_layer": "furniture",
"name": "page footer",
"label": "section"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/8"
},
{
"$ref": "#/texts/34"
}
],
"content_layer": "furniture",
"name": "page header",
"label": "section"
},
{
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/groups/7"
},
"children": [
{
"$ref": "#/texts/31"
},
{
"$ref": "#/texts/32"
},
{
"$ref": "#/texts/33"
}
],
"content_layer": "furniture",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/9",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/35"
},
{
"$ref": "#/texts/36"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/texts/37"
}
],
"content_layer": "furniture",
"name": "page footer",
"label": "section"
}
],
"texts": [
@@ -653,9 +762,245 @@
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The second page of the document with same header and footer",
"text": "The second page of the document with same header and footer",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/27",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The third page of the document with different header and footer",
"text": "The third page of the document with different header and footer",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "This is a header",
"text": "This is a header",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "This is a footer",
"text": "This is a footer",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Another",
"text": "Another",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "header",
"text": "header",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "in bold",
"text": "in bold",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/34",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "With 2 paragraphs",
"text": "With 2 paragraphs",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/35",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "Another footer",
"text": "Another footer",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/36",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "With",
"text": "With",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/37",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "furniture",
"label": "text",
"prov": [],
"orig": "3 paragraphs and a picture",
"text": "3 paragraphs and a picture",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "furniture",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"annotations": []
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],

View File

@@ -14,4 +14,8 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
- **Bold bullet 2**
- Underline bullet 3
- Some *italic* **bold** underline
- Nested *italic* **bold**
- Nested *italic* **bold**
The second page of the document with same header and footer
The third page of the document with different header and footer