mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix(docx): parse page headers and footers (#2599)
* fix(docx): parse page headers and footers Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): rename _add_header to _add_heading To avoid confusion, rename the _add_header function to _add_heading, since the function is about adding section headings. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): extend the page header and footer parsing to any content type Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): fix _add_header_footer function Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
463051b852
commit
054c4a634d
BIN
tests/data/docx/unit_test_formatting.docx
vendored
BIN
tests/data/docx/unit_test_formatting.docx
vendored
Binary file not shown.
@@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-28 at level 5: text: Nested
|
||||
item-29 at level 5: text: italic
|
||||
item-30 at level 5: text: bold
|
||||
item-31 at level 1: text:
|
||||
item-31 at level 1: text:
|
||||
item-32 at level 1: text: The second page of the document with same header and footer
|
||||
item-33 at level 1: text:
|
||||
item-34 at level 1: text: The third page of the document with different header and footer
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"version": "1.8.0",
|
||||
"name": "unit_test_formatting",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"binary_hash": 16380079676357958448,
|
||||
"binary_hash": 4350524979083842953,
|
||||
"filename": "unit_test_formatting.docx"
|
||||
},
|
||||
"furniture": {
|
||||
@@ -43,6 +43,27 @@
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/25"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/26"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/27"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/28"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/9"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -164,6 +185,94 @@
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/5",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/29"
|
||||
}
|
||||
],
|
||||
"content_layer": "furniture",
|
||||
"name": "page header",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/6",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/30"
|
||||
}
|
||||
],
|
||||
"content_layer": "furniture",
|
||||
"name": "page footer",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/7",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/8"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/34"
|
||||
}
|
||||
],
|
||||
"content_layer": "furniture",
|
||||
"name": "page header",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/31"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/32"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/33"
|
||||
}
|
||||
],
|
||||
"content_layer": "furniture",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/9",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/35"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/36"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/37"
|
||||
}
|
||||
],
|
||||
"content_layer": "furniture",
|
||||
"name": "page footer",
|
||||
"label": "section"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
@@ -653,9 +762,245 @@
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/26",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "The second page of the document with same header and footer",
|
||||
"text": "The second page of the document with same header and footer",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/27",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/28",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "The third page of the document with different header and footer",
|
||||
"text": "The third page of the document with different header and footer",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/29",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is a header",
|
||||
"text": "This is a header",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/30",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is a footer",
|
||||
"text": "This is a footer",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/31",
|
||||
"parent": {
|
||||
"$ref": "#/groups/8"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Another",
|
||||
"text": "Another",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/32",
|
||||
"parent": {
|
||||
"$ref": "#/groups/8"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "header",
|
||||
"text": "header",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/33",
|
||||
"parent": {
|
||||
"$ref": "#/groups/8"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "in bold",
|
||||
"text": "in bold",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/34",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "With 2 paragraphs",
|
||||
"text": "With 2 paragraphs",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/35",
|
||||
"parent": {
|
||||
"$ref": "#/groups/9"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Another footer",
|
||||
"text": "Another footer",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/36",
|
||||
"parent": {
|
||||
"$ref": "#/groups/9"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "With",
|
||||
"text": "With",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/37",
|
||||
"parent": {
|
||||
"$ref": "#/groups/9"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "3 paragraphs and a picture",
|
||||
"text": "3 paragraphs and a picture",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
}
|
||||
],
|
||||
"pictures": [
|
||||
{
|
||||
"self_ref": "#/pictures/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/9"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
|
||||
@@ -14,4 +14,8 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
|
||||
- **Bold bullet 2**
|
||||
- Underline bullet 3
|
||||
- Some *italic* **bold** underline
|
||||
- Nested *italic* **bold**
|
||||
- Nested *italic* **bold**
|
||||
|
||||
The second page of the document with same header and footer
|
||||
|
||||
The third page of the document with different header and footer
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from docling_core.types.doc import GroupItem
|
||||
|
||||
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
@@ -208,3 +209,31 @@ def test_is_rich_table_cell(docx_paths):
|
||||
f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
|
||||
f"with text: {cell.text}"
|
||||
)
|
||||
|
||||
|
||||
def test_add_header_footer(documents):
    """Check the page header and footer groups of the formatting test document.

    Looks up the converted ``unit_test_formatting.docx`` entry in ``documents``,
    collects its groups named "page header" and "page footer", and asserts how
    many of each exist and how many children each one holds.
    """
    target = "unit_test_formatting.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target)

    # Bucket the groups by name in a single pass; other names are ignored.
    by_name: dict[str, list[GroupItem]] = {"page header": [], "page footer": []}
    for candidate in doc.groups:
        if isinstance(candidate, GroupItem) and candidate.name in by_name:
            by_name[candidate.name].append(candidate)

    headers = by_name["page header"]
    footers = by_name["page footer"]

    assert len(headers) == 2, "Expected 2 different headers"
    assert len(footers) == 2, "Expected 2 different footers"

    first_header, second_header = headers
    assert len(first_header.children) == 1, "First page header should have 1 paragraph"
    assert len(second_header.children) == 2, (
        "Second page header should have 2 paragraphs"
    )

    first_footer, second_footer = footers
    assert len(first_footer.children) == 1, "First page footer should have 1 paragraph"
    assert len(second_footer.children) == 4, (
        "Second page footer should have 3 paragraphs and 1 picture"
    )
|
||||
|
||||
Reference in New Issue
Block a user