mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(docx): parse page headers and footers (#2599)
* fix(docx): parse page headers and footers Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): rename _add_header with _add_heading To avoid confusion, rename _add_header function name with _add_heading since the function is about adding section headings. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): extend the page header and footer parsing to any content type Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(docx): fix _add_header_footer function Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
463051b852
commit
054c4a634d
@@ -6,6 +6,7 @@ from pathlib import Path
|
|||||||
from typing import Any, Callable, Final, Optional, Union
|
from typing import Any, Callable, Final, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
|
ContentLayer,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
@@ -95,6 +96,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
# Track list counters per numId and ilvl
|
# Track list counters per numId and ilvl
|
||||||
self.list_counters: dict[tuple[int, int], int] = {}
|
self.list_counters: dict[tuple[int, int], int] = {}
|
||||||
|
# Set starting content layer
|
||||||
|
self.content_layer = ContentLayer.BODY
|
||||||
|
|
||||||
self.history: dict[str, Any] = {
|
self.history: dict[str, Any] = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
@@ -148,6 +151,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
|
doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
|
||||||
|
self._add_header_footer(self.docx_obj, doc)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
else:
|
else:
|
||||||
@@ -258,12 +262,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=GroupLabel.SECTION,
|
label=GroupLabel.SECTION,
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
name="shape-text",
|
name="shape-text",
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
added_elements.append(shape_group.get_ref())
|
added_elements.append(shape_group.get_ref())
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.TEXT,
|
label=DocItemLabel.TEXT,
|
||||||
parent=shape_group,
|
parent=shape_group,
|
||||||
text=text_content,
|
text=text_content,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
if textbox_elements:
|
if textbox_elements:
|
||||||
@@ -750,7 +756,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
level = self._get_level()
|
level = self._get_level()
|
||||||
# Create a textbox group to contain all text from the textbox
|
# Create a textbox group to contain all text from the textbox
|
||||||
textbox_group = doc.add_group(
|
textbox_group = doc.add_group(
|
||||||
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
|
label=GroupLabel.SECTION,
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
name="textbox",
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(textbox_group.get_ref())
|
elem_ref.append(textbox_group.get_ref())
|
||||||
# Set this as the current parent to ensure textbox content
|
# Set this as the current parent to ensure textbox content
|
||||||
@@ -864,7 +873,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
paragraph_elements: list,
|
paragraph_elements: list,
|
||||||
) -> Optional[NodeItem]:
|
) -> Optional[NodeItem]:
|
||||||
return (
|
return (
|
||||||
doc.add_inline_group(parent=prev_parent)
|
doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer)
|
||||||
if len(paragraph_elements) > 1
|
if len(paragraph_elements) > 1
|
||||||
else prev_parent
|
else prev_parent
|
||||||
)
|
)
|
||||||
@@ -932,7 +941,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if p_style_id in ["Title"]:
|
if p_style_id in ["Title"]:
|
||||||
for key in range(len(self.parents)):
|
for key in range(len(self.parents)):
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
|
te = doc.add_text(
|
||||||
|
parent=None,
|
||||||
|
label=DocItemLabel.TITLE,
|
||||||
|
text=text,
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
self.parents[0] = te
|
self.parents[0] = te
|
||||||
elem_ref.append(te.get_ref())
|
elem_ref.append(te.get_ref())
|
||||||
elif "Heading" in p_style_id:
|
elif "Heading" in p_style_id:
|
||||||
@@ -943,7 +957,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
is_numbered_style = False
|
is_numbered_style = False
|
||||||
h1 = self._add_header(doc, p_level, text, is_numbered_style)
|
h1 = self._add_heading(doc, p_level, text, is_numbered_style)
|
||||||
elem_ref.extend(h1)
|
elem_ref.extend(h1)
|
||||||
|
|
||||||
elif len(equations) > 0:
|
elif len(equations) > 0:
|
||||||
@@ -956,12 +970,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
text=text.replace("<eq>", "").replace("</eq>", ""),
|
text=text.replace("<eq>", "").replace("</eq>", ""),
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(t1.get_ref())
|
elem_ref.append(t1.get_ref())
|
||||||
else:
|
else:
|
||||||
# Inline equation
|
# Inline equation
|
||||||
level = self._get_level()
|
level = self._get_level()
|
||||||
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
|
inline_equation = doc.add_inline_group(
|
||||||
|
parent=self.parents[level - 1], content_layer=self.content_layer
|
||||||
|
)
|
||||||
elem_ref.append(inline_equation.get_ref())
|
elem_ref.append(inline_equation.get_ref())
|
||||||
text_tmp = text
|
text_tmp = text
|
||||||
for eq in equations:
|
for eq in equations:
|
||||||
@@ -978,12 +995,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=DocItemLabel.TEXT,
|
label=DocItemLabel.TEXT,
|
||||||
parent=inline_equation,
|
parent=inline_equation,
|
||||||
text=pre_eq_text,
|
text=pre_eq_text,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(e1.get_ref())
|
elem_ref.append(e1.get_ref())
|
||||||
e2 = doc.add_text(
|
e2 = doc.add_text(
|
||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=inline_equation,
|
parent=inline_equation,
|
||||||
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(e2.get_ref())
|
elem_ref.append(e2.get_ref())
|
||||||
|
|
||||||
@@ -992,6 +1011,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=DocItemLabel.TEXT,
|
label=DocItemLabel.TEXT,
|
||||||
parent=inline_equation,
|
parent=inline_equation,
|
||||||
text=text_tmp.strip(),
|
text=text_tmp.strip(),
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(e3.get_ref())
|
elem_ref.append(e3.get_ref())
|
||||||
|
|
||||||
@@ -1018,6 +1038,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text=text,
|
text=text,
|
||||||
formatting=format,
|
formatting=format,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(t2.get_ref())
|
elem_ref.append(t2.get_ref())
|
||||||
|
|
||||||
@@ -1037,13 +1058,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text=text,
|
text=text,
|
||||||
formatting=format,
|
formatting=format,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(t3.get_ref())
|
elem_ref.append(t3.get_ref())
|
||||||
|
|
||||||
self._update_history(p_style_id, p_level, numid, ilevel)
|
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return elem_ref
|
return elem_ref
|
||||||
|
|
||||||
def _add_header(
|
def _add_heading(
|
||||||
self,
|
self,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
curr_level: Optional[int],
|
curr_level: Optional[int],
|
||||||
@@ -1154,6 +1176,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text=text,
|
text=text,
|
||||||
formatting=format,
|
formatting=format,
|
||||||
hyperlink=hyperlink,
|
hyperlink=hyperlink,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
return elem_ref
|
return elem_ref
|
||||||
|
|
||||||
@@ -1180,7 +1203,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Reset counters for the new numbering sequence
|
# Reset counters for the new numbering sequence
|
||||||
self._reset_list_counters_for_new_sequence(numid)
|
self._reset_list_counters_for_new_sequence(numid)
|
||||||
|
|
||||||
list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
|
list_gr = doc.add_list_group(
|
||||||
|
name="list",
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
self.parents[level] = list_gr
|
self.parents[level] = list_gr
|
||||||
elem_ref.append(list_gr.get_ref())
|
elem_ref.append(list_gr.get_ref())
|
||||||
|
|
||||||
@@ -1203,7 +1230,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level_at_new_list + prev_indent + 1,
|
self.level_at_new_list + prev_indent + 1,
|
||||||
self.level_at_new_list + ilevel + 1,
|
self.level_at_new_list + ilevel + 1,
|
||||||
):
|
):
|
||||||
list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
|
list_gr1 = doc.add_list_group(
|
||||||
|
name="list",
|
||||||
|
parent=self.parents[i - 1],
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
self.parents[i] = list_gr1
|
self.parents[i] = list_gr1
|
||||||
elem_ref.append(list_gr1.get_ref())
|
elem_ref.append(list_gr1.get_ref())
|
||||||
|
|
||||||
@@ -1262,11 +1293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
provs_in_cell: list[RefItem],
|
provs_in_cell: list[RefItem],
|
||||||
docling_table: TableItem,
|
docling_table: TableItem,
|
||||||
|
content_layer: ContentLayer = ContentLayer.BODY,
|
||||||
) -> RefItem:
|
) -> RefItem:
|
||||||
group_element = doc.add_group(
|
group_element = doc.add_group(
|
||||||
label=GroupLabel.UNSPECIFIED,
|
label=GroupLabel.UNSPECIFIED,
|
||||||
name=group_name,
|
name=group_name,
|
||||||
parent=docling_table,
|
parent=docling_table,
|
||||||
|
content_layer=content_layer,
|
||||||
)
|
)
|
||||||
for prov in provs_in_cell:
|
for prov in provs_in_cell:
|
||||||
group_element.children.append(prov)
|
group_element.children.append(prov)
|
||||||
@@ -1298,7 +1331,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||||
level = self._get_level()
|
level = self._get_level()
|
||||||
docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
|
docling_table = doc.add_table(
|
||||||
|
data=data, parent=self.parents[level - 1], content_layer=self.content_layer
|
||||||
|
)
|
||||||
elem_ref.append(docling_table.get_ref())
|
elem_ref.append(docling_table.get_ref())
|
||||||
|
|
||||||
cell_set: set[CT_Tc] = set()
|
cell_set: set[CT_Tc] = set()
|
||||||
@@ -1349,7 +1384,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
rich_table_cell = True
|
rich_table_cell = True
|
||||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
|
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
|
||||||
ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
|
ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
|
||||||
group_name, doc, provs_in_cell, docling_table
|
group_name,
|
||||||
|
doc,
|
||||||
|
provs_in_cell,
|
||||||
|
docling_table,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
if rich_table_cell:
|
if rich_table_cell:
|
||||||
@@ -1383,6 +1422,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
col_idx += cell.grid_span
|
col_idx += cell.grid_span
|
||||||
return elem_ref
|
return elem_ref
|
||||||
|
|
||||||
|
def _has_blip(self, element: BaseOxmlElement) -> bool:
|
||||||
|
"""Check if a docx element holds any BLIP as a child.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
element: a docx element
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Whether the element contains a BLIP as a direct child.
|
||||||
|
"""
|
||||||
|
|
||||||
|
for item in element:
|
||||||
|
if self.blip_xpath_expr(item):
|
||||||
|
return True
|
||||||
|
if item.findall(
|
||||||
|
".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
def _is_rich_table_cell(self, cell: _Cell) -> bool:
|
def _is_rich_table_cell(self, cell: _Cell) -> bool:
|
||||||
"""Determine whether a docx cell should be parsed as a Docling RichTableCell.
|
"""Determine whether a docx cell should be parsed as a Docling RichTableCell.
|
||||||
|
|
||||||
@@ -1420,13 +1479,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
tag = child.tag.split("}")[-1]
|
tag = child.tag.split("}")[-1]
|
||||||
if tag not in allowed_tags:
|
if tag not in allowed_tags:
|
||||||
return True
|
return True
|
||||||
for elem in tc:
|
if self._has_blip(tc):
|
||||||
if self.blip_xpath_expr(elem):
|
return True
|
||||||
return True
|
|
||||||
if elem.findall(
|
|
||||||
".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
|
||||||
):
|
|
||||||
return True
|
|
||||||
|
|
||||||
# paragraph must contain runs with no run-properties
|
# paragraph must contain runs with no run-properties
|
||||||
for para in paragraphs:
|
for para in paragraphs:
|
||||||
@@ -1468,6 +1522,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
p1 = doc.add_picture(
|
p1 = doc.add_picture(
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
caption=None,
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(p1.get_ref())
|
elem_ref.append(p1.get_ref())
|
||||||
else:
|
else:
|
||||||
@@ -1478,6 +1533,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
caption=None,
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(p2.get_ref())
|
elem_ref.append(p2.get_ref())
|
||||||
except (UnidentifiedImageError, OSError):
|
except (UnidentifiedImageError, OSError):
|
||||||
@@ -1485,6 +1541,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
p3 = doc.add_picture(
|
p3 = doc.add_picture(
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
caption=None,
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
elem_ref.append(p3.get_ref())
|
elem_ref.append(p3.get_ref())
|
||||||
return elem_ref
|
return elem_ref
|
||||||
@@ -1515,12 +1572,68 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
caption=None,
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
except (UnidentifiedImageError, OSError):
|
except (UnidentifiedImageError, OSError):
|
||||||
_log.warning("Warning: DrawingML image cannot be loaded by Pillow")
|
_log.warning("Warning: DrawingML image cannot be loaded by Pillow")
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
caption=None,
|
caption=None,
|
||||||
|
content_layer=self.content_layer,
|
||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None:
    """Add the page headers and footers of the document sections.

    Headers and footers are added to the furniture content layer. A header or
    footer is parsed when it holds non-empty paragraph text, a table or a
    picture; its element is then walked with ``_walk_linear`` and the parsed
    items are attached to a new "page header" or "page footer" group item.

    The first section is always inspected. Later sections are inspected only
    when they declare a different first-page header/footer. For every inspected
    section, the first-page variant is parsed when that flag is set, otherwise
    the regular header/footer is parsed.

    The content layer and the top-level parent that were active on entry are
    restored on exit, even if parsing raises.

    Args:
        docx_obj: A docx Document object to be parsed.
        doc: A DoclingDocument object to add the header and footer from docx_obj.
    """
    saved_layer = self.content_layer
    saved_parent = self.parents[0]
    self.content_layer = ContentLayer.FURNITURE
    try:
        for sec_idx, section in enumerate(docx_obj.sections):
            use_first_page = section.different_first_page_header_footer
            # NOTE(review): sections after the first are skipped unless they set
            # the "different first page" flag, so a later section that merely
            # overrides its regular header/footer is not parsed — confirm intended.
            if sec_idx > 0 and not use_first_page:
                continue

            for kind, group_name in (
                ("header", "page header"),
                ("footer", "page footer"),
            ):
                part = getattr(
                    section, f"first_page_{kind}" if use_first_page else kind
                )
                has_text = any(par.text.strip() for par in part.paragraphs)
                if not (has_text or part.tables or self._has_blip(part._element)):
                    # Nothing worth emitting: avoid creating an empty group.
                    continue

                # Make the new group the top-level parent so that everything
                # parsed from this header/footer is nested under it.
                self.parents[0] = doc.add_group(
                    label=GroupLabel.SECTION,
                    name=group_name,
                    content_layer=self.content_layer,
                )
                self._walk_linear(part._element, doc)
    finally:
        # Always hand the backend back in the state it was received in.
        self.content_layer = saved_layer
        self.parents[0] = saved_parent
|
||||||
|
|||||||
BIN
tests/data/docx/unit_test_formatting.docx
vendored
BIN
tests/data/docx/unit_test_formatting.docx
vendored
Binary file not shown.
@@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-28 at level 5: text: Nested
|
item-28 at level 5: text: Nested
|
||||||
item-29 at level 5: text: italic
|
item-29 at level 5: text: italic
|
||||||
item-30 at level 5: text: bold
|
item-30 at level 5: text: bold
|
||||||
item-31 at level 1: text:
|
item-31 at level 1: text:
|
||||||
|
item-32 at level 1: text: The second page of the document with same header and footer
|
||||||
|
item-33 at level 1: text:
|
||||||
|
item-34 at level 1: text: The third page of the document with different header and footer
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
{
|
{
|
||||||
"schema_name": "DoclingDocument",
|
"schema_name": "DoclingDocument",
|
||||||
"version": "1.7.0",
|
"version": "1.8.0",
|
||||||
"name": "unit_test_formatting",
|
"name": "unit_test_formatting",
|
||||||
"origin": {
|
"origin": {
|
||||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
"binary_hash": 16380079676357958448,
|
"binary_hash": 4350524979083842953,
|
||||||
"filename": "unit_test_formatting.docx"
|
"filename": "unit_test_formatting.docx"
|
||||||
},
|
},
|
||||||
"furniture": {
|
"furniture": {
|
||||||
@@ -43,6 +43,27 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/25"
|
"$ref": "#/texts/25"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/26"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/27"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/28"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/9"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@@ -164,6 +185,94 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"name": "group",
|
"name": "group",
|
||||||
"label": "inline"
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/29"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "page header",
|
||||||
|
"label": "section"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/30"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "page footer",
|
||||||
|
"label": "section"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/7",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/34"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "page header",
|
||||||
|
"label": "section"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/8",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/7"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/31"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/32"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/33"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/9",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/35"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/36"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/pictures/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/37"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "page footer",
|
||||||
|
"label": "section"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"texts": [
|
"texts": [
|
||||||
@@ -653,9 +762,245 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "",
|
"orig": "",
|
||||||
"text": ""
|
"text": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/26",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "The second page of the document with same header and footer",
|
||||||
|
"text": "The second page of the document with same header and footer",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/27",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/28",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "The third page of the document with different header and footer",
|
||||||
|
"text": "The third page of the document with different header and footer",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/29",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/5"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "This is a header",
|
||||||
|
"text": "This is a header",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/30",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/6"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "This is a footer",
|
||||||
|
"text": "This is a footer",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/31",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/8"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Another",
|
||||||
|
"text": "Another",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/32",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/8"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "header",
|
||||||
|
"text": "header",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/33",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/8"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "in bold",
|
||||||
|
"text": "in bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/34",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/7"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "With 2 paragraphs",
|
||||||
|
"text": "With 2 paragraphs",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/35",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/9"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Another footer",
|
||||||
|
"text": "Another footer",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/36",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/9"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "With",
|
||||||
|
"text": "With",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/37",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/9"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "3 paragraphs and a picture",
|
||||||
|
"text": "3 paragraphs and a picture",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false,
|
||||||
|
"script": "baseline"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pictures": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/pictures/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/9"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"label": "picture",
|
||||||
|
"prov": [],
|
||||||
|
"captions": [],
|
||||||
|
"references": [],
|
||||||
|
"footnotes": [],
|
||||||
|
"annotations": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"pictures": [],
|
|
||||||
"tables": [],
|
"tables": [],
|
||||||
"key_value_items": [],
|
"key_value_items": [],
|
||||||
"form_items": [],
|
"form_items": [],
|
||||||
|
|||||||
@@ -14,4 +14,8 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli
|
|||||||
- **Bold bullet 2**
|
- **Bold bullet 2**
|
||||||
- Underline bullet 3
|
- Underline bullet 3
|
||||||
- Some *italic* **bold** underline
|
- Some *italic* **bold** underline
|
||||||
- Nested *italic* **bold**
|
- Nested *italic* **bold**
|
||||||
|
|
||||||
|
The second page of the document with same header and footer
|
||||||
|
|
||||||
|
The third page of the document with different header and footer
|
||||||
@@ -3,6 +3,7 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from docling_core.types.doc import GroupItem
|
||||||
|
|
||||||
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
@@ -208,3 +209,31 @@ def test_is_rich_table_cell(docx_paths):
|
|||||||
f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
|
f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
|
||||||
f"with text: {cell.text}"
|
f"with text: {cell.text}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_header_footer(documents):
    """Check the header and footer groups produced by _add_header_footer.

    Uses the converted ``unit_test_formatting.docx`` document and verifies the
    number of "page header" / "page footer" groups and how many children each
    of them holds.
    """
    target = "unit_test_formatting.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target)

    # Bucket the group items by name in a single pass over the document groups.
    found: dict[str, list[GroupItem]] = {"page header": [], "page footer": []}
    for candidate in doc.groups:
        if isinstance(candidate, GroupItem) and candidate.name in found:
            found[candidate.name].append(candidate)
    headers = found["page header"]
    footers = found["page footer"]

    assert len(headers) == 2, "Expected 2 different headers"
    assert len(footers) == 2, "Expected 2 different footers"

    first_header, second_header = headers
    first_footer, second_footer = footers

    assert len(first_header.children) == 1, "First page header should have 1 paragraph"
    assert len(second_header.children) == 2, "Second page header should have 2 paragraphs"

    assert len(first_footer.children) == 1, "First page footer should have 1 paragraph"
    assert len(second_footer.children) == 4, (
        "Second page footer should have 3 paragraphs and 1 picture"
    )
|
||||||
|
|||||||
Reference in New Issue
Block a user