From 054c4a634d91c5480df473b289b9403498065334 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 10 Nov 2025 16:10:12 +0100 Subject: [PATCH] fix(docx): parse page headers and footers (#2599) * fix(docx): parse page headers and footers Signed-off-by: Cesar Berrospi Ramis * chore(docx): rename _add_header with _add_heading To avoid confusion, rename _add_header function name with _add_heading since the function is about adding section headings. Signed-off-by: Cesar Berrospi Ramis * chore(docx): extend the page header and footer parsing to any content type Signed-off-by: Cesar Berrospi Ramis * chore(docx): fix _add_header_footer function Signed-off-by: Cesar Berrospi Ramis --------- Signed-off-by: Cesar Berrospi Ramis --- docling/backend/msword_backend.py | 147 +++++++- tests/data/docx/unit_test_formatting.docx | Bin 19731 -> 29099 bytes .../docling_v2/unit_test_formatting.docx.itxt | 5 +- .../docling_v2/unit_test_formatting.docx.json | 351 +++++++++++++++++- .../docling_v2/unit_test_formatting.docx.md | 6 +- tests/test_backend_msword.py | 29 ++ 6 files changed, 516 insertions(+), 22 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 56c025fb..48d32809 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Any, Callable, Final, Optional, Union from docling_core.types.doc import ( + ContentLayer, DocItemLabel, DoclingDocument, DocumentOrigin, @@ -95,6 +96,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.listIter = 0 # Track list counters per numId and ilvl self.list_counters: dict[tuple[int, int], int] = {} + # Set starting content layer + self.content_layer = ContentLayer.BODY self.history: dict[str, Any] = { "names": [None], @@ -148,6 +151,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if self.is_valid(): assert self.docx_obj is not None doc, _ = self._walk_linear(self.docx_obj.element.body, doc) + self._add_header_footer(self.docx_obj, doc) return doc else: @@ -258,12 +262,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=GroupLabel.SECTION, parent=self.parents[level - 1], name="shape-text", + content_layer=self.content_layer, ) added_elements.append(shape_group.get_ref()) doc.add_text( label=DocItemLabel.TEXT, parent=shape_group, text=text_content, + content_layer=self.content_layer, ) if textbox_elements: @@ -750,7 +756,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self._get_level() # Create a textbox group to contain all text from the textbox textbox_group = doc.add_group( - label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox" + label=GroupLabel.SECTION, + parent=self.parents[level - 1], + name="textbox", + content_layer=self.content_layer, ) elem_ref.append(textbox_group.get_ref()) # Set this as the current parent to ensure textbox content @@ -864,7 +873,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph_elements: list, ) -> Optional[NodeItem]: return ( - doc.add_inline_group(parent=prev_parent) + doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer) if len(paragraph_elements) > 1 else prev_parent ) @@ -932,7 +941,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if p_style_id in ["Title"]: for key in range(len(self.parents)): self.parents[key] = None - te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text) + te = doc.add_text( + parent=None, + label=DocItemLabel.TITLE, + text=text, + content_layer=self.content_layer, + ) self.parents[0] = te elem_ref.append(te.get_ref()) elif "Heading" in p_style_id: @@ -943,7 +957,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) else: is_numbered_style = False - h1 = self._add_header(doc, p_level, text, is_numbered_style) + h1 = self._add_heading(doc, p_level, text, is_numbered_style) elem_ref.extend(h1) elif len(equations) > 0: @@ -956,12 +970,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text.replace("", "").replace("", ""), + content_layer=self.content_layer, ) elem_ref.append(t1.get_ref()) else: # Inline equation level = self._get_level() - inline_equation = doc.add_inline_group(parent=self.parents[level - 1]) + inline_equation = doc.add_inline_group( + parent=self.parents[level - 1], content_layer=self.content_layer + ) elem_ref.append(inline_equation.get_ref()) text_tmp = text for eq in equations: @@ -978,12 +995,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.TEXT, parent=inline_equation, text=pre_eq_text, + content_layer=self.content_layer, ) elem_ref.append(e1.get_ref()) e2 = doc.add_text( label=DocItemLabel.FORMULA, parent=inline_equation, text=eq.replace("", "").replace("", ""), + content_layer=self.content_layer, ) elem_ref.append(e2.get_ref()) @@ -992,6 +1011,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.TEXT, parent=inline_equation, text=text_tmp.strip(), + content_layer=self.content_layer, ) elem_ref.append(e3.get_ref()) @@ -1018,6 +1038,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, formatting=format, hyperlink=hyperlink, + content_layer=self.content_layer, ) elem_ref.append(t2.get_ref()) @@ -1037,13 +1058,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, formatting=format, hyperlink=hyperlink, + content_layer=self.content_layer, ) elem_ref.append(t3.get_ref()) self._update_history(p_style_id, p_level, numid, ilevel) return elem_ref - def _add_header( + def _add_heading( self, doc: DoclingDocument, curr_level: Optional[int], @@ -1154,6 +1176,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, formatting=format, hyperlink=hyperlink, + content_layer=self.content_layer, ) return elem_ref @@ -1180,7 +1203,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Reset counters for the new numbering sequence self._reset_list_counters_for_new_sequence(numid) - list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1]) + list_gr = doc.add_list_group( + name="list", + parent=self.parents[level - 1], + content_layer=self.content_layer, + ) self.parents[level] = list_gr elem_ref.append(list_gr.get_ref()) @@ -1203,7 +1230,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): - list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1]) + list_gr1 = doc.add_list_group( + name="list", + parent=self.parents[i - 1], + content_layer=self.content_layer, + ) self.parents[i] = list_gr1 elem_ref.append(list_gr1.get_ref()) @@ -1262,11 +1293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, provs_in_cell: list[RefItem], docling_table: TableItem, + content_layer: ContentLayer = ContentLayer.BODY, ) -> RefItem: group_element = doc.add_group( label=GroupLabel.UNSPECIFIED, name=group_name, parent=docling_table, + content_layer=content_layer, ) for prov in provs_in_cell: group_element.children.append(prov) @@ -1298,7 +1331,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): data = TableData(num_rows=num_rows, num_cols=num_cols) level = self._get_level() - docling_table = doc.add_table(data=data, parent=self.parents[level - 1]) + docling_table = doc.add_table( + data=data, parent=self.parents[level - 1], content_layer=self.content_layer + ) elem_ref.append(docling_table.get_ref()) cell_set: set[CT_Tc] = set() @@ -1349,7 +1384,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): rich_table_cell = True group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}" ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements( - group_name, doc, provs_in_cell, docling_table + group_name, + doc, + provs_in_cell, + docling_table, + content_layer=self.content_layer, ) if rich_table_cell: @@ -1383,6 +1422,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): col_idx += cell.grid_span return elem_ref + def _has_blip(self, element: BaseOxmlElement) -> bool: + """Check if a docx element holds any BLIP as a child. + + Args: + element: a docx element + + Returns: + Whether the element contains a BLIP as a direct child. + """ + + for item in element: + if self.blip_xpath_expr(item): + return True + if item.findall( + ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ): + return True + + return False + def _is_rich_table_cell(self, cell: _Cell) -> bool: """Determine whether a docx cell should be parsed as a Docling RichTableCell. @@ -1420,13 +1479,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): tag = child.tag.split("}")[-1] if tag not in allowed_tags: return True - for elem in tc: - if self.blip_xpath_expr(elem): - return True - if elem.findall( - ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES - ): - return True + if self._has_blip(tc): + return True # paragraph must contain runs with no run-properties for para in paragraphs: @@ -1468,6 +1522,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): p1 = doc.add_picture( parent=self.parents[level - 1], caption=None, + content_layer=self.content_layer, ) elem_ref.append(p1.get_ref()) else: @@ -1478,6 +1533,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[level - 1], image=ImageRef.from_pil(image=pil_image, dpi=72), caption=None, + content_layer=self.content_layer, ) elem_ref.append(p2.get_ref()) except (UnidentifiedImageError, OSError): @@ -1485,6 +1541,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): p3 = doc.add_picture( parent=self.parents[level - 1], caption=None, + content_layer=self.content_layer, ) elem_ref.append(p3.get_ref()) return elem_ref @@ -1515,12 +1572,68 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[level - 1], image=ImageRef.from_pil(image=pil_image, dpi=72), caption=None, + content_layer=self.content_layer, ) except (UnidentifiedImageError, OSError): _log.warning("Warning: DrawingML image cannot be loaded by Pillow") doc.add_picture( parent=self.parents[level - 1], caption=None, + content_layer=self.content_layer, ) return + + def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None: + """Add section headers and footers. + + Headers and footers are added in the furniture content and only the text paragraphs + are parsed. The paragraphs are attached to a single group item for the header or the + footer. If the document has a section with new header and footer, they will be parsed + in new group items. + + Args: + docx_obj: A docx Document object to be parsed. + doc: A DoclingDocument object to add the header and footer from docx_obj. + """ + current_layer = self.content_layer + base_parent = self.parents[0] + self.content_layer = ContentLayer.FURNITURE + for sec_idx, section in enumerate(docx_obj.sections): + if sec_idx > 0 and not section.different_first_page_header_footer: + continue + + hdr = ( + section.first_page_header + if section.different_first_page_header_footer + else section.header + ) + par = [txt for txt in (par.text.strip() for par in hdr.paragraphs) if txt] + tables = hdr.tables + has_blip = self._has_blip(hdr._element) + if par or tables or has_blip: + self.parents[0] = doc.add_group( + label=GroupLabel.SECTION, + name="page header", + content_layer=self.content_layer, + ) + self._walk_linear(hdr._element, doc) + + ftr = ( + section.first_page_footer + if section.different_first_page_header_footer + else section.footer + ) + par = [txt for txt in (par.text.strip() for par in ftr.paragraphs) if txt] + tables = ftr.tables + has_blip = self._has_blip(ftr._element) + if par or tables or has_blip: + self.parents[0] = doc.add_group( + label=GroupLabel.SECTION, + name="page footer", + content_layer=self.content_layer, + ) + self._walk_linear(ftr._element, doc) + + self.content_layer = current_layer + self.parents[0] = base_parent diff --git a/tests/data/docx/unit_test_formatting.docx b/tests/data/docx/unit_test_formatting.docx index 5d08668e4faf4fe93982a4f0b62ddd177597f5ff..d82edbb281e516f1cadb3c41700fda67e6258d50 100644 GIT binary patch delta 17634 zcmZU(b8u$QAN?8Iwr$(ClZkEH$rIbj#7-u*&53PW6FZrBGoSDNwzhV=>bbY-seAwE z?pL3C`n)?i6S(pK7)wbO6buar0tgBS2#6S{!9NSK3K$4z2f7}M7zFS;RZn)12`%gz z{7ZDKx+(*Pq+uFXkU?DuAheua(|;U%D7V%7YaPmtSG)G(~kuR2>E+XJsg}(8BHBb-0aNkT^T&>Y%jDn?2EXtdJL>~ zfSq=Lvs~CNBY@Vbn{d=OEoR*MjRGub1?3 zA@mep=@ic-e@UW26yDjOl2?3gkMf3csA0SUlYqnlRzcT8{L@1Yh`nD=Zk(_x0MvtG zBr36_8TSkTV@)(kmbA)Mty9b$qtl;cHpL%9n1ZptT$^G1=q^=Iq?90>Us4knN4ylC zU6dNc+cgJY9DFT-g?T&Dfh&@FC5rnT@FB3^3#C&{2|)Xyh15i$;tEpclsbMF^MI&O z0xOpT;u4yyq$pDsN{RbT$V_mOoMTKCM0uP`|E~5gz)`w7|4hO{e^9)m8H#vcHT6K_ z5*2TQE2>*nmD6%CR7!tOCqmX+BKa-Dgwvm}&(@cS!bWx4-9|iCr4LxCu_&CWczi6< z3ncBaJ9SSD*@>cJ%xn$`@3d~$p-fF;kKBy_+@({z@Jy~>ASsF$C7(l>`a}lU;j!;r zvh-nFpQvQ%WE6lNBKNlF;i)GXNPw|~vtx^O21$nO<;IdRKknKeNH0E&f@hDF0tZL} zb;NT=iw7GmV|1VJcY5^Eq}{PNG5!J;6L%v=V!I>?1@Tsa4ML`<@8K%nnJ zxU835<6Y*9?W+Lo$qTtV5Ao73iZQ=i&ExN(pcbBpkP#dFk&jH3f??QOPTDB>616k# zbS(HdL1WC=FJVZn?3*vY%L4y&kue7ia7!f)BCvuxUEWl~S4W>v))s-~uwHGkx6T!o zv}oFc`|QCq)0$&n0ll0>H41Kk%(rh2X3#H?SF6LeL*SRBZJo}A@<}*DmN{~uWXwaB z(LX(-FZ7Q}?o6{tM38+PQE7cTY*|q9Xxt~JQ$l;SEi0p_n2Li=VwZKo*C9XwfXQ0| zt#CksE#Vl*&o-zTz4U{{3YittEl};|L(-HWz=eCsmkbfRLs{}88 zrOj7@G|N7-tVk~2rnGc*z(TDp#ExFO=&u6KL)5q(hq)zj!A=eq3N?JBVOg`r0= z)6>1~{BxVI;C5EDHf|?Re<$7+$cv20Z7cZ*_xy^%6}G{$cWAn)1K-}iFp&CDV5Kjn zZssy;0<_(A$?>per{=H%0O?@Ue4E8r9H~@QpHPGan|*@Z77zzK-l*McQv-l#?$hZD z^}DO%$bJdXcGT8TZ%Jvl-VoJl4mDp78+iR}Y*>yoj{$j9QVg`+^#Leb_!kCsqoh>R ze#vq;=}{KiE&5Wny2J~!LRjZ)fFGR84yqk+C&MfraDU<6u;YILOh2;VJSf6av#16_ zfw;1_9OeD&f!s&Wc{@uyQ>)+|Tt4#<*a~%@@&52v#@m7%Q)UK?S*FDzaL3e?%|f`0 z#w^U>bXa@Wc+dS2%<~hxxjSE-T=33IbLpOG9!;aKb=1^}$NiPV(S>lN6~Z zibZ>t^>u7fH$gD?_y9_kD3#rf#Phs-?n76d{c7^%1lF53*r{HQl(@;?Fo@($i$6BN zWHoKif;g&_EfOAWVxOD@YgZ3r@k|9)LYFPk>)iL0+aKBLxiSlL8 z1^E^m!s>#@NsK2=HGZ2M<6(Cl=Vlb^N^nt-oZwxr^GAR3aljW){;7Z1{2Mavl%R#-hEJZ0WGDNC{18*a1#Hc~ z#hzjiaKonOpsY9}pZ~gF#(q?^7gI_IKC!r76_jp7qxHm(zi5-yS61M+T`}-ja^=XR zUf87+dhRVuw(&}+BAylYr2o2rLvIzVs5Hh9Kv!au4wG--N)(m{OVr@oH7-RjA94al zP6pevB@}8pQ%3~91;p$pR`$Tg?>ZvRmC2Z@{K=={_qT>|z=QnWhzYm-{>W`Ad~WeJ zTX|94s!dd9X8U#9Ez=Zt(6O#Dz&sW66=UCMI2VxLh?MQLXQ2zi_ zMFNs~(v7B@XMAMjO_6Gb$)KZ{OI}Hj;V07;yR%)?l$v5GDLe@>phRi?ND^Yg%9Tz@ z6w5&{i)OTh1bj=DtnF>N?E-TM#M1_Aw*VRu%3`(>zY;YIayGf}2Jk@Ib~Sib09Hj% z3!P3Q$RDDpVYM|J(^pYe&6|)i@R8R_J=m|gR4;uqrnI#ZV58}jd~%xd4ipCr3e(P)><)4Uc!2xoga5; z(0=2mK^{>H;YNSJA{cfcPKucBw7y{HkH$QV$@Pd7S00NzA1{4(Ui%9;zyfpq|MPs{ zer8*>{mEL$gn(~X$AlDi{bSujXtVSm$Q-85%Z&rK5QZccg_$oKEJ{$>z)7Putd3mPU#+ojI!Y30(Hg z|M_D>4|zC-dK@h^)m#Z11JCYKjG6$1NI3{Pg_BG?2q2A7iTfMM&?%b)4u*ncm%B#w zsm%~dww%&8@-9h7*(Lip38_MrKrX(hHV&;1B6uVtV=i*fW;a< zrUKZK8@5;-UaHcom^NBxhSW<`g(?0JQ98@3xZjCn5OkWW#GA=HNaz>g5Hb?gwoA{f zA%gmAD!{zF!?L8&o{8z#H7lA`ZK}|_B?~c7h&b1MQP@C>JA8>2RWl)vizT<1Kn z(P`oA)E-!%Qm$h7M@*#Nk-s4oWqJjRwa~8INTPB^aaT2hDT6#B1^nTI7LWp&%{FZt zFU!)W`Z~fAhif16-FY{u)~|P$!t$Au5_6zMv(~+0(uImWwN7I$D@;ufQ}}nP63B=>w44TLQ|ZY zHBv+0g70Cqn^*hoX-aGvF+otzV~8zk$ImLqM{ik);Yr-he>hT0eumiZygfHxJD1y; z1kg+GErqV)*?SY0@|T~K1!=E21Ph&!90&8AXd=n!Nu3s|+yq`Y3%0d4!_1TIr`+X!F&9*#*?es;VUE4c|j$D1H$5u;2E_|>hz`4S!dme{+> zN}-sFl;flDlcR&EY+6_tT|l>iSNK z32o$>@)95Exjb+-A)joCWx3pAp@aWFlnEQ;rMCS!^0dZi^0gUaTqa*$u1)exS`aOg zbSUXehyP5wU*l~Ipw)8~jpTg1tsEHJ>8J?f3}!2LH8N_hBfib#wU|X%$z=@8&qL7+ zCKcu6&7-dggvZ3-_(t3+2r%J{s2_a|i1~%BIJsO2 zB%{IUnj3*%fqp4b`CegPnU3wPJ z_107b2rAmCCq@TmV=Xk_=%Dw{d+bBqlLJ8x?TiWWz<(&IK5D_L%{@NgHWa2r$z56%H90XfpZxX5@7I7MIJd`1rrW0cEss9f z_H|MJcg;e7%&J{}`Zui0az?<~uNTkEK5LZE4ydvCNG}EAm7m%9H>Xk=0gUY$#S1I* z*p}+NhT47!Q$=U{E@qr3rKe==r@uqZWD2{l1wpD=)G%uTQCz6slzR3CY=-2-B zpDx4L;i!kifB*qa!z2q}VgYm<{sFy1dC5!E+X~@q6_lx_T50c>wR{QM)&gC|A<;5+ zv~ivOHmm_W2I>xXaw2#Vc1<{~;wf1pU(tXMWAAr(8$*+79*+O|c>iJ^%x5A!9HE50 zQvl2P$LQ;2;e%Qx{i%BDO(1EKnccSvoR&@@rsIEw)o(fq<8W&Z1!EDg3qNQWp)Ds- zU9ZXxL~ju&v=q>kDfv1wQGJp)KzwRXV8u*kcbo`uN)`@XJmI1@Y&Iyz~+-z09BjOuCl{)GGBe*uiY*XpM>C^*ZZ;|dNQi?wV=dZCPBslq@2!CL zfa*$o4@U}(Y^@z+m;@Jeoq`0D>m`;;j^@j}7b*D&{WAeRSfyb)?828Jbl$ z;6jglx%ZwnIKVOV<8u+zykcaISHEMpz}`IBY8DKKk6t_qL%v%fNq| zjyCXgmGW@ifv0~d9?X`oHQ6A*t6O}VM5zsJ_`1^>^@0hCX~c8b5@e1HK0 zLi#@$k)@fDshKm&zlf+f^*>$%zIhcS+SUP;7L7_$vstkF=Jg8l|KWA#f4u%A*T7mG>Lo4Cjq3$eth4$Ml^M1jmakE_>^8EFqHBXaQ? zUoLYw6G#@p6n3al*Y5TQg%fi!A?VB)~M_r8w`U zzC7EP_*9t4Lhq%biLC(q@oZ9*%yNzyjc^3RD$usOwLC{VTH{*BDkv*IwnHY1{G0b~2thlmZrj8yrbSn?p=kc}z@I+WZ8X>Xonrty{Q$od+ z_7l>Qsp^B}g~r@gCz=3L#P^MKf&h$Ur2fMCK?RNOk@>~hZzBR0fbVv(%)9l2WF;KV{t+p6xytqtiFQ>G3vDw(vK)DwfuF?)hJa8{L4rleU@6i{$C7+^0V82|#UB^G7@N%U?@N ze-(_MuIX~6W8yO(Zh3@Jt1_(HDbXq;m_H{KuLlykC#j6eq^l^X&mTc~EN?md^9wLZ z898%N|I=7u94x?^efGbon)dPAG(-ljDVrBo%&E0kWp7BaFT-wNAf z>@+-F>vYHymul$?YyzoEl4N9e+H4Y%OibG(xC@cXrWrKsLgU(wM zoDwp&=1N9|6;JWa8%wQElfutOBBJWj(CnJiu#aGTo}un*_M)F5^ucVrvs<)x1G8#! z0DOSpnaLMiGOc19*OsDBII0%}^bFlBa=(tFk*0lO85F=U0P(8jWTjHM~C`k+wu;PzpyH4g< zkOFkO?tg%#VWZzLrAC>60+yhiss!#XfcvmHPJPBzTtG`&E10>^2~p`ls>AwD<}3$r zoUoNS;;o}L+uKo$etpgU(ODQ&>CMD}9}mNa9kE01#wn4nt_=HIqb4Yo7b{?*VJGyjfpxct~U7y zTx!%vNqONkj=ViwWB#LYI`LX(${f#e9vZ2N;W0cMiK_5qk= zIpr#)$K8$fFiy|8qFua}Io6Vuy)No*qJJ~KmM7Iz@At?`=$T1h0kin#71}^Sgywy^ z@BfxplK8@btgar4wdw|Tae98^@5Ikuw`je;t~i~Y znu3H~m#^!RJ^+o?q(v-HARsCD|DR1@{bv&<)1+O# zZEe7H&MCmKjTfZLh{Bzs{(z%ZZb5E_B?%tYj&y9JxAwQWN&`^)Ly)QTwNKVIsKHJ) zl@o$yMM`7)xq(+ru-2sVxVy{q%d8gFL8*Ib7qvRlgCq8R(QUoTyVk*c@Au z5<(D3z>zJDA?56PVKj>!4x017gAhcf{a%izxcEmw;lu17pTWADhE7s$KWR*tc~qQNg{UsgHkAQ?>18jwms`dc9D!Sydq?rf z+m8rQ{c6QI=nIo9O$KaoevvP3?ndwH7nFmqIIQ`~2+ab?8E@X1DYFa&0q@JY=_sr> zRLLmhqEARO20rRM^#NkbfoFSANw8AgSNo$@G-0e#TnB%0gJ@t_rwgg0%PVZQ*n*Ke zsgWLI#@qnf&MZ)(DEAGd;n2nsjJZI|3HSl|Dc>4ph8g^ zfK=t(wsr#uJ}%q0SnKDk*C1S5OdQcQRVKa+1H^hKg&i=wTZJ|0t8Xx_B5N*9A0#v2 z6<_6Iyyk#2eV$Q8Lw>6G?F)=XojZa%41T{Td&hab--J9C}V!RP!xq%}Y}l z=gNks{PPo-BW5ZFaq!lYFJrTNy6sx=I`#bMt|;b&;_G_F@Ro8Z67@R-%n2b;b^rr6 z{R+ka{Fj&lXyb}rGrvRmB=rB+;`!ef;J?MQm@e&b!129!z+VZW@V%*aE$>Mx58%2W zfLU0DhlLrtXHm?ce@X=wbE17*`;n2a)mW&koth!kQzcy===>!p_x;<(=-foEYZ?lWvGuB+ww9V9Snnqv?F0tH@V+7+@5Nba{i%VOeedlt0V8T$Z$KX`3T?2?_| zVUhL~_t%*J|0Q^dSf^_$J#ZWS_Ut39$Ed0zHt<-_@023lG&5nYhy`EVEQw z$3;f%n@Wy!(A(_x{^SP!omGjHYj9pbAoCtVZj4>JvUi5HCW8Tx%{HDRizTL@hZUB; z$VHlhn9y2WWE~<~;jUf*LL8EYn|ORfc?@in_bN8KF5)MUeNx!UsxefD255);eoJ}D z+()7DLJp0$8RybJ%116{41yEm0Xf6uXz>0+v3Bm?45*Mus+8ywt{~HXcKo6(D8fxm zKad>F5q97(crXVz>Zx@*y2>n_d);sQa$E{KKzU65H8bU&W^x?P&9v+=C+pAB&5f~q zWnGD{y-Nu8-oqH|(P5&1Znb3}mXAX0u@3e6BYC z9_hjEwf<;d&wgWxJ`79#N{jlj^rllq7GLV9EGci}Mb*EIt~JR1E7Vsa^iX!Sa|e3t z;N*1S-1Zaj&ZXDWGnvRR08+uUJuo*qw1wkr*0|pH5TOjsDX$mH*@6qUWxwMV6mPr; z@82)}vf;kiJWJSHsCb_i{<1$YQF?oKd+%FrGFRr*I22SiMI(({M2o2UG>I)c-KD?9 zTc8!B;w1qc#Y&teROf0l-;|q~y07!93B??%GVBJ}Gl286V8K-bC7X8j`}0yG_!!3e zOA0$^XqUCP=h=@E2%ts9!)Et$mx)qZitNPPye3UWeL$olO4IU}|I}o<_zYgTPxc1k z!)DJ4;Xu2n6Y!@OhYp%BdQ@bD&=V})1YU-aM!=u@C*fZ{81!6H)M>j)m;n_h&&nH1s8KbbM&(5Znf6qXLPU)$=6N~}!Sc7Kn+PkNQrOF-eZjn>M*t34L} zU<4K0YB26b(WEN8AuR_%1pm~aFis;7ba$F|r|z#W19eiEtF@@8zN^wT^{UZOvimKn zQUz58gKJ=hrSbMEJz8O=`;mqjvuAlvkz+Lj$Z+qNHr&QW>1@q7Tes&~_ICMi;5N3Cl< zYgL43GKuR^asY|bYFhU*W)6XMAX}JTrR-8tR6@FR<3s$(Gt9p;r6}tMVnwP z7*Hxq2tT4zHy~GCT4q0GI-;O*_^@?YS{~7%&k=ez>@9EU zOSg0!WOn{lw*9{l4^TmY(9`wrT-0~v3-|xzBnHlAwl0kSFC}OA_eac3ebgWm9K`NN z{3o`bt5c2}!{!Pum~y@Zbkc?Ls_zb-&%|72o- z)guhot2=i4UJD8z2H85o%fWDrGa&#v<3HtSQ*p+lqqWpR8-sqO{3SPAhV`11kNEw_ z5`I{}$gG8}yfd4`*L0~|VgVY4sVXNSoGAyn-yr@PkAJ*o#RL9v5IK``xFnFHoR=XTZLrp#K^%5NNRgs-Yo)J4HI#G+E=!k`;as#cIU1SXT@$zW%YW=}G5^ zl(s#p1PrfQKh7;n5K9~+MHSW?szu1M2?e8>8bDxSg|BU2bDyw)TJ})Le|&vi-)Emp zZu-n{_j&+kxMBE=S#VX;^)o&ILmYXA?|!l>C@2UOjuLcvX-QmJ zSs9I8N}5zMuK41@_U`VE1PG0gP_eze{m_-?6xdn6!}ZkG-TjF*i46LSemBqO3ea9U zJ~83ojl`AXdOA<8U1tdVYsBXCt`i#XAwCzb5P+AXa7h?f6nh`^!dk5%dwP2MaG&>; z9TgP?;i#gnu0D+Q!U$^5gavYce~;LKX^OJbgq&c=gAR)chV!- z60c&lrzWT(9dwfQNzcF#H)F(+1>ld#=5eEOxY_L|jz@cMvRa^MYHntKT4%D`Xo>ap z^@Rw)Mky&w^GA}h11M1Nu&4fUKs|aII}j_KAM@o(gfU>9A3h1P@@(_&(=;$Jn2y-8 zKaq2FcW*H>HJ$V%+fAd_Ni}KRsJ7j|yK|P2md398n<9pv^CmbaW|2qq3ZP5=V~#La zeG)0t7}4`r*}!B*AWzZNio)X0!b&WgMb&da`~3WTSwq8oHQY!TJg$ZxY7Za-a=A*Y z@u9W%m&BgUTsoI7WCCp6GmrxB>>AAK z)wBG(2I3|sr~)MsjF1bwq4kByZ(gAI&_A~Wp3rXe{8ZgGM>co@zJPS*0Ax}L!@I+= zoyywU+AjCY^;n0gzB1*aFhW89_a-uF8j8eFSj>SOUzTCdgTrGguoqu>1XC3cRFkr= zuiUqQ31}&q{7zN#>3QhC2tjCAK{3by3eNk<=UcwjMUwHx-%|@0&1AD&RXUx+>+wx@ z$5lT$VN^Fyj)JN-K--(0fx-Uo%l%0Ps864w{Nmo?F%=b|`sfn^kZ&-sGI5rs9=Lx? z5y(b->+MPsS)frmlM%?(R_Bk5l#~=(mW+;9MPdQ26CH={M;uTC9i3>4Jild7f6q3Y z7hDnod{2pEj$M6hB|luIRuChn)t1Q!>PYR+^*u+!@1e8=SXvscM|^Hw`#(9pGaWM0VN_u&xu{A-dS3ryw33o)jg87N90&`f1l7atl1hI5xg&&#pSN4S?M7Ct_{#&;XTl2TM0 z`R>|qUL~5)QRf`g{f}`2*zP2cc(jmk!1Gr;-eODJB!%NTIoy_a`S z9*NGUaJt@>v6{O+R z?|R-Yp$hSp#oCo!FEQh9Ev{r5+S)vYsOtc}4Ty7Z%wj&qM^E9Jg@VJ#mzQ>L`=o*C1?l6CfVTTwML3sWlR1Q?7Q8|K0p_t3JHYj9meTKDc(B>G}i)WNz zSdSdkWSl3zp=){JNJ2#wk%mr9O`R?X_ymH;?A+YpflQBmYb_t_uReG8 z;Sfz(3L?{qaOWuae!iS^T`MYoM%B&*OdM>+TY+18)3vIxL%v6=*CruNSzYV8qP{m#Yb|`JXM8E6I>lEelMsx30bL8Z-w; z0W$6z8d$c&Lqo-B&|su&SqKRUbJ;b=&C!ZckoM^Ji{3)q8&Umn0!Pj)_SHtY8X#xB zTv69IHrmJPrJJ9}$MKsy`S0o@t?o(FF| zmQFOYr)faRV_?nNQ|Hyo%Nm=!&jRML@F`OY2}P(y{Q@M~oo056D=VGgB~Oo!OVHlk z`c@oSUOAH-=aaV1rPc*WV07FIv7(Bn%z*Uu3 zHi(rh{L#4ZE#zUxdFUZ6? z9GVzk4na1dPS|2HW1+{@?TTd2v3i?KN`bI#-|7+@9u@{6jiAB&h%x~g1DqC|fJ8*1 zNxB=)S(Q$>kQRy#k^sOaCtk1w4dE3c!z~vXM)9>i6{7vFm#3eJ@ap4p0fG zzDID!vmU&HFG#<)#z&ZW91H?uCt_n=fh@!}l;+6UcEc0n@?Hpfa6~7=48%jsBd^!X z`n_MVWPLen$=2iDoy)7?5f59a`a9ba`g*J7;QILj%Rvc_4jupod~~UR_c9(=8H=!} z4gC`R&LrhpP~Q(xu!T@&3V{X#{P=8(bw+8p0|9#6y%bWQ4NNLZHc40`oRU{q`}>a_ zWQ~sU#_7QNt#$5D=hXz9rS=qh{5m-Ea zP|^v%O4P=?$ppX!LKS|sm{SX=p^}q`}^L$O&mqm)bV~* zN?KYPcI+^*lamu4%2I~j>=OpXIqJ_sc0Z z27Y5f4WJYOR(+Fkb;2JgUz5Y(d+f7N3C$J-jYxF$57ObF%ZEEBqILBX;fw$2)K z4;QOn)QYXAFSMN}q8qMG#!osBJbc3TuT>Hbbs&sAbUzRB3a;4?e0J*;UcwY_EGjxW zI^AA>e%YDT4maAGR#ZF=OcC<@ZI~`gwVQ4qdq4*0kIbA=c~ug2A+dZY)38V?Y_$07 zY`j*Jg3it?&eT_}bh84L(goq7xzRD?zBB{>$PyC+p%GJiOvjq-z@@aRw(YZlXlNPJ z5o&Hvjp;WS5CMlI_CAps+{O)jExaEYawYC*6Guna*E5*gIdsO##SuE@K||D)t@&%A zJAjnV*;Alla2U%*;B6Tq<2$@>V>jFgZ7iJxY|cItiAw3YYGMYWD!AZmZ_mJ>*YP_8 zy9yoYf;_)zDO8C4CfXF80*fr3l@g5VM)()Lpn${COV|;DbW--W)eoenr^nX?#fTnq z|MpjitSl_Yjn#*Vq>?vpoeXpcpkY=4fGqPqi4f|~J_})x=@!^@ZgwD(L3J+wp_pJy z2E%V<#v=nA)>cgDE7@qii5XVvVL}pi;HJ-YxFB*b0JGhOH1Jiz*Q2g9GsHTQ^2nPTZeWz zs7JP`ECo|?sF8RSgFyF?_^N|AM_sm=lf)1T=GwHBlpI&CuC5YuWKF#Ql*}haN7JV{ zVDo(MA}x|{qmb3u6pFndl#VUCyNtYegQC*F=4nm3{)9Wp;PATB>2-VmL?9H9b8~Y; z+y+)sRYk?;b`JTre&3|W%)`M{XrW6{<{}ck%45xSB8&BM-oekxE__%T6O~yeiS=TU zcsphb0&-`XE$0fs5rF~MikJ+#TvGy*lanbP(jb=4P=W8{-ya;2$d*u4I}W63g#3?W z_V+AMNQ9e%r-f3*g%bI!zk0$73J8KjHPzMleUp%czCt{|g!*x2yD@zumLZVrpeSB{ z2L4$way7{btZnDiB5I}z&f|0-Ae^8R^V|j6ElAD@2? zd7p^T$#Va!=0%&T+0a>|o)SvIxM1KB@)fTwC#UTy(^b>Aqd)$|7;SB{QZsb9(b}HD z3=|t1I}7XHl|-~B0lD`vrayZ|X0L$o6Kbk9tKbKatph3!&iFI6zrR2K!AQJfxA)^& zHJ}_=g0bhXF$dtFt*$OXOj)C-H%$oMCOFdnSg}b;Q3nfS z+ugE&*M+?F3ai`wp?57`U%p;O7gayM?i1s~#{v-L60wwD6yEx>vE1G&Pcr%iN;eyX z66ed+Z>O9f*NT9TbA#vlz)tr|65#{~&HZ<=3SR};Wk5TP%q@8L%>b<`}s;X12z!xQmj#_{^j$>N4jW^Y9^aI`8u^&zcwvi z7X`Jvx%tti|2fUq)04}~$46C63~VZsJzC%Y#mpbR)9qXX1_9x7-G}wt4ox!^6&4nP zbn9O~)-w4Q>8GC|;PW(5($d->PqbPc}kS+vW6f^|t=qATQ($lf@ zHU7T5+$-};wzB_+F3#nj$fq}OvlFKJfT@hBbzI`RQ zg@whJ(x+|Q5ydkfpf=;`BgAM)F}aGRC*hn3aoQP+3-czY%+ ztEi~>V%Y6P2i-~ij%Ix47Zw&!P*Ia(B#Pz?nXtZ}a+L$x`fgKGQ}XKc-w_JucaN2I zb$#2*%gaBMtKPo8i~3(*Ay&Ikr9eYv-%%Hkw77y;jfi1TBP~8QBs2qE`VQ@P*=HqL zo(u8y(9d2^YqG7@@T5FR-L=Ew9u@pCpYWIu&=~<%t_U@{>)d0Jm$9VvW&_}@gaM6 zE}<1K;Q49AU>h$eRR(%v9b0M9PRVTLJ|_hrU#Hu$7N=3%y$@dvOxIQx5h7Wyry;g^ znI%44879?YM4miC&H`s4Fn@FYqQBTLRbQq_;W+`jRu0xhr71B&_={NI*h5L$v!&LO z-X~gCSoS7x7wAR zHG6@&SD)#;D&Quox&-}N)o|?1~rwG-oRfHD_*&c9nMLh(-eERkE z335aT;Z?!qZ6XHiQ0Piq^;%Z$H0RNR+Rb+?HBShit4;e>E%pJPH>8k6Glnm$izk=$ z1=Qruh*odX!>{8~vdf!H#BG4TN+(Kbf>T(uFf@HYq)FRC?X3NXe%U*!NrU)%lxth9 z4-Ttvm;D`~CeD~n)LxJNJgT*=mDXM~25qN@)!M#+$XLV#LiRO$Dz zEZI3CN1$Xjm7+t!%>KA(P&oVI8{=Ty(bEUjmBvky{;vN~P$W7#-hv9KMv!6V7`EjX zqZ_*27szbMH~of=fgw)VTn zv-nl3%*ZC{7yC-_RP_^paT|kV#Sc|)SMwlc{kMfUEo_t)6^ zKFq6|tlRK%h2MC-d>LQ4Ol*gt1LARdVRu zd+Rq`Dr9bXCf=j~21jha>edR8Mft-ZssKb<&*pVLx^gVVrWS8>A8Z3Ncw&abJ z8AYZFl)=2z=;W2xhPD-El#HHQcc4={ihoh(kV3M};aGYEG_p=>yx9%OZH(eL9jkxp zQeCtXusS8&Ux1>~R(W%CpisId75vI{2jHiEl!ecJA3GDH*UpSjS80Y^x~9Dd<+__A zaT75?{=!QHl?(pxL%X6c=C z7ik0cg6&67Df9%LazcSdjjmm`AutA<{aAFm?f)TkPsdw=1Cqw)xDKPri8ZD{J(TZi zW8eWoO|_Wc4_0hD@Fy33m;G~XR^2~AJ2W;}a8A!DzgG#~aDgq^e7(TQUn0-G=DM@8 zWW1>pMW{k0ilj_QRDIqLi&VF_D;VnIHsQCX=WhS&nZA`x60PQ%+sMWAMl1l(S5mKU zd_iD_oE$Jfi@DGHbTubjY!kOfA#hO%;?+_<4cx9c-RBdnK<=SSdr{Tf*0MBMYAqTmN z@#b-FNE`Q%-}Z-Z9{a?vK*;dLQDnN>jr_)%PPq>R-apM#+*3|{FVI!6JT^UY<{Zx- zWf2ixi~$0@`!`X$+Lnw1WTO&8OTb@0njU%HWEH2F6LQhuANpy)Ug-0h8kbEN|Nf2| zE^3H96<#Kc$RoMUcTYI{wM&XLBhVu4uL~tWQomMN#x@Uj)IiY@F;t3FH3wtWa#uMw zN4aCoWW6 z%Qd$ht5^NBo)~}S@{i4@er1_0&#!P&?~^<9Mf_wuu=N6&{$pg)Wx#d_1OtOGsELCD zq9+@8+Auzyoa-qkgMQcmLLn>r|Ss*KlfATX=JsI%+HDH_S12=HK z5JiEi&}0KI6&diRAfSR*Kn0-l3=lv}baH{0vJ7IU1!SQOLIJD9-kTIj3Y5eoK8 zK_@q%Yu_h3c!|g$Ru!Y0H&J%-TcCO9YdaCxDbOMTww&3kblHIr+7Z6rUJ_7()ZF^(~qMk^%th CRQuKd delta 8385 zcmZ8`WmFx_w(UlO`^Mef3GNQT-QC@S1=+a!PJkf6-Q9w_Yj6!3++8j?=e+y9d%Azs z=+#xVM$eu#=UQdk5a2WjTxB^ZXiNYM01f~EkO5+F8^WU?0DubE8eB3+Q2n=2+g?^Q z$-B_k;OWN2Ppz~CeZ#r&76jIh5K$lF6e?h6c5d#72IJ_cBE^ADNVj9{cQNLv)8=`j z)*pm0=kx7Tk+h+ubTv{XM>;H>E-0cQ5%MH#kn`;k0~H3Exbv3gMI&S!#70~t?)tnF zc&*Vfj=Geo>I@O{wK_KyK|@Y80Doaz0cRWJpl)0SR4(Fn5$a?rsVag;eXtiTRP>Ld z0eS|Do|f7K$dU;ysTbME^pKFRZ9fulm{(Pxwqb##BNqWuccRkUh##iay?k-fTJM*? z^nd#%r!cZpKMtXoZV?)yY1G2&L=_@pRXLhpVXcxX_@;tVH|KVq5Be!2W48v6?yM)) zEF^leLWCMYc>tB|aw?(S`&`EzWwe(``n|!L@z#Z)!kq>h_ZP5ty#+`Ho706_rKC)d z<&N6EvoK(_qxK^;WeQf*Yz;oJ9wpn~&>w?8T`h5r_h$ZsF3T&4+1Z_RL7a^-EOGQK zNH<+(9%mBvm#u0x3XJ*HS}oBV7!?Z(Y-x{D2LX)%gxv^Og=kgy|9fgGpCC9s(<(xSbJmL|&lny<$R3cmK!5`c4t z@NQLf8Bq!e0U0p2+@!min({}(I1zIsYd6 zQQ^E_dN9Pc9m_S@ zSu@gdqzv5U6!~h)@fE*e4a@SIbPo~Vv!xP~uCIko#Zsg2%3Kjg_X{YV_Hy#77BKCw z29`6WMK$9s!QtmFQE0*FJrtX~WiJzXTE` z&+5aVGj6im7UOjc?FhvmlIx>fG?kT5<$nwyYd%+Fi(@$#TcTPNp2ZD)xSw)Otrkr zmdi6f4cE`;V8aZxnZ*F}qr!^8xmPdB0GoyvtKjSo?`Ee$#^`sgBlxR}m=dJH$=RES z;>JCEo@F|SdoB^7Ud2~E+n~7+)>-5L=Jik<79pkM^s)Opafe-N7-nTB5LA#}=`Wjq zwP3WXZav-r%3i*$NK2o~Wv0wMMr{JB!18jTgSUUqTz|Q^?t8|X9$J|(`w-K&tEt?F zwnnjQf!nkic(J}lcbpC4$bhG@>^0xl#mgJqk-`?i5|HP%U zqWNR>z0ogbM5f8`W*d}L3W6f0L&nJF}U{$zXWJi|eS^ zuc=#t{94e)WhH5;t@PGrDBJBXNF&#<@JHB0;mIduh-E7{8Mp$Z6s}aYTkUNx*n8Gf z8o8O~s70H*BaR%5J@CIoY?4Y;plgSjFpOk6WbsOo*m}+!hppGRf=CcOpT7-}o`TXl`SjSsdZA49vj_`BTlFD&)n>f z({U#atDBBdro5bfCd9)Ey)+vb=^O_x!TeihzF<6+l&RaWCtM2mLZaY{K z70+?nVDa0x6o5?c2N$neFdSPY4)ndi*X|r*nW5?*S|8|0hnB@&6=h4!iip{v7^d$* zzbi{9l#@>%;LI4~D_}QMY?kg$+1ZZ5fhZQsQZsHy#^VmUG;u%ku z>i`LnI|wy_t4UZq?Q>Fe&kBJ_Kz6E2Vkq;Ojq54*KJ_23+Ea;ZnC1FUmXxtTj6g;# zCyUI)h5p%JM3G4B!{3!L*lQugsULRx?>|6`w+(-Ea3Vh-LC+4*`TKg?E`U(yYJ_}_i(PZ*@v|f)B9!B!9ocheMbv)Gja&1_?;dDV-H_aD(QA zpLAUfcBrCQ_8araT$9b3P=KAh$c;iH{BTp&xF1znfUN+o+t|aynOPPTYCO)T431>_ zUHhXio^A`>NEfsgMHNp^3oeZ=LOhyfV1oE`N@P9M$TPrD0IFKGhC5d5m*5v1}oyGjoOvB4rz8u9fRQmr-3zdj~y z&{Py91_{}4#Ln?6)`PlQy2g+O^i_+;^8_FecJ-(&VHmR&jnqI%hh-)cTAVP2#|}Z{ z+rQAP(PF8~A$HNJMb*{r=f^1#8QyfZoeIJWK<;(|taw}%-%p7*qp3eFbhyeKf4YN` z#_N~IQGvn$0;MO^lcEawrb<3?q|PN=Nkj0csdlCyIvJO~l+_cBI9LaY?`Zi*%a%Nw z<#Ll$r(;ihAti#&?=l)NY!#jPFY>282OS|Z3UPLwL1@CEK5Ng#DD(r0#~nvyVGA|J z={);gXSk}@yYTM~^G-#f!b{ya^F51|X8_g2B_9Lql2waI|A550;TzEkIe5q^{N=DC^ z9g-%ukZLb9{`0q_i2zOxkq_6v<{vMC@>{#6>NKbF#(8&l9jz-l{Fl{MYP58V>bLn+ z6rM1==@)@SlKFZsU6N;aww!u}JeXr*5fhFI+GX8`aShUm&m*9@EC+?v@z@LA%5B2`f-;{?w{$m{4vYBzLK!NMQc|I`Ql+Rv;pi4Z-wcwzV~DH8 z_lXF|tTRlXtxFv9A20`>oyM0t->U!7KW{kN@TwH&MVFoMw7a{jDq0$GukRM<4&D5G z@}t<}jN7PlmB@4i$c*$0g|s;iHK}}U3>S|gUseilo|};d^vOCgjer~%!zK+Fg(5~p zd+u9xC`$$R3xCh2&o{n7Z+plEmr3OwFoBIro@8QiP-AwVM~2=iStdlA3JAi2@h9OV ziPa@7D}_a%2^7(Y$5fy&&{;Lbr#k>A&&pg0*>L{Fayh6gG@lSgEeDtDTTvQ#@p~mE_gB z=p&TciBW-r&w3aNkA#1_XE_hUTk3%il*cT&Zajz#Anb-NVB4a2%aeYtzgF&7BBnRO zlNJ!e60DCmes?MGwHEHrWTf|Sb;zd)ndhI|Vp|K4n4*t-34w|YP>rF%_9Pl9#0+hM z!tOLo_B3=vdKha8p75VhYItPi70 zO=BCFkFwlD5p6ErPQ(Z`r=PcKI2~-FMtcOWrTw+l?j2f?lYfPba8=;fw}^j3*41ff zIJdJS4NA{Q`K7hL`|{-{!zes9s!})k^6S~IKGtpbOl!ybGZ>u$3&HCA@zXCz03Zqx zOoWRIT5)~Xs$pmRv!2GyO3;fnMAROIk!xI@nS6c#NsO?_kdH~{TPJOhZZfhQsjH-9 z*C5>!Tb#8uk`H&c^6Bz>a>74ReB*Z|W*9e^UW6w%s<1n(7$@lV*sbK=`eHzxkB_^j zXhrHi*%J!B%I1HH=1XzO#8s)QTd?t_XNLMfhCsRm`J@=u&k1R{v~a<-MfH?pxY~#4 z`#$#19C3u<)5Qa@X#AU)iI{eyaZD6Hab1HQr*l!Gqymh?RTiZcn5D)O>PlhL;vfSx z1ZT#1X86<7V<{y4W_$^4W@wlO62A<}UZXavGls`U&Oz#V{o<#r_Gg;I3czYyk{jRv z`9NB-ma*}S8cAzImxhs2Uz_$2!pq`iO{I~i_~%^Rmv`D;Ph2<8WTi`W-&b3hwCMH4 zB@^h`-Joq~TsBaK8n-aDhke}Ml>VOnQBA=e&CKSUt%B?+p0FC{=MPWituFN^BO5&} zlXqqw6CPANAGwT=G+`x8Lz$)xOFj1hjW1(B&@>x(eYNA8K!(N6!biISaP+{uGTILO z9uvtQcN`psd1vO5+}}JpM~lwJ$j=L@C;`|wWVx~q;dU86d)en^uG#g8D0b=>e`BUo z+t8b`SEJ4K*@SBou`Zo-p8ZEcqJt(6>xg_?apvmeUc zdOV{O9x~WgFd2$>g8G!&^lhJey2)jXP}!##C^mmMCm5)CEY%Qt?(eV4HXlv(xKT|T zb-yTecYV~<-liGKT%z<^w9l39Bn&kwa?iWEAy6bJSbH zGdG|3(*5cDykvh88SH&Q+}rY`GR$hREM3MwH%3b`-$KuY09^b&GsfC(^?JxV9<#r< ztg3?E@O4pLFh>P|g_&jt#rkB!YELH>&PLrD7dAG z+kf$~(}>t<{x9l*V-|`{uc7#FApg6Y(KR~7irX$e?hDz(iY%Qn(-5TL;;ubS*4%=` zl6ERp((i>a%Ab??L{-pWER<|yi`lB@peEuM@3jbdQS+$qv$_@`_9M)-^}aabZCMXN zI!v{qMp6G8_2W``$wMWFfqP()>{4l6d5ZSBmfer4b2Jr&iI#pdUqJXJo*5Ev0dkUV z_o<(lszbH>L?Dlv#MS`XCZmRUR=1F2#yV!KqZ`z;kY!<{mc!OP2=L#Ph2(6bRv$wr zw#!}=%+B%$zr8frFw&AVGBP%aUfZ|SG*l4(ZX5AsktMTsl&d0B;JsJbqc$roK{2IA z7P92rxle?O3tG5Rqz4hE+P*P3JkcXDIY&I(9Ya@r1<4}?Lri~;7%+_p>0BuP2FKHV z(gW6?b#dgWzMl&yhSkR4JGWBdnAd_KcktjxAxPEaEQ*TSuAZ;idY2t9GGHc^O<{6?Q1l?=EhZ+k=yefykGi3{r0pR7H@)9`kC`Lz0_Od$Ey z((p^;=Qi5S$GjXXnV9Hvli6P?IayC6yFcgA?oa+3^qyl#?9V@F4C?OOy}U}J>Q^st zsU|pffv5Xz!v88gKmq<;N(K{R!PQBNAi~1u2(hQreEbLIm zD?&L;DmkfHG2!wC$vPDGFbJPKsu61PNJz;+_vn{1ewEx^9(#1p13CHi!KYI%fprz{ zn>{xTMdA``yeg-n+gQ%bDZXvqvZkil&ePfN!kP6+bG&rk3Qk4R%ZeL6UvE;h!Y$+q zLM%?8X%9VgEShFBL0`vRg`Db$Dr9_m33Q>fomJ4`)oL=eBm|Drbq319WExGVz=OQC zj81%0=PQ2<=SBlfHEZP0lW}U*bS`s^@reBE!@MHb45_x3I&vG-vG;?LH=(Lli0pL7 zHD7260{9g!@7omwGz$zpt1EEq@46)YXS^bPGQ}voq*ky|K*jV8!R5rr+g1`ZE8NCD zi@CXw?klQ!Zj))7_FYE8M9UWZbjq@9j569IP#T?^BU-C)$4lL4W57&jeDSk1SS_N*^;7?ZjohFl& z`5Zv4z>z-_@mBn+agME?lBD7=(kbVh_-NV$Kfn>lGM^-4tlx;tB&R`ImxSTGS-py&Drz%}Q2?F|I+pWR z;<6hi(7u*2&O#hL+ghtxn%2e224&aV5>hk`))AqDD8cIG1WADnsSB#-s#T*8B>Dr& zQXf_?4f8hj=k;fV`yY@nTc6nKvY7GvY(ZCY2DV(UddYbkL;{0w8>_A_>Krg$m^vQ)d<&@lH`|BX*DF^80| z;jkMZH^2lQBPb-JLzwGhfyC+7qJI+GH$0o+G=6GmoU=nh*M}lJGfB}8xW>-hr=)A& zH({~%o4)RxSKiBd+%dfnV2jJToNlEFNa%kuY$rOKi(mQGB2dWkGwxCoRj6Bmm#JYD zuHrY@1K(!}#!0cV9AABMTAdIn%6dWrSLTFopxB>XA&beukb@DV$4AjFfnKNd*Ya?> zEsmyhqI3iE&eTfigO@?j1n53ni;?Iq#~L5id*u4WD|H5MushfsvxxEFx0;44x2;F)~pzQxW+?H#H-D-l(E$#mQ?;q6@$$l?7jFjVg4oEd}oP zRd*xQE608*0g3YY`>-Bh#1D$27pdSnkPd1+7TqZ3NH)w|qULBugMt_$C-YgvV%*PO z1e32lqq{GWWj^pK7j2tQh0>@&AN|y(;r1$`UZZ7)$R;Oz(>?L2<`jXqokC&uE_i&tBQx2)4eX)SLpA1l4;oSj zk7Fa_s^T89agymYacB!7N#V`Z*15+e^X4Jh;SpO@Cb#5`&9Ya4yW8Rsyo7{A>}Glu z4bN+LVADE@F068e;CRPzGwvU?9R}_;72>~@hZ-(*&8oQO)NEi{Zd**$c_x&zs=C@! z86HsiiPjn1?0hZF6tfndCMRbar-zjZCYah**w zT=FKrC&1p*WkN`*PPXC7rKn6rEe=e8Ja zj??{>nef&C(7F2UPTdAaFqDG!kZ+Yml;3PitzI~9Y(|lNr+|?A2uXtK9qp&k4cyt=+9QsHN3JQfwFBqCJ?CU(~eeavQcLR!Bqw zs)Q6iW&St-S2KggFF>&XuYkxxmxMo5V*AHaBPagpPyat%WLYVo-K1WIAiL}q1TZN9 zmX`Fh(e%L4y3v%@eZWPv__n zRwU_^RX9QMGWIC+^sygBTWC{)tn__Ry74O#EEziSi2N>@uu+4za)nvE49SZli4Ib% z&nbtc6#KQd4_;e9n-rvGDkpge;oP1;Ab$m;eX-vXeY(i`T|hJJuPTo+`LnZ#G79VK z8ks#DCeMi|5_(bLdQSdQ>#yQm$+6~LT$C}jT)476zoEuPUG0T#sHL5CcoMpyw&XlN z2ZvrysXE=G)@e9;oU`hVAMZ>|cs}Q}UKl9H?Tyk%K)rtdL@a)C_E>bm)jMk3cw-eD zZlIjzQQO=xe3xa>7neIVC03Lv58HVl{y76_iOMGJnun=??L}}DK5w(IEAPa%3iaO^oJ#@#=1ykHu1?NwEGEv*e;t#u+y|JyDWJXI M&+l(Q$o?Ju57j<|jsO4v diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index f968494e..f54f15c9 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -29,4 +29,7 @@ item-0 at level 0: unspecified: group _root_ item-28 at level 5: text: Nested item-29 at level 5: text: italic item-30 at level 5: text: bold - item-31 at level 1: text: \ No newline at end of file + item-31 at level 1: text: + item-32 at level 1: text: The second page of the document with same header and footer + item-33 at level 1: text: + item-34 at level 1: text: The third page of the document with different header and footer \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index c0a63738..9e1ba463 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -1,10 +1,10 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "unit_test_formatting", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "binary_hash": 16380079676357958448, + "binary_hash": 4350524979083842953, "filename": "unit_test_formatting.docx" }, "furniture": { @@ -43,6 +43,27 @@ }, { "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/groups/6" + }, + { + "$ref": "#/groups/7" + }, + { + "$ref": "#/groups/9" } ], "content_layer": "body", @@ -164,6 +185,94 @@ "content_layer": "body", "name": "group", "label": "inline" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/29" + } + ], + "content_layer": "furniture", + "name": "page header", + "label": "section" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/30" + } + ], + "content_layer": "furniture", + "name": "page footer", + "label": "section" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/texts/34" + } + ], + "content_layer": "furniture", + "name": "page header", + "label": "section" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/groups/7" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + } + ], + "content_layer": "furniture", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/texts/37" + } + ], + "content_layer": "furniture", + "name": "page footer", + "label": "section" } ], "texts": [ @@ -653,9 +762,245 @@ "prov": [], "orig": "", "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The second page of the document with same header and footer", + "text": "The second page of the document with same header and footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "The third page of the document with different header and footer", + "text": "The third page of the document with different header and footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "This is a header", + "text": "This is a header", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "This is a footer", + "text": "This is a footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "Another", + "text": "Another", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "header", + "text": "header", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "in bold", + "text": "in bold", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "With 2 paragraphs", + "text": "With 2 paragraphs", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "Another footer", + "text": "Another footer", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "With", + "text": "With", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [], + "orig": "3 paragraphs and a picture", + "text": "3 paragraphs and a picture", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "furniture", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] } ], - "pictures": [], "tables": [], "key_value_items": [], "form_items": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md index 918e89e2..59b2429d 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md @@ -14,4 +14,8 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli - **Bold bullet 2** - Underline bullet 3 - Some *italic* **bold** underline - - Nested *italic* **bold** \ No newline at end of file + - Nested *italic* **bold** + +The second page of the document with same header and footer + +The third page of the document with different header and footer \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 385884a5..5af52842 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -3,6 +3,7 @@ import os from pathlib import Path import pytest +from docling_core.types.doc import GroupItem from docling.backend.docx.drawingml.utils import get_libreoffice_cmd from docling.backend.msword_backend import MsWordDocumentBackend @@ -208,3 +209,31 @@ def test_is_rich_table_cell(docx_paths): f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} " f"with text: {cell.text}" ) + + +def test_add_header_footer(documents): + """Test the funciton _add_header_footer.""" + + name = "unit_test_formatting.docx" + doc = next(item[1] for item in documents if item[0].name == name) + + headers: list[GroupItem] = [] + footers: list[GroupItem] = [] + for group in doc.groups: + if not isinstance(group, GroupItem): + continue + if group.name == "page header": + headers.append(group) + elif group.name == "page footer": + footers.append(group) + + assert len(headers) == 2, "Expected 2 different headers" + assert len(footers) == 2, "Expected 2 different footers" + + assert len(headers[0].children) == 1, "First page header should have 1 paragraph" + assert len(headers[1].children) == 2, "Second page header should have 2 paragraphs" + + assert len(footers[0].children) == 1, "First page footer should have 1 paragraph" + assert len(footers[1].children) == 4, ( + "Second page footer should have 3 paragraphs and 1 picture" + )