From ef623ffceefe40aa237e163b564310ed81296bcf Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Thu, 6 Nov 2025 05:25:53 +0100 Subject: [PATCH] fix(docx): slow table parsing (#2553) * chore(docx): remove unnecessary import Signed-off-by: Cesar Berrospi Ramis * fix(docx): simplify parsing of simple tables Simplify the parsing of tables with just text (no rich cells). Move nested function group_cell_elements out of _handle_tables for readability. Signed-off-by: Cesar Berrospi Ramis * chore(docx): reuse method for finding inline pictures Signed-off-by: Cesar Berrospi Ramis * chore(docx): format strikethrough text Signed-off-by: Cesar Berrospi Ramis * tests(docx): use fixtures to avoid converting same file multiple times Signed-off-by: Cesar Berrospi Ramis * fix(docx): remove unnecessary argument docx_obj in functions Signed-off-by: Cesar Berrospi Ramis * tests(docx): add test for rich table cells Signed-off-by: Cesar Berrospi Ramis * chore(docx): small improvements in backend and its unit tests Signed-off-by: Cesar Berrospi Ramis * chore(docx): parse superscript and subscript formatted text Signed-off-by: Cesar Berrospi Ramis --------- Signed-off-by: Cesar Berrospi Ramis --- docling/backend/msword_backend.py | 262 +- tests/data/docx/docx_rich_cells.docx | Bin 0 -> 24320 bytes .../docling_v2/docx_rich_cells.docx.itxt | 107 + .../docling_v2/docx_rich_cells.docx.json | 2928 +++++++++++++++++ .../docling_v2/docx_rich_cells.docx.md | 25 + tests/test_backend_msword.py | 262 +- 6 files changed, 3366 insertions(+), 218 deletions(-) create mode 100644 tests/data/docx/docx_rich_cells.docx create mode 100644 tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json create mode 100644 tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index a782f4b1..56c025fb 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -3,7 +3,7 @@ import re from copy import deepcopy from io import BytesIO from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Final, Optional, Union from docling_core.types.doc import ( DocItemLabel, @@ -17,9 +17,9 @@ from docling_core.types.doc import ( RichTableCell, TableCell, TableData, - TextItem, + TableItem, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, Script from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -36,7 +36,6 @@ from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.docx.drawingml.utils import ( get_docx_to_pdf_converter, - get_libreoffice_cmd, get_pil_from_dml_docx, ) from docling.backend.docx.latex.omml import oMath2Latex @@ -47,6 +46,18 @@ _log = logging.getLogger(__name__) class MsWordDocumentBackend(DeclarativeDocumentBackend): + _BLIP_NAMESPACES: Final = { + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", + "v": "urn:schemas-microsoft-com:vml", + "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", + "w10": "urn:schemas-microsoft-com:office:word", + "a14": "http://schemas.microsoft.com/office/drawing/2010/main", + } + @override def __init__( self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] @@ -58,6 +69,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.xml_namespaces = { "w": "http://schemas.microsoft.com/office/word/2003/wordml" } + self.blip_xpath_expr = etree.XPath( + ".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) # self.initialise(path_or_stream) # Word file: self.path_or_stream: Union[BytesIO, Path] = path_or_stream @@ -133,8 +147,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None - doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) - # doc, _ = doc_info + doc, _ = self._walk_linear(self.docx_obj.element.body, doc) + return doc else: raise RuntimeError( @@ -192,7 +206,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _walk_linear( self, body: BaseOxmlElement, - docx_obj: DocxDocument, doc: DoclingDocument, # parent: ) -> tuple[DoclingDocument, list[RefItem]]: @@ -200,20 +213,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for element in body: tag_name = etree.QName(element).localname # Check for Inline Images (blip elements) - namespaces = { - "a": "http://schemas.openxmlformats.org/drawingml/2006/main", - "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", - "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", - "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", - "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", - "v": "urn:schemas-microsoft-com:vml", - "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", - "w10": "urn:schemas-microsoft-com:office:word", - "a14": "http://schemas.microsoft.com/office/drawing/2010/main", - } - xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces) - drawing_blip = xpath_expr(element) - drawingml_els = element.findall(".//w:drawing", namespaces=namespaces) + drawing_blip = self.blip_xpath_expr(element) + drawingml_els = element.findall( + ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) # Check for textbox content - check multiple textbox formats # Only process if the element hasn't been processed before @@ -221,7 +224,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if element_id not in self.processed_textbox_elements: # Modern Word textboxes txbx_xpath = etree.XPath( - ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces + ".//w:txbxContent|.//v:textbox//w:p", + namespaces=MsWordDocumentBackend._BLIP_NAMESPACES, ) textbox_elements = txbx_xpath(element) @@ -230,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Additional checks for textboxes in DrawingML and VML formats alt_txbx_xpath = etree.XPath( ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t", - namespaces=namespaces, + namespaces=MsWordDocumentBackend._BLIP_NAMESPACES, ) textbox_elements = alt_txbx_xpath(element) @@ -238,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if not textbox_elements: shape_text_xpath = etree.XPath( ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t", - namespaces=namespaces, + namespaces=MsWordDocumentBackend._BLIP_NAMESPACES, ) shape_text_elements = shape_text_xpath(element) if shape_text_elements: @@ -272,26 +276,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): _log.debug( f"Found textbox content with {len(textbox_elements)} elements" ) - tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc) + tbc = self._handle_textbox_content(textbox_elements, doc) added_elements.extend(tbc) # Check for Tables - if element.tag.endswith("tbl"): + if tag_name == "tbl": try: - t = self._handle_tables(element, docx_obj, doc) + t = self._handle_tables(element, doc) added_elements.extend(t) except Exception: _log.debug("could not parse a table, broken docx table") # Check for Image elif drawing_blip: - pics = self._handle_pictures(docx_obj, drawing_blip, doc) + pics = self._handle_pictures(drawing_blip, doc) added_elements.extend(pics) # Check for Text after the Image if ( - tag_name in ["p"] - and element.find(".//w:t", namespaces=namespaces) is not None + tag_name == "p" + and element.find( + ".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) + is not None ): - te1 = self._handle_text_elements(element, docx_obj, doc) + te1 = self._handle_text_elements(element, doc) added_elements.extend(te1) # Check for DrawingML elements elif drawingml_els: @@ -314,18 +321,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: self._handle_drawingml(doc=doc, drawingml_els=drawingml_els) # Check for the sdt containers, like table of contents - elif tag_name in ["sdt"]: - sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) + elif tag_name == "sdt": + sdt_content = element.find( + ".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) if sdt_content is not None: # Iterate paragraphs, runs, or text inside . - paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) + paragraphs = sdt_content.findall( + ".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) for p in paragraphs: - te = self._handle_text_elements(p, docx_obj, doc) + te = self._handle_text_elements(p, doc) added_elements.extend(te) # Check for Text - elif tag_name in ["p"]: + elif tag_name == "p": # "tcPr", "sectPr" - te = self._handle_text_elements(element, docx_obj, doc) + te = self._handle_text_elements(element, doc) added_elements.extend(te) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") @@ -384,16 +395,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for key in keys_to_reset: self.list_counters[key] = 0 - def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool: + def _is_numbered_list(self, numId: int, ilvl: int) -> bool: """Check if a list is numbered based on its numFmt value.""" try: # Access the numbering part of the document - if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"): + if not hasattr(self.docx_obj, "part") or not hasattr( + self.docx_obj.part, "package" + ): return False numbering_part = None # Find the numbering part - for part in docx_obj.part.package.parts: + for part in self.docx_obj.part.package.parts: if "numbering" in part.partname: numbering_part = part break @@ -523,15 +536,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _get_format_from_run(cls, run: Run) -> Optional[Formatting]: # The .bold and .italic properties are booleans, but .underline can be an enum # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean - has_bold = run.bold or False - has_italic = run.italic or False + is_bold = run.bold or False + is_italic = run.italic or False + is_strikethrough = run.font.strike or False # Convert any non-None underline value to True - has_underline = bool(run.underline is not None and run.underline) + is_underline = bool(run.underline is not None and run.underline) + is_sub = run.font.subscript or False + is_sup = run.font.superscript or False + script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE return Formatting( - bold=has_bold, - italic=has_italic, - underline=has_underline, + bold=is_bold, + italic=is_italic, + underline=is_underline, + strikethrough=is_strikethrough, + script=script, ) def _get_paragraph_elements(self, paragraph: Paragraph): @@ -724,7 +743,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _handle_textbox_content( self, textbox_elements: list, - docx_obj: DocxDocument, doc: DoclingDocument, ) -> list[RefItem]: elem_ref: list[RefItem] = [] @@ -766,7 +784,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Process all the paragraphs for p, position in all_paragraphs: # Create paragraph object to get text content - paragraph = Paragraph(p, docx_obj) + paragraph = Paragraph(p, self.docx_obj) text_content = paragraph.text # Create a unique identifier based on content and position @@ -782,7 +800,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Mark this paragraph as processed processed_paragraphs.add(paragraph_id) - elem_ref.extend(self._handle_text_elements(p, docx_obj, doc)) + elem_ref.extend(self._handle_text_elements(p, doc)) # Restore original parent self.parents[level] = original_parent @@ -854,11 +872,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _handle_text_elements( self, element: BaseOxmlElement, - docx_obj: DocxDocument, doc: DoclingDocument, ) -> list[RefItem]: elem_ref: list[RefItem] = [] - paragraph = Paragraph(element, docx_obj) + paragraph = Paragraph(element, self.docx_obj) paragraph_elements = self._get_paragraph_elements(paragraph) text, equations = self._handle_equations_in_text( element=element, text=paragraph.text @@ -884,7 +901,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): and p_style_id not in ["Title", "Heading"] ): # Check if this is actually a numbered list by examining the numFmt - is_numbered = self._is_numbered_list(docx_obj, numid, ilevel) + is_numbered = self._is_numbered_list(numid, ilevel) li = self._add_list_item( doc=doc, @@ -1239,14 +1256,35 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return elem_ref + @staticmethod + def _group_cell_elements( + group_name: str, + doc: DoclingDocument, + provs_in_cell: list[RefItem], + docling_table: TableItem, + ) -> RefItem: + group_element = doc.add_group( + label=GroupLabel.UNSPECIFIED, + name=group_name, + parent=docling_table, + ) + for prov in provs_in_cell: + group_element.children.append(prov) + pr_item = prov.resolve(doc) + item_parent = pr_item.parent.resolve(doc) + if pr_item.get_ref() in item_parent.children: + item_parent.children.remove(pr_item.get_ref()) + pr_item.parent = group_element.get_ref() + ref_for_rich_cell = group_element.get_ref() + return ref_for_rich_cell + def _handle_tables( self, element: BaseOxmlElement, - docx_obj: DocxDocument, doc: DoclingDocument, ) -> list[RefItem]: elem_ref: list[RefItem] = [] - table: Table = Table(element, docx_obj) + table: Table = Table(element, self.docx_obj) num_rows = len(table.rows) num_cols = len(table.columns) _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns") @@ -1255,7 +1293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): cell_element = table.rows[0].cells[0] # In case we have a table of only 1 cell, we consider it furniture # And proceed processing the content of the cell as though it's in the document body - self._walk_linear(cell_element._element, docx_obj, doc) + self._walk_linear(cell_element._element, doc) return elem_ref data = TableData(num_rows=num_rows, num_cols=num_cols) @@ -1300,52 +1338,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text = text.replace("", "$").replace("", "$") provs_in_cell: list[RefItem] = [] - _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc) - ref_for_rich_cell = provs_in_cell[0] - rich_table_cell = False + rich_table_cell: bool = self._is_rich_table_cell(cell) - def group_cell_elements( - group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem] - ) -> RefItem: - group_element = doc.add_group( - label=GroupLabel.UNSPECIFIED, - name=group_name, - parent=docling_table, - ) - for prov in provs_in_cell: - group_element.children.append(prov) - pr_item = prov.resolve(doc) - item_parent = pr_item.parent.resolve(doc) - if pr_item.get_ref() in item_parent.children: - item_parent.children.remove(pr_item.get_ref()) - pr_item.parent = group_element.get_ref() - ref_for_rich_cell = group_element.get_ref() - return ref_for_rich_cell + if rich_table_cell: + _, provs_in_cell = self._walk_linear(cell._element, doc) + _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}") - if len(provs_in_cell) > 1: + if len(provs_in_cell) > 0: # Cell has multiple elements, we need to group them rich_table_cell = True group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}" - ref_for_rich_cell = group_cell_elements( - group_name, doc, provs_in_cell + ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements( + group_name, doc, provs_in_cell, docling_table ) - elif len(provs_in_cell) == 1: - item_ref = provs_in_cell[0] - pr_item = item_ref.resolve(doc) - if isinstance(pr_item, TextItem): - # Cell has only one element and it's just a text - rich_table_cell = False - doc.delete_items(node_items=[pr_item]) - else: - rich_table_cell = True - group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}" - ref_for_rich_cell = group_cell_elements( - group_name, doc, provs_in_cell - ) - else: - rich_table_cell = False - if rich_table_cell: rich_cell = RichTableCell( text=text, @@ -1377,17 +1383,79 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): col_idx += cell.grid_span return elem_ref + def _is_rich_table_cell(self, cell: _Cell) -> bool: + """Determine whether a docx cell should be parsed as a Docling RichTableCell. + + A docx cell can hold rich content and be parsed with a Docling RichTableCell. + However, this requires walking through the lxml elements and creating + node items. If the cell holds only plain text, a TableCell, the parsing + is simpler and using a TableCell is prefered. + + Plain text means: + - The cell has only one paragraph + - The paragraph consists solely of runs with no run properties + (no need of Docling formatting). + - No other block-level elements are present inside the cell element. + + Args: + cell: A docx cell + + Returns: + Whether the docx cell should be parsed as RichTableCell + """ + tc = cell._tc + + # must contain only one paragraph + paragraphs = list( + tc.iterchildren( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p" + ) + ) + if len(paragraphs) > 1: + return True + + # no other content + allowed_tags = {"p", "tcPr"} # paragraph or table-cell properties + for child in tc: + tag = child.tag.split("}")[-1] + if tag not in allowed_tags: + return True + for elem in tc: + if self.blip_xpath_expr(elem): + return True + if elem.findall( + ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ): + return True + + # paragraph must contain runs with no run-properties + for para in paragraphs: + runs = list( + para.iterchildren( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r" + ) + ) + for rn in runs: + item: Run = Run(rn, self.docx_obj) + if item is not None: + fm = MsWordDocumentBackend._get_format_from_run(item) + if fm != Formatting(): + return True + + # All checks passed: plain text only + return False + def _handle_pictures( - self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument + self, drawing_blip: Any, doc: DoclingDocument ) -> list[RefItem]: def get_docx_image(drawing_blip: Any) -> Optional[bytes]: image_data: Optional[bytes] = None rId = drawing_blip[0].get( "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" ) - if rId in docx_obj.part.rels: + if rId in self.docx_obj.part.rels: # Access the image part using the relationship ID - image_part = docx_obj.part.rels[rId].target_part + image_part = self.docx_obj.part.rels[rId].target_part image_data = image_part.blob # Get the binary image data return image_data diff --git a/tests/data/docx/docx_rich_cells.docx b/tests/data/docx/docx_rich_cells.docx new file mode 100644 index 0000000000000000000000000000000000000000..a70febb17cec9d3abe5d7764a7bafca9d0977f40 GIT binary patch literal 24320 zcmeFY1Cwn*vn|@%-Mekuwr$(CZQHiZ-L`Gpwr%tEckX);=f-*e;NFa=nrqd{87r!0 zW#-6HIprjQL687I0saC20N?|h;Dc$~0RjN5`~d(!1o#W2DQIisWNhQ4tK@EH?5IuS zW^IL600Kmo2LSY||9|EG;1y_0ny?t4gA;xU{tldLUX^Gk`!_gN5NnKS_6``q_Hc1n) zZ;*=wmI14if9ROP-jFy|RCp7uV`4koK9r z=*0X&;;e^bmWcCilJ+e|iZXOW7x?m<&Vc4eryD?WaOXdIw`IG1sLbPMldo|?aq{uN zNOk0h84S?)*RZsyUc@5`q35F#NOJ=6nMFgQhQb_s<&~E-AI7|N`lV$HF-=dfa&F=7 z5bXA8bRP&4-nvvq>bThc?RxD&rSrsAhG+}&Cn^jpI$Gkac!Np^jN8t+bC1HFlA-Wm zD)4;paM}+q6Dd!V|#kHBMuG#;M@)X0RDGT zTx}hUXpL+QovnYP*?+>=rM6~lAsdR1uK5q}_yA93GpQbM$fZ+s>vQ8GcMwg8*8*J*O=BMhry^don+yGEe$1jxnFhDUZ8G&$s=dZ8tUDfSE86d%e@k@W zX$Husa{P)*Rg606zCA!igM?(L|zlerLAx7<!b;?Mv7H@P z9kO7I)?yZM)L5bHj80t`S!$8v5v98|yR%!Sa8D(DMhv1bjzD zO2=6>za1Sf`@T(d7Lrxr^lu+rgZAq+uJW;T`=sv~N*3$Aia-b1Hyq^g3{DiaI&)77hi|f0<0Jt6n8hQj|Z&UwvRm8s#0c z54M2JRyiNK9xMf(H)(1>JW_o5umg%~IBN%#XU_zw5|UN2xdo5LZ$D$S9cSD6{U5+8 zB;hkFxR5f1wuZ>rCt^-%nNLs&0c}-AG;nLR?Ki{_91vR5KKY+?w+@0bTVJt+8f9U5 zk;^*gw4k4(mzD(=QeNtWUFkp@QyrQTHLVVh5|yNQuy#?LK(3 zq|R{)c?nRkNJ#|csRCxNYRUo$OiJuzB!V-cVQXySPu+hmE$CIY_y&?H?4LewsSoFl z{0uY}>eV=15yanm8$3vYN>lu<3r36n(HPx8)hphzm zoCFum+*_{C!Y&=)j)30}L1dGWLwSsr4xTWj^OqCunSlr~!zAEM*16_|%y zq|ZMy=;xG__YElKegxON!9R&~qTzDPPW^C_V&w7G66!9a{vi2M?P2#aX3}XknRf>1 zk`lxt3l=kH$Vjj$LGkVtB7c(q>T99 zSj6y(Sxz%-ET(uL>_}T*BMdt(f zQ=tip9-C{(7Ak^2X=y!8yuu_cY~h(UkkwNkDj(~amYJIl6+{Fh@|Q0K+2E{|5a6=4 z+Cqa0{JsO3?=CJXoF3GkdF*Tu&A<}iyYfrRn-2h6_ti6goCGoiDKOMu(%onZPc5Zg z96kuEAbn9bHk$6FHGJR!6tv;NE;C@1TydnKfE(8xNSG$E6k89ok&Jk$H^ke?;-RGw zFmT1TVFhm-eJqmd6T``k5>o#*aZ8&v3u{m8c7IySGRO}Gu&4I6U^aK2BNq-ZkDVlq zB!`VJ4EpilFTqtevvS4dp9x9PJ#DQu?QY3Hs+R^j>h?HD?nHBn4_w`llOsmQcC_l{#5Sn1AU#wOrIhX_&P_sP7+F;g#7v zTg!_rvx`F=t|~Iodg&HwIp{3nVq>OQUd?vQt3erj*Tp|es|(0Wq7I8v(ME=Ku(&@@ z4G__%TTFl}Ldwyvf3NUM%ulw{pmRpb%=VD-oeSF{M3h(Eu^RCW3!s-?NH^e2cL@eg*iz2O zwybJG%;s$_38Z#%y~!7VoE8#mCG}-UXY-!+$H_A43-!w~Sw(5)!x-qvvKi`3P&KhV z&14_rBD4!q))}SF4s94@3MeQDWdg4m70FB3xiVVQdiNT&hJ|p>>~x1&{Y>q8Nxy#H zLKMjYhjl`jieQ~`_uSPEo{ai3aR)IHD(jYp%!i1N^#3v>dGnJrODUr#dz%Zu*`WJ$ z+KlNUy!;e{+oRPIS0JZic0gSRwz@;NWC|3wWUeS*SUFbIUwZsqO;R zYzj%1bQHi3b=j>Kn9t+Z==ONeF5OUDf zbWWhYeWF#ez9iPdz$MFm+QOzdW8+uGo-Hjwdp&M6k!+0_E56<-vS`)X5{~ld`DFVP zhr4FVhA)@1S~c2O-i7gnpz+UwB_67yLDslaMLlMpN$39e-`~_K9XJga!y~_ho-|@_ z$Hb&y1uxn`5C>QuuU!D<7OBFeCYe<{+=SdoyWK;imQ*@63hC>Z)KxYzQA>yfcvWwF znu~^i7B%Uj2PTxKL^BC$jJ>3nu_$QBVM#4R?tKWnUk2bg+ivxpRwqBS6PY)E=ss#H zYt0i?N4;F+l&zwyM6aocBr+%oy8!z%IJo<1ySz+91~K5{*ZG0P>Y$XJ(kfx-eIN{< zTmEWE+v0XmqJPRS(3MNq?nPuXMeVnvl`V&5VG&&MIDvO&P+oM z=^BJb>_vU_3ow4`2u>LbK;u)`Gj$!*2}{xwSK>l5S8Y8g@5xVKWRss+_wts;e@%Uu z%9*Zza)_fTE5?a$PLIF~O*zMnu@q~`ZLYN4>5x+>>y4PFcXx+m6RA)xPMqi(NiSgy zTukx@(t?~Toxp{Ut0Y2r_EWc+gO5R}^fymCLc zcbM1^+hX&b*!YVB+d0R2g5E!^0>hGiWy>A``M4?XcbF1G00lPbD#Yuz%DzF z^J%)@PM2;Hh?=kBs$1_e48lTfkZfiCQj&M}pJ|tOT!>FypgjgXRfM^Lwmk=(o!ydA zlx4N|U0>(zld}JJ1&~;Mn2{5&HXYZ~;S>6vWF=QGS8syzrndVm^qR6WNe<5i_a8I5 zp{6>d7s^gRKk#@d)iXUvs*4c+s^`W#sBhYaE!?sg3-j^PXxvmBU2$L}T6(>$Abyg1 z>LDD<-Cu-^dvJO-m~#T@3@{c_zEMiZrLIZ^P%nCd91R0kfEP)q^KOb=RT(V`7L&Wh zpp(+4ji&ndpATCUAn>NZ0NDQSM{X5t&+2r^%kmX%F1Ek-3ybdL$#$2joBH<__Uqx6 z16b^bxE6^08{p=a*8J&t4v_`#p{?;1?KaET$CqXr@7G05zwGphYs|;pgY59tM{c$z z=pPFfj!W!QNgO$F6z1LX+!H)J86IQtdK*CCZ+Kfa>3sHx@gU_i)Rsy4dBz5w54l|V zxtUc~7gU^&aY{^P&H1wxptoo}-5MGcewgm3CxE#f?00WnNe{uBN&Cb^8&jQ~2HWO#Xi&{_xw z+;bnVFu;*i!9nm>-kqMp8;;QKOnzz#1lzuCpJQTNgdfZ49=przJNd_Z#Y20bK%|{N zA~^yx=cSIFUnkA$c*cUM!P?6|8jt1*SZV;wY??*Pns(1t7I)lG2v>|<0P?qWNF9Qu zxLEz=2I9koTFHdVsjuB+6A7WvyXBo2VJyflYX;Wn!DW~t@Yeo0kZT01G{AbuNV@s~ zpk3c_sGpTcPrh*`ez?NNn3C{Sto349tRXastU^vLV6?v z;~}TO@)B1-49va9URX)S_U7G=4nKr9C`O#rKFz!&EBZOz1Q=1yBE~V@4upBW%tqeN zox^KjktR_t3n8^Y92Y#Trh2VjPfakCGg&~rHdqedJP-LGvh|&QNdt|Y-R|GaW%|guyFP-Npt|kQ=V)$sQS}(Peoj3e+ zoM!~+5nTleKdcyl+*&SK7C#L_8e0`pbDXAr;oj7|l;TeJr~M80mz?;Y^hMDwMJvWH zvB3cX0DuMX7x2I6i~q&({dW@M|Ks}re+l_t?fgYRXU^R?+K)=yKcC$CpA_M4 zms^`V$gy~agv)ZCA0T(-!$rjsrAtCT@lN@6!>&Q6UrMR({$?t*#Tnv8AvkH1vxWki z_JF6l&nV~{fpqGoG&dgagHfi0--c34Z$$pckK0DO3Ng=yL&@B1@Zalrvd&2`*)t3x zKv80FonBf(u0f$ZJJgZSFE_Xb_7LGk46{=^jeeh;$Qgs-%4Ku6 zg|2Piu%9phn{|^&fBpR2JY=0rZhFqJ_qh7butRVeFknQ-o;`U;o0^s!$&O4F~nySs}*LO}31w`s*%to>yD z-ZilDi(1>8o14qq+cQ{fx21@Qi3dmb?BrZsS)WKVr%cFcXn1~wo`3X6(5 z+EsF=?+ZHFlUTd4gkha@i#Eq9nC~g^Du@T1WPVfA(8SE>vt)8dWpOxD*xv5;;m0C< zHkvPxG&VIcKd;kSZ!|}Hd3k~Qp(7L*r24>#S^LV6a4@Hw+9Do3j~xh=%#V4o#Y5;a z&JUjin7g%l^s4FV>Q0Ak*_=o@xwteN8yQWy5$&c@Yo-{sY*bn8-`_h(h>N4wz9tD_ zXMga{37O{Ny;CKgn!wCeorFs?gmu5l>l)7R8d#VKKZ|vI%3BLo= zfbAmwWl{!~RtA92TL!X*=9}Q=QvlTQ;41|73WS44kE*Q0_|z-HP^Gvo%-`~FZ|AsN zYXBP?8w-n#MJzV2e_yft{dcD~u3kPyHn=g3DYsG@;n~L>MWY2Og zjZF(I4zl(IK!$U64Py2BMS5NZc9RuYh7=Ei&k&4oZ4u`PEi2}J(-X>|HP*ION`=}AdRRt)Lw@3QzjY$uwwT~8Rm zx|*7irn%nBz&>uR7;l&aIM{9?$1J-#=yKkebS*&o_N&d4VdUW&-|KsJdcO;4W@e_h z9`?O;b%kmChKNi3Ff(VPP@fWVzC92aB2KPp#Sfnc&ascpL@BRd}@U6^~U6 zbV<|8@0ZA&x~WCPdcM;Q3_Q7ZLE#r;GDsw3a25qHQ_TZOF zjg77HD`RM4va%z;6C2L0K;k>gA64bs3|Gu@({8^0 z2nek1#kmr8N6Z32<}@xX$ZU;WlFze}-1a6vWJ;Avy8}dHe(=%h>1q8Q*Q@m*T{Go+ zrFxMIPylt;QuH3K!oH|V<-bfqdHDn3J3m#}h(2$!>EYkI*3mGE3!myIC`bF~3+^o&)j>lXT3T5y6ZW`=Z8G9e%JRVgg_{eL$>S5F5b92i@@RhcN@q55?Get=1brYqN|a7@8y zeUC?sk4t=eOG-jcnlzYp4uEI1n>Bs5S!3Q)NM?rA2?VlnwM%9Xy^b zghfRONl8IbrF2(0+S$=*w9$@5eUKlWaC6fT!Kgg!oJ?msYEGfS0t<%1V2{6Fth5;# zn&pA`iwa@jO_}k+QwgwV%Xodiopj#F%6~`H%=t|mY{r`ZG4r5qX>Z5K&dvrNZfH9Q zd_T5ZKNP}4pj*UqR8&w1_)cR7j*pLrudFF8-K8b;7J4o78?Npi(q=(r0{Tfo)qq8i z{FqoU4(-`66P@-)iE=B%3AbG~9LrZ(Y+BEELO_X`<1bU8#h5JLV(+&-Y@(Ja%n*H!SEkBH{Dkn zWvd69`Ef#A-`Hpys}pb3pzfb!t>e%he2e%{dt+tX_Mry*D4qdjr((7G5waxrP;P-2mv07AvS5G^QsM(>-pp6I7W_H#;!`Mq5n=pYAGSqvIA z$Bb{;3c8YsJ^H;GCEFYuf}aFf2B3CT6gj|KL3y|MYhIqyxbZ#^pGU{uN^)#U9Jq21 zymXAi!SS(WphOeOxXp$$rrK;>PH;9Xt9OZnBrw}HEsoKlAt9jRFe>y<2oqpYfT@9T zaCii2guAhWddW|m^r-W!+T48Zk{{VozFd=`AU&~ z3Vu}&FfLdY1J}^`X%80IFf&gB0U*qHOpGgF1?YO>EZN)6SVC+b3jvRI$V90ASg^Uo zb=sMO`{hd(SF>hJ-5y;zoGPxdkOhi^S!UojTg?YIFPCVxa!^#zAb>|#GFWfpF%{7; ziyGi>k)L#8PWg3xp!r)crADAg5P(lFR%qv>dOM)N$6ZT7`5J(Pf<%+J)%;1h1vP`G z)?n3|@*8LU>vtA8Lmk%>P;Tz-_Gb;)p<_l2kzFoB(Y!=LJW!`+(qA(o60?%Maw3>Uh60 zDK#|}J$e}5-rk-IVJTgE_8Eob0x@}2&0<6{cU$zDt!acuIF)L*oCQ_M1@pRvgRXTS zgX!Bp4hT2!@Bo&-3NUiSsHJyp>#wIMFx6Fdsa5SkqzGwhZH~Gr?}EOAj??BO{UJ|T zU_ZHP2emc>uwV$3KPKa*Y2m+LVKkX^^Ee!h?~J0caoLMD>c7J2Z`1=s7J>hHkp!E8 z-dIqQ3j=Vxh-B=leF}lLWr7Ll){MPwHaYdMfG?^JjwRwlI6BV&Z(WcvtAnhy0N%qy z>k~9*>h2A0;|S}5s+I5-_x}?*VfAXBfI;pLVFTXB0l$K2tcuNSk;I9c1w&#eyn8oVN|pX{L1BMs%qNuh*C-^M3>_&pD+{Z| zFu$WClQrdCJ5+kYvwvUU5sKN4KRXoQ^I zO=bE60*J>pfw@k6b>isg=4J+UJDbWtz9>x7Bw&cVq9t!Fcqgf2 z_6%V74}@6*;I;&w!9CV5h#PJIHxN$%G+~|zMSZXZlXuSABs zB+hGG3g%HVpxsj;L2MrYraEQ5Irb(|z5czknDL>G3Gh`Y&Gl1cMGMmp(R3Ivi-mg2x zA_5-PP*z4ZHTXrBUW(Tj7m=K<*S9x=%?L_8xUhk_L5jpCRv)#01STta=`AQ&`G{U?9fyfA7!QX7q>W$|> zrO`xg?PP|??7jiZ9|>PF*Xb-ba-Q4n@^sH1Mst5VO1+!rys<08<#d6n>3*k>O*)xD z-rbD_!xs{d)1u*GUdKGqVK&&2)L+%P?hRFjSwqL7aSOETUXG!pcKEB^=|+F^a=rB- zDtmpNizXo<5uu= z7Xh!tBva%LDtBzw)v52!84!^QGEZsPc^Yajfx+oQrQPL`41>!fpk?W@zq`X9kxb6T;oTCLJuH~u2`HEHrn^taq zw?Fi$?&ZbRL+hyM?cH@^aQKuDBws9)^pC_tM>3k-L*ZFMCtvP%16Sl?x$5JL73fCR z_v=FUrOv;@<%)nm&Q@*zQ>ffaMsm50LgMaE*KI$f*ZuLa$$F13FST}iQhj~>>1v%J zPVL*<+h!43`jp9fy)op@C@C@V7~Y@RXMMa3a)l+P`28pEP>^3kJUo!e;c|cEUw9Ky zC{vm*Q!I(82Tv2)%D9c^L;UQ+UNNZSI;>svpC<9UAgW->l3$hFS!H{oX^o;S}2di3>+v#V{zv(!&MkCn; zFVI#li_K$Eua6hp((>~1ADZ1BWZ<2YU*5%Q zeqmt&0TD4VN~Ca3j}Glu%Sj4g>sO|xrlf!A5oOlj`O0f)d9{_5m3_%qe*F9vO8)!= zneYBB0T?R%C9wd+g=K`Q1@r>`_rl6qWtRjhuwT}T;J>kBPG-i|#7+9R!0U&-H zAJAQYY}sPOL~QS~6v@G$Ao*4S#FfbU>wWgY%n255u9U+;bYz+z;~-nH_|9E*8w}rA z7APC|$Mn~NdDR5~jUED@0OF$Sau}!Wrq@>!=52eJT$Epu?5p#tD*^ue#S=&Fg#KvF z91^jE4Zk{6@MV214xJm#({RyshfN2LU=|ymf1Q%2ZC$S;h)lA?$+5^=(5P?tu7#e= zYPdo)6nHdrN{+{YB@Y2q7XQ5KAqOBguP|yqAczA#>3dAYPuBYs=955J7{pq_rqa)m zLmOZ1V-psnX>cCrU59kdYz?5$U$I2ZB zNq13CT}{sl&bu^l4ZnIrThEBOErSiZ8}xWa758WFQ)pL+|4Lrv+V%h|X>B)SV=mdgCkH(bt(8Y{hTKjzKbwtABq~!J;A|+x5E#vJ5zHUa34tTNSu_TaMvamu;2R}N6{r!^&U~iMyHnI%5#VL18U9kZ>#=#OVwfhg+ z9e|vgaKwo{LN7~${eXzVaA#i<3n%9Z!SYA4lv6bH}3 zWXMVsNdAGVdJKpl^ByoV8i`YH6$5)f*(~T8Pyi%n007_ahZ-}dIAp4&$X1l0VpD)+ zV>860vfL-2wK7@B-q4=vSY$LOFHYr_UF(J@hgr&Cw47;2I|H9Fo!%YI`cC3lk}%nz z7^W(FkQ)c{qhiqmiJ!1iDR<+*c?#<1E0K}J80Q-j{c)3lq3*|UBTK-5suns;F|5y@ zdP@R+;7b$UIjeDXoT7ye_G%gWDZ7Alc#M%E@2C1c{%I(Yw!2Y}wq(5yShr}pOoF&$ zNgCOvr5IfvWmmPQ2z@}=A?T3Wz+Xq1NfmkOr@NqxV@B0^QW@J(d1O^sM*zElTKf?= zZ^{s-in!YAxS=;FD~y>RObeA;E9Xm(0?b(y5;-<)UX1182uf=-&jwgzXhThi6kh6x zAx8)Qu#>O-C#sztG>VbKn5c#GQ!ausG!=pM*ud(iHYi)~J#p2bHxwCF4 z-3p@mL`*S*=iavcMR|{7x+_k|w{UTm10{+>3T1u-5^qx)2g#qz#RLYL;W$xrDOqFC zusS{{IIIah+7EvsVh3(_GaY4zZioH~5t*Lkt|$~pCRAj?jq0q4BsEiAxJnKuV)0y| zKK8VDWX_OzxvH3wPE0)-2vxAO+$;28qG&yze@PAQpet7aGUm7$ zXV=w}KHcf+rCE>tE7uLdi$dgy4ywsjd^LYSxH7eH0U09b?rk;x5W7gONLb zi7hsUpe!Qlg4bOZi(JBy`OVu1m?~((C%R!=3RSk?GW!)?p1W6y8$ucGN9wh-PyH=M5P>a=k&5!!DnQ|n@glI@W$vh0I;-1lgt@qzK87N2V)xkEFLNU)u^(7ICcx3 zfU&~Q`t!O2fr{W^tnP{0-%iy+4u?84j?Uw#7SHS;S*F!sD*Uz<+c$S?xkZ(IJ|`W~ z4W~C`x8uHI4V^*Q!(86BwOq7p#k8f-!fdt(TBS0O*~sO}^|+JxD1_u$HR-R2W>`P% z`;j;D;`0M%%TueBxzH-}R1)5fvfSf)`+2^@D^?+<@jYny4(zLh8QtYlbr!qPWK%+4 z8vXgaiaRQ`oddJ&*qaXOIdJF3Vg1(hEViFp_rXuWrtIKvYc5*VeF?IY;Xl+#8iOpn#^Wi=F4kepO zg@*GXK5q^HXsK#nQEXV^#7PLH=A2(g&#l2+Ji92gWgF@v21=fT?5#( zObEsG9u(9z?+Uc;+I56YN+yj=4I+is5P_Uzx02I70}yBAaLI+w_R{sg*+WD|PTNS) z$QHfU7TM(@yoLO9Ll5sYM&LPY?TlXNIlPLhCLH|(d7Wm~u1*vR8K^-z5`rz%a{aFeWtHlD zPpO(9u92M-7~~T2$Wrl5t=?Q72G5oFO6Nb~#@>N)!^0y_Jm@6X4Nlj$EK!TQcUCJ6A;7Olci5;}=<#Oo#KW{tjC2gc zD6icIXpV3oISniugDKY2W@(^xmXp)-52^MBLI}KL28amMS(Ehj!s$&+qSzbV$moSJ zjI1)z@-^9bbSm=^=A&y({hfdPsHU3HYF!!A8m-o2{j93rwICPA)OHCb?@I zM>FV7c#&@mbd6HXZwd}9OOo%*mluF(ZAZN_%IAV~i&D$Hql1n|J_qQR?j?pI`vjST zr3&@x_NF8;2iR|gJd;m>T|r=*ljfXCNKv%Nq95{cXvsg24wn!1L<&!<{PB0##Jcdp zO0RzJrR?AHu+(oyNHV{K4PLu0FB6R%Zp7-=@-t^bfQ^t7NrJO>T({)uX#-*3gD zvp#<+-sv~Ss2N~Oe=|>+)S8ht-Q#dHEz6}DutPWTzxd(AQuwlcz4HD4^^BVoOBL9^ zr|ZgMzenr88vk`HZ{uujVC?Yg8~-co$lS2mU_l$X0lUP80`8C^QZOwP*XMjr$)^lq z9;h}Cj|n~pk!Vib=$^#Uk6ltl6IZ|I=LvJ#YzMF<$RzlAf5k4%ojl$8NM$>xq8l4v zYP$%~?AquH&jr=L_geY-F!wre;dJ*;01!40xM{CpyXo%vc`I?9&|!!rO`DCuE{{L# zGCtmMHWiM4VxoRAWfOU`#(F23S}?c2D5*=4Dans*GL`^>KIc2hWk`5un<<-yBZf?J zRdx85vi6bW#sF9N2%@5G3rRR0W6WUmH+3WbQ{uSgJJNah*lfwe*_l!Nu668l#JS4$o%cs_wxq zJ3uR(gu0|QKkRyR8@iY~!baY@!W2MwKEgC+s;^Q5Q&)+>&0ZmdGJ;hbR#?8n08A}; zH9P+-7otzSI{c}8xnJ7&*-9|0DU>L_NVuL>>peA|kY;lG-mg%-W=CVz_NoL_i?8?5 zdNfrqZ>Nfp(Nk?xjMz+7AyEigj^M`t+SKE*UB%m!0go%Q4)gO9ic07zKVs2B83Xw|U%Y)l6Lv7Mrh58l4c971Kr$)|~_>4=lBKV|? zyE^rfxR4aEO1aOLM&QkT*efGHs`{{Q$J0wjA%FzFDJ;b{R7EdT)bHzG_$16Hq0inG zm9GkJG5iPIB&U`5<+dV(P+}+EC@s(Q!Sr&AsD?xmNts)$UPN3Fb4u@OE=n5IUy)pO z)Z&E0%lfWA1?GRSfzd*?ZY(aPxMCx2fpvCI&zJ|4bJs3{g8FbTwFak?y(8HoEt2cN zv%6WTW`rPJ2#P}IOLgLfA5E$=+evKVS~Qd3rLM>RhXSM-BkAELvyqHq{xntG{||19xJ?`=7MpEW?nq zte`X}p8cZ`R~L=1&NcZ-%i2BWi>t2Zr-_~dpTORfhYj9FR?poD*3RZFlH)vv@Mg$M zAQ%K1w$t{Vj{V7sO1AURit5(-?&_L-DM1Ht-;0Qbn#O%9Z3lASjtIx<`h6-e1RtLvb4{m-YI`Jab-;+i8p*v8d(~Nr$nJMDCM>sO`$y;Z_gQAGj>dfJ zQ5zv(63Di%N-=jWy75RKoC4DKLBRzYn6ttOTPhz+|4Pme_qsW2I8b9+;h~5KWMo{2 zxPtw!7QFOyOi08DK0!d)}HGb=!?ugRy)7+q0jb z16_qRY;0eZ!SV%cDX`}PoS?Qxc1sXP|0gNMvl}pM@Dr7H8zm!eDVYKi=ve%yicx6v z6BXsnNU--H1wiJ%VIU#>jwvZ~W;UJP56=?94)E;i5pt0T-uoqyyYei9*j2GY?;6!EJ=`m0mP7UvWI)te$<4{9e;%-lHgudW`7 z4voG2K;7mlf&(rBP(vle0AL6p0uTw93{(yZZwA2X3eE&T(G?7ZEftV!A-J%w78%$p zcL!{VH{|LiavkX4t*E(o5oMLNoUQ2(iK@K?q=_2>SE3G+{?McAO7YF@H}ITUcVSuwItDfKeC+! z8#D;(85X>tQ06JZj8D>QZD~PQ=}-vHP-gZ6+}Xn!n2rpl?_zL&`@#HWcuDYvVUKG zblhM;@!3iLg2m#!;wfsTQg>l+=4?i+VU$*2by+tJ3!+*^$|o*ko37E_6bs-083OK) z$h2Wpj7)t1V*^f$PBq6nSnCz!&?Y0YGmFW zrs^L4JPUT^Z#+j~h(G1%GdCf$M=hSELRSb5pfRjJ?7%|nvx{12NtECEcFOmyLBiY% z53>sgASL+zC)B=x{<-a6IHD+9mU%FuC}@nPF_fQKbO0Mx>JyxAP@8|UsBmL%UozGe zq^3gLpD|SFLH7W`EVM9+UIdiw%rvBv#zKf&dvZ3dkeco7e^i7GL4fMVb>%#T4RSGp3JO9RsW69J`{_4NLt&Xzuc6Q~A(hz03w$PPI`!tQ&9#ykX zR#0Yd&>lwkJ`zxz>wvNJ;oFhqU+EfTm3_Z+a8MRiKoq;}1%4)RsO`f<9Iyb}j|XpB zbI@*Q{=I`>Rb+12pesj!k*n7Tj$aWQz=nj?iuFSun zjqKD}5m40VDjyf*J|3{ow0{-Gg^L0#*RrR!7&crN1|4lUhk!9cfXoyl3~q)cnOA5} zWgj(M7eo$JqL)t%5v_--S)>WR4*($OVuH?Xx0)VgOQ`)bp=@z8;J{p@rd(!uT7 z@jJBarnR&LvF(ulZ%8kB3Y}6xWtPv$1FSGU$#4w54VP=~bZ&A&*uzggU7B?;a87`r zrA0Ihsm#+;C(Pfl_qdO%8&NQ)54q&bG@}+7S`5jaE7lMFD3BpiRLg*)&@p3&Hy>$4M%Ae6#OaZsZTacymHSF7@X0|3{O8K7K&){*5AJ`ofg%cNj7KREC;K2j_XZ&sD!rX zcfe=CEa$U$F685ig$5yge@bVh^k;KRxPOp4sqd*Z{6>MDk`qmQCLMHzh|m>Xof?R~ zVhhGnhp{=!5rrGHg=f7HVobG1CnPLwB4**qE|g`~E4-rrfv5&zCb5^Ia}U9Dremxi z(B;UK%dsH`xvC&ptV_sKFrn#u7Dg45JVaED+HZnwqH8P(21k|YV4(h^Pb`&6d=)tQ6Mq{CU9rwY;#*E(Uh>{H zrQ*++A-jqN-=ppMLIF%nxGp-+$2RnX1bNbBCW_SMLsLwBpL z;Pa@H3kONAofS^6+mbuA{`xnLyOUmo*-S7=K3f{2F^}|Po*K1%_fEa=lMfH>kf^~2 zI%3kh@5G^aRtBOpT9V&}@!MdC(o}&$c=HYYpnauA05HERV}KqO|Iqb%4W+Lf?<{EJ z08grBiPP!Yll1)#fjl6a9;^Un&$yGW>LMEQy+^PE&+7^%qByk~ppW5j^LLA%V&Bp6 zm;WvT3#x5@trGX(6`;bTJSh(62E;V#nD1Fyk0=o~^?zfOJcX@QwF+Kb7D8*0%Kx%;`) zyTl3|ADsxp(Tu&NCP{)>mZB+NB6H7jc;)?ap8i^r$<;4~zLI(HZNZ48*gF$_<*JZF0Z~SS~v& z>~c?B#7dtGAPL9`_YzWFUDMxn+9+2N%ZW2WEKed5>QeZn5t1=03;Kb?W-&pwCqq zJ{R?FG=ehw<$Q(sk)nuOuwhD%b)>9}Tl{=OFYYkE%oH`Ee6;pXy)D{~(-h`(lNodH zQf%q%Bb06YcTK2ncdC`CZ|N4GdSbfWiC2Ez5T{Iysat@L!m57f^*yF-zfS>}J5FNY z6E-n0!-k8O%f7v`;ZY1M=M?=c04U=fSG0W*CKL;Y_s#I_j3$^00^*>Av@ z)I(xme9NpNEu1;cgv}JlKWx2U5E@Y6)>;N1?<(cuJ!@FRXM0jWF>W+lEDWYiV4-bSp*j z-O8jsGF3E3z>kqbNmBeMp-Rly8QT1JQvv+(6mVs8?V>rbwe$y790EweZT4MtaYl#WJQ3TbdpP#>BCoE$egb4!N{-&jEudw4ir`|zQd;D|3fxL$ z6D0P`HV6bI=h3x>P8$@sA=vAOV-u`rEeqrRHob~m_SIujf0`OEP%RCwN(2R~tLooo zs<0~}ak}8Ac`!+(2blf7l<&LQR!59&_q)+7a)Rb6{%z>^5u>V7{AO&k5<-5W-XSm*1NW3 zV=hoiZpSmUL=*vw0xAk*-e}>CHUYuMJGvU#oU^LxXk~lEQApnJ>JwN1QfC_9n3|C> zl{$k6^iIP37*CTr^;urzMUH3bWm~8Rqe~ma;}Mn&D$9xaCF0# zSq5%iVa_DI<3{sm2MSMjUsvc(%0F>-%hu={(@LB7Q7nq(bZrbDlw9q}qzvm1`NQEz z$f>zvP`|y>&_Z!r=9agqM04!^_h)c60{_E{PSO+=N#p?&cSsWg#Ss)WB*_1Uv z&WrQ2sCeyf*J5yc%USI3U>X~Vb@ZA`iH_k@0s>XKk567=ha=al+UQ}g1Bx?AlZC#J zCMQ2N#zB4@mQD|bCM_+Piv$A`Hy!IEIuGAj#J+t!>}VTtTM+FN19{~8FiS?tt9T4R z_hqK``NTUy;mb?`wX3?PY?9?_-yS{)d*yTXY6lv1HUfKZ-idepB#6=4%_N&antx)0%tvixc?tVCcEi@@eoxt+lDmO=2wOeA05#@ry8Fc+#VPW60@ zWj11^V3${=Sgvc+j&$o*ak7qsQ=K=LO+mg_mt*~4gN=O$M%uBBe1`qF>O*WP z(ezi`gu2DJwpQ}e^<3boxRL8Mnc?`N#|jj=K^ctjFD|c4WrO539zE8m3gwpVMl2%A z^5G}{Ms}r1;~Q5>T?KwoJJyjAQ5wQfd!)be1zm<2YDIpL(ZUA7_MiTjUuYq<|(%E_r03NsieE>tqMz0lpUbAIY3rtk7xTS?KsFmG&a;66XAI5fFJ1x}x!(aSo(Z1v_;$YMgb>nbE6y zS$c9#1ROB7iO1G}5?G%sx@SPA=bXpOK3Q|^D)<+n3dQUDyN2p~!Hun%WuGjP3Ml@k3cJ<>ipVO(Bqjls8Z;&uq~L2X_8aNraHFH5PdVih>n@aaKzXw8d6ab-7w%wC^P_;vy*^Y?9;HYY5<3XFf(9#t%@0fK4SUD> z9gi==eOEhrYaKv#b!?fMP8^YHyH=iD{g*F!zI@C33a95L|E#3~G;2*lPcsU1#qK@C zs%*lRnZouq!&{?`uwQx$N82chw{V=B`AAAQzV`iG_f3h}zhX1p;;dXHZN`Lzfe0d; z0l4JOY<)7ZsB+>=lNFrBi(Anxg8{i?7XpKa{$V1p##}JxblLQ@e?qjOa-*26Sk^pI zE%RfOEe;aFi2O~^XUzq<1pDRBXP`8O0x~6a=z>+=1IGaSK@tr&93olu%t^~9XR88= zXTyEs2X8?$xwBKN%1#pe$CchaMeiwqK5w;8j?dvFqXMCW$<~zc?)89(DTy{EBEGsf z&$7e4F~yenYrCQ*JM&`QVU#(Mr9MY1vV&Y+&X6-P$h6pmNpu8NZF0WVw`q_Xw0AQ! zFs5IQb%HzL`nZAn`10KEWV1JI)xt4ip+%u=Cj3zAYV_Auy`OT7^z#=l91xA2tcZ;1 z-M|0Fc?mR8`z2ut8J)9lqq>W<_?6BpD9|gHk4w&!OXptF9E)77AW~Z)9ZEj3zN?aO z0UYjb`ce&d=*#DQ9;4kF*_Kwm5PAkEK$ejGKA1BN;_Xv*JBrpRgRJ!~c~39$eI2&l zy;WDu_6yWzyjP91j9tLgwVb0C7^7?Rt$|s5W_CcONO20DK~q-h)J_?>;~4Q55NpB_ zI-lYwnH8B=n7<_Ta{-zKEB2P&{Cn#7!}Y3~yE*VWOD?I=xYrDs-y{gxm! zt6o+Y)ywZX!u?t4KN2OPOR>H>KX#Kp~U&*3MMa9i0VzM)z@$;*?K(2{Lkvqyr zp8E2_3DT11D6_EN=Gv&P5m{DqIYJmzD45Xsy=nvVPK4+8G%V3=#Z4%PBv)n#f-w`Q>#h4k3o`cIRsTbgJ>=()j$aggtKb(+c zL!WM{JW#foG@!zC~(0z)(Vi zamk!6Ju;vC$>SFP;E_PAOVu?@|a z%2uG_7#$^=VDzOY$<)44{TbTZJ72BG_H2{p*Rn%wk5B95nl5+6<}n-0n`nK-TGXWs z6JxqgqthBH3-B^4hrEhvl7hw*xzZIqA6$SBvVx}D`m5L>LB^7-E$q5Cc6+c+-MKkM=HzRKRH?I-5tpX&cs^#vt={%1=3c03x`$+=_AZA2T zgzRmyKuzbv@_JdN7yhp)dTnfs6}kOZO85jA!OQ{2j$5Uu5APZiCSc!d8|dY`rKKEd zf|I&TOGtk&Y24<4$)?tr(?OS(h;vtDVc-WkJV>p7A;sNZP1R7*k#}DkiK6|*aLDCF zQWZEFEl}Mf-gCPoIOJ#omKU{LwC=AfsiRJE4&hHUD`Q4b`kR;NySYW9P8ML5q?`ImzG-K=hy(oWqo&b6 z`Ygc*%%)j{&n_T*7XMeD1vol>zry~{Wf9YcctO-!?Pu7r-{DOPlQ#0m`nR#7NOne> ztON8=JM2ED$}_Z?$wW`SnVN{IB+;63OI6y785<>%32UmG*OU(Kb+0oIn}TR8Ry#z@ zyq!|}a=w|UfstU7aEv>?eJfiL$gLu(B!UTriyM;nRDel4U8rh5N5 z&TZUt8!a+VV)tTq2s~h zTQ=2V)@7I0E=RpDwFPk_i@go^DFhC3A+XdMwxKrBP zvB)#g?69skbUPt<vTY;ZMZsP?!nZx19>5iNayk>7=9Kd}bRSZranL3XY zR_AHKgQ*0a2NbjmkEmH*`picz2sUHibAI*IeG)DAi~DBdtluLVx4Ym(DZT+S%wTj+ z(Q2K{!A8OAwbb`S9{CyF*BWe_Q;-%w$lz1mfDP%`_lV>AS9T8>g%uG3{pl@@LV?uL>nM;lB%;`~)K*1(W=M z{|A|qznK2BjKxo`6|x^pH$^RO?##_v|DP0y$^PrN+^qb+NpZ8D@h1g<^9RMR)r~jd zH;aCL!o8mVfd5tkbQ6Cw&-)YKBmDR9|IYf}1mAqR_z8|u`2oK1tZ|d!X88M)fnDtf z!*9XxP5kfS$WJs9(xL_u(*FrgZo+@RGyDpV*Zu|m<36D*hxT2(1PKT6mOwapp5FIw F{{n-V7H | +| Text and picture | | \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 8959f8f9..385884a5 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -1,3 +1,4 @@ +import logging import os from pathlib import Path @@ -18,23 +19,109 @@ from docling.document_converter import DocumentConverter from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export +_log = logging.getLogger(__name__) + GENERATE = GEN_TEST_DATA IS_CI = bool(os.getenv("CI")) +@pytest.fixture(scope="module") +def docx_paths() -> list[Path]: + # Define the directory you want to search + directory = Path("./tests/data/docx/") + + # List all docx files in the directory and its subdirectories + docx_files = sorted(directory.rglob("*.docx")) + + return docx_files + + +def get_converter(): + converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) + + return converter + + +@pytest.fixture(scope="module") +def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]: + documents: list[dict[Path, DoclingDocument]] = [] + + converter = get_converter() + + for docx_path in docx_paths: + _log.debug(f"converting {docx_path}") + + gt_path = ( + docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name + ) + + conv_result: ConversionResult = converter.convert(docx_path) + + doc: DoclingDocument = conv_result.document + + assert doc, f"Failed to convert document from file {gt_path}" + documents.append((gt_path, doc)) + + return documents + + +def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument]]): + has_libreoffice = False + try: + cmd = get_libreoffice_cmd(raise_if_unavailable=True) + if cmd is not None: + has_libreoffice = True + except Exception: + pass + + for docx_path, doc in docx_paths: + if not IS_CI and not has_libreoffice and docx_path.name == "drawingml.docx": + print(f"Skipping {docx_path} because no Libreoffice is installed.") + continue + + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(docx_path) + ".md", generate=GENERATE), ( + f"export to markdown failed on {docx_path}" + ) + + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export(pred_itxt, str(docx_path) + ".itxt", generate=GENERATE), ( + f"export to indented-text failed on {docx_path}" + ) + + assert verify_document(doc, str(docx_path) + ".json", generate=GENERATE), ( + f"DoclingDocument verification failed on {docx_path}" + ) + + if docx_path.name == "word_tables.docx": + pred_html: str = doc.export_to_html() + assert verify_export( + pred_text=pred_html, + gtfile=str(docx_path) + ".html", + generate=GENERATE, + ), f"export to html failed on {docx_path}" + + +flaky_file = "textbox.docx" + + +def test_e2e_docx_conversions(documents): + target = [item for item in documents if item[0].name != flaky_file] + _test_e2e_docx_conversions_impl(target) + + @pytest.mark.xfail(strict=False) -def test_textbox_extraction(): - in_path = Path("tests/data/docx/textbox.docx") - in_doc = InputDocument( - path_or_stream=in_path, - format=InputFormat.DOCX, - backend=MsWordDocumentBackend, - ) - backend = MsWordDocumentBackend( - in_doc=in_doc, - path_or_stream=in_path, - ) - doc = backend.convert() +def test_textbox_conversion(documents): + target = [item for item in documents if item[0].name == flaky_file] + _test_e2e_docx_conversions_impl(target) + + +@pytest.mark.xfail(strict=False) +def test_textbox_extraction(documents): + name = "textbox.docx" + doc = next(item[1] for item in documents if item[0].name == name) # Verify if a particular textbox content is extracted textbox_found = False @@ -44,18 +131,9 @@ def test_textbox_extraction(): assert textbox_found -def test_heading_levels(): - in_path = Path("tests/data/docx/word_sample.docx") - in_doc = InputDocument( - path_or_stream=in_path, - format=InputFormat.DOCX, - backend=MsWordDocumentBackend, - ) - backend = MsWordDocumentBackend( - in_doc=in_doc, - path_or_stream=in_path, - ) - doc = backend.convert() +def test_heading_levels(documents): + name = "word_sample.docx" + doc = next(item[1] for item in documents if item[0].name == name) found_lvl_1 = found_lvl_2 = False for item, _ in doc.iterate_items(): @@ -69,104 +147,11 @@ def test_heading_levels(): assert found_lvl_1 and found_lvl_2 -def get_docx_paths(): - # Define the directory you want to search - directory = Path("./tests/data/docx/") +def test_text_after_image_anchors(documents): + """Test to analyse whether text gets parsed after image anchors.""" - # List all PDF files in the directory and its subdirectories - pdf_files = sorted(directory.rglob("*.docx")) - return pdf_files - - -def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) - - return converter - - -def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): - converter = get_converter() - - has_libreoffice = False - try: - cmd = get_libreoffice_cmd(raise_if_unavailable=True) - if cmd is not None: - has_libreoffice = True - except Exception: - pass - - for docx_path in docx_paths: - if ( - not IS_CI - and not has_libreoffice - and str(docx_path) in ("tests/data/docx/drawingml.docx",) - ): - print(f"Skipping {docx_path} because no Libreoffice is installed.") - continue - - gt_path = ( - docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name - ) - - conv_result: ConversionResult = converter.convert(docx_path) - - doc: DoclingDocument = conv_result.document - - pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( - f"export to markdown failed on {docx_path}" - ) - - pred_itxt: str = doc._export_to_indented_text( - max_text_len=70, explicit_tables=False - ) - assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( - f"export to indented-text failed on {docx_path}" - ) - - assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( - f"DoclingDocument verification failed on {docx_path}" - ) - - if docx_path.name == "word_tables.docx": - pred_html: str = doc.export_to_html() - assert verify_export( - pred_text=pred_html, - gtfile=str(gt_path) + ".html", - generate=GENERATE, - ), f"export to html failed on {docx_path}" - - -flaky_path = Path("tests/data/docx/textbox.docx") - - -def test_e2e_docx_conversions(): - _test_e2e_docx_conversions_impl( - docx_paths=[path for path in get_docx_paths() if path != flaky_path] - ) - - -@pytest.mark.xfail(strict=False) -def test_textbox_conversion(): - _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) - - -def test_text_after_image_anchors(): - """ - Test to analyse whether text gets parsed after image anchors. - """ - - in_path = Path("tests/data/docx/word_image_anchors.docx") - in_doc = InputDocument( - path_or_stream=in_path, - format=InputFormat.DOCX, - backend=MsWordDocumentBackend, - ) - backend = MsWordDocumentBackend( - in_doc=in_doc, - path_or_stream=in_path, - ) - doc = backend.convert() + name = "word_image_anchors.docx" + doc = next(item[1] for item in documents if item[0].name == name) found_text_after_anchor_1 = found_text_after_anchor_2 = ( found_text_after_anchor_3 @@ -188,3 +173,38 @@ def test_text_after_image_anchors(): and found_text_after_anchor_3 and found_text_after_anchor_4 ) + + +def test_is_rich_table_cell(docx_paths): + """Test the function is_rich_table_cell.""" + + name = "docx_rich_cells.docx" + path = next(item for item in docx_paths if item.name == name) + + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + filename=name, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=path, + ) + + gt_cells: list[bool] = [] + # table: Table with rich cells + gt_cells.extend([False, False, True, True, True, True, True, False]) + # table: Table with nested table + gt_cells.extend([False, False, False, True, True, True]) + # table: Table with pictures + gt_cells.extend([False, False, False, True, True, False]) + gt_it = iter(gt_cells) + + for idx_t, table in enumerate(backend.docx_obj.tables): + for idx_r, row in enumerate(table.rows): + for idx_c, cell in enumerate(row.cells): + assert next(gt_it) == backend._is_rich_table_cell(cell), ( + f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} " + f"with text: {cell.text}" + )