From a1cb0dd34489a6bc473599745442c83f9b3ec0a8 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Thu, 3 Apr 2025 14:21:34 +0200 Subject: [PATCH] fix minor bugs, mark helper methods internal Signed-off-by: Panos Vagenas --- docling/backend/msword_backend.py | 212 ++++--- tests/data/docx/unit_test_formatting.docx | Bin 19505 -> 19731 bytes .../docling_v2/unit_test_formatting.docx.itxt | 30 + .../docling_v2/unit_test_formatting.docx.json | 577 ++++++++++++++++++ .../docling_v2/unit_test_formatting.docx.md | 17 + tests/test_backend_msword.py | 8 +- 6 files changed, 752 insertions(+), 92 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json create mode 100644 tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 2c4cc65f..d6b73f70 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -26,6 +26,7 @@ from docx.text.run import Run from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError +from pydantic import AnyUrl from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend @@ -121,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None - doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) + doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) return doc else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) - def update_history( + def _update_history( self, name: str, level: Optional[int], @@ -141,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.history["numids"].append(numid) self.history["indents"].append(ilevel) - def prev_name(self) -> Optional[str]: + def _prev_name(self) -> Optional[str]: return self.history["names"][-1] - def prev_level(self) -> Optional[int]: + def _prev_level(self) -> Optional[int]: return self.history["levels"][-1] - def prev_numid(self) -> Optional[int]: + def _prev_numid(self) -> Optional[int]: return self.history["numids"][-1] - def prev_indent(self) -> Optional[int]: + def _prev_indent(self) -> Optional[int]: return self.history["indents"][-1] - def get_level(self) -> int: + def _get_level(self) -> int: """Return the first None index.""" for k, v in self.parents.items(): if k >= 0 and v == None: return k return 0 - def walk_linear( + def _walk_linear( self, body: BaseOxmlElement, docx_obj: DocxDocument, @@ -180,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Check for Tables if element.tag.endswith("tbl"): try: - self.handle_tables(element, docx_obj, doc) + self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") elif drawing_blip: - self.handle_pictures(docx_obj, drawing_blip, doc) + self._handle_pictures(docx_obj, drawing_blip, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -193,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Iterate paragraphs, runs, or text inside . paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) for p in paragraphs: - self.handle_text_elements(p, docx_obj, doc) + self._handle_text_elements(p, docx_obj, doc) # Check for Text elif tag_name in ["p"]: # "tcPr", "sectPr" - self.handle_text_elements(element, docx_obj, doc) + self._handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc - def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]: + def _str_to_int( + self, s: Optional[str], default: Optional[int] = 0 + ) -> Optional[int]: if s is None: return None try: @@ -210,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): except ValueError: return default - def split_text_and_number(self, input_string: str) -> list[str]: + def _split_text_and_number(self, input_string: str) -> list[str]: match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) if match: parts = list(filter(None, match.groups())) @@ -218,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return [input_string] - def get_numId_and_ilvl( + def _get_numId_and_ilvl( self, paragraph: Paragraph ) -> tuple[Optional[int], Optional[int]]: # Access the XML element of the paragraph @@ -233,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None - return self.str_to_int(numId, None), self.str_to_int(ilvl, None) + return self._str_to_int(numId, None), self._str_to_int(ilvl, None) return None, None # If the paragraph is not part of a list - def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]: - parts = self.split_text_and_number(style_label) + def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]: + parts = self._split_text_and_number(style_label) if len(parts) == 2: parts.sort() @@ -246,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label_level: Optional[int] = 0 if parts[0].strip().lower() == "heading": label_str = "Heading" - label_level = self.str_to_int(parts[1], None) + label_level = self._str_to_int(parts[1], None) if parts[1].strip().lower() == "heading": label_str = "Heading" - label_level = self.str_to_int(parts[0], None) + label_level = self._str_to_int(parts[0], None) return label_str, label_level return style_label, None - def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: + def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: if paragraph.style is None: return "Normal", None @@ -267,21 +270,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if ":" in label: parts = label.split(":") if len(parts) == 2: - return parts[0], self.str_to_int(parts[1], None) + return parts[0], self._str_to_int(parts[1], None) if "heading" in label.lower(): - return self.get_heading_and_level(label) + return self._get_heading_and_level(label) if "heading" in name.lower(): - return self.get_heading_and_level(name) + return self._get_heading_and_level(name) return label, None @classmethod - def _get_format_from_run(cls, run: Run) -> Formatting: - return Formatting( - bold=run.bold if run.bold is not None else False, - italic=run.italic if run.italic is not None else False, - underline=run.underline if run.underline is not None else False, + def _get_format_from_run(cls, run: Run) -> Optional[Formatting]: + has_any_formatting = run.bold or run.italic or run.underline + return ( + Formatting( + bold=run.bold or False, + italic=run.italic or False, + underline=run.underline or False, + ) + if has_any_formatting + else None ) def _get_paragraph_elements(self, paragraph: Paragraph): @@ -289,7 +297,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): Extract paragraph elements along with their formatting and hyperlink """ - paragraph_elements: list[tuple[str, Formatting, Path | None]] = [] + # for now retain empty paragraphs for backwards compatibility: + if paragraph.text.strip() == "": + return [("", None, None)] + + paragraph_elements: list[ + tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]] + ] = [] group_text = "" previous_format = None @@ -306,13 +320,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: continue - # Initialize previous_format with the first format - previous_format = previous_format or format - - if (len(text.strip()) and (format != previous_format)) or ( + if (len(text.strip()) and format != previous_format) or ( hyperlink is not None ): - # If the style changes for a non empty text, add the previous group if len(group_text.strip()) > 0: paragraph_elements.append( @@ -335,7 +345,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return paragraph_elements - def handle_equations_in_text(self, element, text): + def _handle_equations_in_text(self, element, text): only_texts = [] only_equations = [] texts_and_equations = [] @@ -381,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return output_text, only_equations - def handle_text_elements( + def _create_or_reuse_parent( + self, + *, + doc: DoclingDocument, + prev_parent: Optional[NodeItem], + paragraph_elements: list, + ) -> Optional[NodeItem]: + return ( + doc.add_group(label=GroupLabel.INLINE, parent=prev_parent) + if len(paragraph_elements) > 1 + else prev_parent + ) + + def _handle_text_elements( self, element: BaseOxmlElement, docx_obj: DocxDocument, @@ -390,19 +413,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph = Paragraph(element, docx_obj) raw_text = paragraph.text - text, equations = self.handle_equations_in_text(element=element, text=raw_text) + text, equations = self._handle_equations_in_text(element=element, text=raw_text) if text is None: return paragraph_elements = self._get_paragraph_elements(paragraph) + text = text.strip() # Common styles for bullet and numbered lists. # "List Bullet", "List Number", "List Paragraph" # Identify wether list is a numbered list or not # is_numbered = "List Bullet" not in paragraph.style.name is_numbered = False - p_style_id, p_level = self.get_label_and_level(paragraph) - numid, ilevel = self.get_numId_and_ilvl(paragraph) + p_style_id, p_level = self._get_label_and_level(paragraph) + numid, ilevel = self._get_numId_and_ilvl(paragraph) if numid == 0: numid = None @@ -413,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): and ilevel is not None and p_style_id not in ["Title", "Heading"] ): - self._add_listitem( - doc, - numid, - ilevel, - paragraph_elements, - is_numbered, + self._add_list_item( + doc=doc, + numid=numid, + ilevel=ilevel, + elements=paragraph_elements, + is_numbered=is_numbered, ) - self.update_history(p_style_id, p_level, numid, ilevel) + self._update_history(p_style_id, p_level, numid, ilevel) return elif ( numid is None - and self.prev_numid() is not None + and self._prev_numid() is not None and p_style_id not in ["Title", "Heading"] ): # Close list if self.level_at_new_list: @@ -452,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) else: is_numbered_style = False - self.add_header(doc, p_level, text, is_numbered_style) + self._add_header(doc, p_level, text, is_numbered_style) elif len(equations) > 0: if (raw_text is None or len(raw_text) == 0) and len(text) > 0: # Standalone equation - level = self.get_level() + level = self._get_level() doc.add_text( label=DocItemLabel.FORMULA, parent=self.parents[level - 1], @@ -465,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) else: # Inline equation - level = self.get_level() + level = self._get_level() inline_equation = doc.add_group( label=GroupLabel.INLINE, parent=self.parents[level - 1] ) @@ -504,14 +528,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): "ListBullet", "Quote", ]: - level = self.get_level() - inline_fmt = doc.add_group( - label=GroupLabel.INLINE, parent=self.parents[level - 1] + level = self._get_level() + parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents.get(level - 1), + paragraph_elements=paragraph_elements, ) for text, format, hyperlink in paragraph_elements: doc.add_text( label=DocItemLabel.PARAGRAPH, - parent=inline_fmt, + parent=parent, text=text, formatting=format, hyperlink=hyperlink, @@ -520,30 +546,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: # Text style names can, and will have, not only default values but user values too # hence we treat all other labels as pure text - level = self.get_level() - inline_fmt = doc.add_group( - label=GroupLabel.INLINE, parent=self.parents[level - 1] + level = self._get_level() + parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents.get(level - 1), + paragraph_elements=paragraph_elements, ) for text, format, hyperlink in paragraph_elements: doc.add_text( label=DocItemLabel.PARAGRAPH, - parent=inline_fmt, + parent=parent, text=text, formatting=format, hyperlink=hyperlink, ) - self.update_history(p_style_id, p_level, numid, ilevel) + self._update_history(p_style_id, p_level, numid, ilevel) return - def add_header( + def _add_header( self, doc: DoclingDocument, curr_level: Optional[int], text: str, is_numbered_style: bool = False, ) -> None: - level = self.get_level() + level = self._get_level() if isinstance(curr_level, int): if curr_level > level: # add invisible group @@ -599,8 +627,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return - def _add_listitem( + def _add_list_item( self, + *, doc: DoclingDocument, numid: int, ilevel: int, @@ -609,9 +638,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) -> None: enum_marker = "" - level = self.get_level() - prev_indent = self.prev_indent() - if self.prev_numid() is None: # Open new list + level = self._get_level() + prev_indent = self._prev_indent() + if self._prev_numid() is None: # Open new list self.level_at_new_list = level self.parents[level] = doc.add_group( @@ -623,22 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - - inline_fmt = doc.add_group( - label=GroupLabel.INLINE, parent=self.parents[level] + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[level], + paragraph_elements=elements, ) for text, format, hyperlink in elements: doc.add_list_item( marker=enum_marker, enumerated=is_numbered, - parent=inline_fmt, + parent=new_parent, text=text, formatting=format, hyperlink=hyperlink, ) elif ( - self.prev_numid() == numid + self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and prev_indent < ilevel @@ -667,21 +697,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): enum_marker = str(self.listIter) + "." is_numbered = True - inline_fmt = doc.add_group( - label=GroupLabel.INLINE, - parent=self.parents[self.level_at_new_list + ilevel], + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[self.level_at_new_list + ilevel], + paragraph_elements=elements, ) for text, format, hyperlink in elements: doc.add_list_item( marker=enum_marker, enumerated=is_numbered, - parent=inline_fmt, + parent=new_parent, text=text, formatting=format, hyperlink=hyperlink, ) elif ( - self.prev_numid() == numid + self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and ilevel < prev_indent @@ -695,43 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - inline_fmt = doc.add_group( - label=GroupLabel.INLINE, - parent=self.parents[self.level_at_new_list + ilevel], + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[self.level_at_new_list + ilevel], + paragraph_elements=elements, ) for text, format, hyperlink in elements: doc.add_list_item( marker=enum_marker, enumerated=is_numbered, - parent=inline_fmt, + parent=new_parent, text=text, formatting=format, hyperlink=hyperlink, ) self.listIter = 0 - elif self.prev_numid() == numid or prev_indent == ilevel: + elif self._prev_numid() == numid or prev_indent == ilevel: # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - inline_fmt = doc.add_group( - label=GroupLabel.INLINE, parent=self.parents[level - 1] + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[level - 1], + paragraph_elements=elements, ) for text, format, hyperlink in elements: # Add the list item to the parent group doc.add_list_item( marker=enum_marker, enumerated=is_numbered, - parent=inline_fmt, + parent=new_parent, text=text, formatting=format, hyperlink=hyperlink, ) return - def handle_tables( + def _handle_tables( self, element: BaseOxmlElement, docx_obj: DocxDocument, @@ -746,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): cell_element = table.rows[0].cells[0] # In case we have a table of only 1 cell, we consider it furniture # And proceed processing the content of the cell as though it's in the document body - self.walk_linear(cell_element._element, docx_obj, doc) + self._walk_linear(cell_element._element, docx_obj, doc) return data = TableData(num_rows=num_rows, num_cols=num_cols) @@ -791,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): data.table_cells.append(table_cell) col_idx += cell.grid_span - level = self.get_level() + level = self._get_level() doc.add_table(data=data, parent=self.parents[level - 1]) return - def handle_pictures( + def _handle_pictures( self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument ) -> None: def get_docx_image(drawing_blip): @@ -808,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): image_data = image_part.blob # Get the binary image data return image_data - level = self.get_level() + level = self._get_level() # Open the BytesIO object with PIL to create an Image try: image_data = get_docx_image(drawing_blip) diff --git a/tests/data/docx/unit_test_formatting.docx b/tests/data/docx/unit_test_formatting.docx index da816467cfa4f56de93124797034767181d0edb1..5d08668e4faf4fe93982a4f0b62ddd177597f5ff 100644 GIT binary patch delta 13758 zcmbVz1#D%zl5MEN%*@OT9cE^x4l^_7gqcrw7&^?%Nr#!4xx>tyPJZrt^X6)l%BSV>kvgS2V2wTkG?C(1P4Ev``T^vUaQh*t zH}}U=f}QxkQpMWg_kAgmd2SKK8!Tgs967$2{5h>P<++)Sz&o*D1NPKU%+rFMbQV zP=C0e)*tZpiNi@7FVS78NarQ{E1G|pP8AR zn)%k2?3X~K=uvYLbjd~X&}z7esuiJ-ExtL%Nrhfj8zJ@mQci8VVm#mt14zJ4E-nRE7&o?S(naPHVW^lD663(-LKF%B?5X%<`o?+ZSJ z?4jIG@1Yz_OMQU7(VyTiZNaqS+GzOk36fm+c;kLvDla!j-CEkY*Q$Ff98P08JyMYG zyJ~Gjj1;T;EP31$GtdL{c2N3He(U ztu|ZhYu4V)lqN2Qc}n4yo`_RBLw8*7h;3rAN))XyBf8NnyBr<~B5RM?v#`xtX8;kL z#~a|sF!5CwtJhV?hJV6z#927&@VxtWIQsDlL zFhEa@g}7>9lL>R&1=3b@eR=9PlU(yNPhgG{iy}`9zh@}?9w!mzUZ<5)Cwew5Y4IHB zy(7)L_4`TZ$I5N>tvi;pWtwqJ2?Q=gJ5PN~B+RyQ&1u<&<{j%umpw=$cNHC|Yc$|K zS-yKu4sF`2`Lv)9vY;FnKNx0jPpv#GSNlP(UFPL*x7p8UHt4{R3^~_Kyr%_xFG4?78XUDI#pH zC#PEYg}o-U-&=b!F1YnIc#dfod3Od2U8k#IS^XjW(It)?^Zqj=uY8BoY^W%Ilc#R7 zPR;49!+NMRDxU44(d@Tx83+QcA9TD*;Ye(?D4Opzu2$DD(=5fn@aAA=281*Kb6uDv zH76owk8Fgt7v-@$p_L|9$5ebL1ETWim&7DT{xd95YXu{J9tSD4o)g-Ufcd&l8rj~B zB2vPxo+OH03n>%KwfS|t{2EJ>kO2!fb`C#YvNZ;pe>*-$tnnoEuqPa+0E0maO5seC z?~tWF&cwK3zr}~WDK=7d#`h@trP;alXRq(KJR)b}9bkS#L>VG2)f-XZ}jkck1h4jIhoLt9kmyu_QTP{Wla(mxnS=bNoI z1Oja2gzn{=U`CpACj2PM1FZS5UB;iEFHLg5k>asrGTD-8_iazTJzSQ0;IFC8iYs4V zmzOx^sMP^Qg!rwP@IOxP~W{S}_py!)8cfTy|{(;Q~;pIXWBQ9oS7_ zE_#ZPZm0srVg0J(VI}f4y30-iF@Zu7>hTSk5^c3D3+zc-RF%bvK>}86vGcqN4S?>} z?s51*J(ZG)d_GXveO*d(Ncx`&2C9IhlX9a;O%BMSGrJ(t-34Szw;<5pi=~T0gmoVPz`9LH?8>?`2moU3Hvc=s3P?V8jk_zS;^A*WDsYXjlZsUfs}=^8EF3e2kW3ke-EE z{OH9~6ek`dY%{Z;+i~_cj7A&-M4?T2>YLLR0LLyPxV94~R z>REu@gm6=HUNlqaMs`ja8roVU>rk?o$I>&L>y45@C2>Qfe3c z*Ioh+q9h9rfer!*0{!_;0`k?WN3sW)g+};4RU15zTwK1aO1ZdG{`3J<&jgINbC|Q$ z_mu#RQG0~;rw(Dve<^qHeH4~q1pht1<-b@9rmTSE_DPw2Mm zd3K5WC6__hI)U*h8YBDyB5_L`Qc}gnI5rMhfwTn9A{RXsK*lmL9gh?n)hZn=3Q>fD z`pUQZM4AG60r$YC-#5NdcX!wcn?d;rWD*mbG|9;9sMh2up9G~%yj*}f6(k4)(w~Tf zC{~BKybKDKDo{v09$lVHUwhpcm*xnVJSTN4V8!v5ZdU-hLksXBRdca9uQbn*`%Wsq zvw_Ik-Yfxr4Nt(NZszrDXLiXeL40EfoB+;*P#BZu4)9q~8=ogf4K;pL+&9ne)&u@n z@>{al>ZD#^&g9cHSotm_EN5*r2YrumRuxB}(GkM`vZT$pcG6umYshRv{2C8dVtQ_f%(iuIzJdM~C;d~mEv{ZfNrugUHK2>yC-%Z}N%;scB^gPv=8MW&6$0g(G+T0^=soyk`hZ?pr zbcC_*ZcC4-DbVX0T{6RW78R4w|R-)+850uZ1ub8wOGLD+gB-|4Lfe~$|lOxO>NKz{n^(!Af? z+NaBoh6ryBzRT9TW6b-B)VyJq{1orCUy?dtE4I40`@3_H?srXzfxBXRVNX zF5BiwcHxH_6uaf$-jildTp_+a|V5iL@8z5Rn zW8ZKr9>B&8wM_ubV_THeVC7-N<|CH9urO�o1{`yXAl6O)d+De*9^)qNgTm zqNi^bzO!wuZLB2x-9GBeBu!#zFI!C_&-0{wKxtA|ifBxWAYjh1_ml`87qoP%K#QMh z{Xu8fzm?# zdS#)^wx|h7YUj?2h?lCtQ5+SuTeDcZ^BJvFry>&5NRZ@dSN3sCep*4K{HH_zDlxob z9L;8%UQilw{8zMcNMP*N-9j$U?;`hg(kdF#6Xnwi`u<}DXD$uJO{O{P9aa#h=~o*7 zQd0q{1D-xu+E(K0Ig%9IE@vf(<5OWsZ!?H>G{icaQAYlh-HSIpUK)?Af-61gI}gVX zRc7XRjKwDyXA6{yz5&+tr+w&0Q{lG3*eQ%dFqho z()n(B(9-2<_stL2JltFcW{|XYotEhMZsHtdL3=27I&(~?;@F`+~){<4Vv^Nou$C_ z2g8<~4dZYTga>gWRYDfpzWyT?V(Y82=!^D9`!Zs1-WeqF%aYP;r{gzy#Idj__zRAC z(XsSNUJ!dUro|*FL%k*h23d9D`Xp4x?V5F16=AD6)I#7~5a3MKOOexN1W)Tu$}k6R z>~d$Lc4bBf<0s%}gEc-`^H4nj3OP`%0xv1BF?C7hN~L=2nMiL?N#e`;&A0q*y+yrQ z!GULZ^tM-)`W!~werv$3tiI0S6eSj)VqyA0q_M%?&`g^H_D>Zctip*jTFWlaIcY1_ z**?tzO^N05SAeC2C_ZkcAy3C8t0_Y~g*-{B%|mFI+q?gk3}nm+`Fl9jmJ7%v4m}_w zvr~|hy-@68XW2iAdEt_$Lgv~4Ji!{+aMS5;5a z?vD(A@GwQCoi29L`NZ^OzI6~>&d0AUwDJ`({ffKMKmrK#$n!8Xu0vP;CVBqqDn>sg zQl9IpM@p?7B0=7OukXy5@FVtDcgS*bFxXH8@!4s#Q=sPs?VTL7POH7~yfDq+q9dgu z%Fs;^1Rjdd&T=G*)0sNETCZ%sXqEQRJ!U71ecT}>UA}{7AiwQ8O&lhS4t64h(OAhY zDLNA=+i$>6Y7Ww^apb8mG*&WClv4qxz^5v-K}u5;CY7y{T@xmSSa;tk-L%^Yr1R-a zhrn1r3p0`5AW(5L&D=fy*5fnW;@m=%nuV9pUX=k$zLe_KC_rm6?6h?^1EgE~0SP{_ zip8g}UbKj8veQ2j!S&im4Hz_I=%YU&=My!?G8^SZ;5ZmBBbMWS^}!l_?;YF!6It#9 zqkP@I{aPf66vXbQIwNaa0?$8a{T!&Svw6W~Q>osUe-6L8^gJY)`JrUHd*hHP{FL?Y)kR&h{8{}UjKgY zds&uPmTzOl^$r8qcDE=4AWba gOupy@dCz}*n#klm_*H{NydKRH25 zAOCCg{K#*0w+{{i5`zH(g8HA(!rsl!*v#3=9$@j8j~!20v)|?X3@+f#c%j=AOMqs& zS{8}&oZ?aEdEG@v91Jtb$b=9{H#<#C_3<_F3z|ntRBOs@340Cqu{6L;#e9Req1{I! z_ZS^KQgzCwUiWH(HBvrI>A@wq*T>@U@-#b|Sg)YcZBwJm$U4FGU$4i1IDdc9MTNlw zFc>;w*xh;>73&pCcOwz74a46YeFK#*Vp?|+5Tr~zTqc?x zL)LNQxWk>aL?J(v_{M0(88^?PP~p=J5Le))uL?2M>eFnMfFC;~?E-U*b88y6EPCvA z?S%|xd*>r1i?D_nSA;|>iiiwR68t!J$YM_L7#i*wrt=hW);bB z)z-zkt3P!_z)7?)s*5HmO)ES8F!$0WSk}U&FG4WAB?IM*;e2gsU7NH1P+PqKlrK4g zJVd&Fp@eI6pe@^2OiH=y{baCcY$VS>?oTdt|C;`-fuFmoWc-9U`wyx+ChW3(rP zu6s7ZfcFjReWjaGJEl-Z8`&IJW~`C{1=t~1ZL!}XIeEzD|1W#!avYP5Ihz+A$h5|dp8M(j6+nqNvb6$+aC&X2>ipW>F zroa3yoJAdak?pAfI>Z}N98rQ2N`o;JumjcmJlki*{J%p8W-x^H{%ExWH1s%DP&N~Z zUXsAunrdhXyFH}YLi?>A7OVg-QdnjEIc(-D)O|R{tm;ip!s`|1e2R5)@#+(LlqfKc zGkfD((YQV0c3aF4&En)+bO(gvU%OXF%!FxX7_MRbx6=!|-rqJSKTx^qzgL``KRe9A z<8Z}W6_JQmkCPAh>btK2{Iw{fS10NS2_cq7Bg%inBknEn>%!`UZ3fv>q2WL|ifCYe zFEyh0@*}cCt@oUIC#O0DyDS(FVmG7(Ep$Ahp)=t;5tqr_WMP9_j-on$wjeVw8SEtq zfd#dr=lPjajcW>F==cHZj1zD*pJ-j|(g$&4z%t)OJZ%{4Rlo89IIr2p2=8cR4<>;6 zs@Jef3((Kr1p}Cl_}dJHVx6=^vekX>%Gxc$I&K+pHLZM)$LDHS8Vc`HEj#XFnz2*8}&m90hT?7V< z&%jHp1_T7@KiI*acwRWO>x1loXZFCRY6y(R#aW&xi`j~Glhb-F*jCgWV^L@ZD z?eO$AKY!fd24l=$YLO2U4@3ClY1Ksbr34<02Gkamp3@X-FZWNn6a-4*xQap6bjh$j_bIp z;Lx1Z5i>28;csLUHd@*VP_p=XJu`_7W<0uewym)(5snc++&sh@>T9CYRQ$sRAnKJD z1&`kIp#u@%H@VrZ6mew9otU8)tkyfzQzBqq@m;(!#hyhFGx3e&*8?!YKH+J}w}(?X zdt~6w#2RxV>BS|Z-|K)st9~4*&u8YF)3iEu+{b}iW+-@VXgxh3E40FCSLNMu{kMge z-MtxhNHCdKzhw5H@vQ|VzTj+mo!}@2(0Jm9?ffF3En)y^piTVclr`wSw3eqzmWT9b-rOH3|)X?^hNCpxRzvbk9nfw%3s6P6jiyr`{UN zw=ht^huz4oA`Vj7eb;(f0vp|%DO&W_?uyzqfmzQ4R!+597`mfz+XZO#hYi+R!o->+!r~6)L9oYBOwVv9b4%=M z^`Zv?sX7^8ZF&Tn#C&)!-UWIQJOC-%VeMT|j(rCp_e2bC+kCG9n zr9Ob{G1Mtp%uQ0?gaPTxu-hlL_#}=;Eo5juk?!x&HzU7MQc@M3gNpecc+3pv0ws*b zJ3Q;t(fSY4`4fz;xryBk0;Q7BM+3|=d}+i{Q6cW5o~>rPtYFB-kS=>Gx~j<|JX#}3 z6+-R$94 zx5QAPn%S(i%!g|DvoLt`j-mmx%_+ymio@QADs~0Mllj9{#k8~R#~(Zmqx+Nvdks>% z>_BqM!e8t%z_2a8YA@Q+x}i;}W@&T^MZTY+zsBaohII6A$i$_oMz$&zWs^h8G**L- zHW#oSr9CfK+8zNgelc3ub64)S)&sr6`FO&=S^*!E?TU+aW4yP#Yo72k&?5F!|x^(AgOzP{GPfEDFj7W9r}LC zE9tBd;yI*Y!+%6Poje?^Ncw>s6NbLorHwo4MMA5*0So{v6d}}4ym?$aD!LJ&#+#ZS z9m-c75G^QaYR~oD6H(fJ{*Y30ia!$Xrfi!wZ@=W6Vd5{0d=sZ16FH?&v*0V6W4J1r zY6e_eTqRLUV`W5427Vz&WsI{Y@5&QF2TAX-l{HP zs)D?>Gcp52Wl~u(B-jo?QvAljS7DJoQhbFm2nV2yDK0I--mKxqJ?i|r+306}k`}uu z|Aftw!vv(>F5S;OYs&l9smrz{Hy1qSmsP~S6^b;r-maTm|Hm!$`1;#ep5Yej8lamp zM{%N$KGyuw3)x;nTy&=oea^|l1Y<^bZ~&Z@VQ@jpce^a)k?c**tt2RV;nGND)fRM*)Hfg{ancckA zwmttRx2oBA8{J~vrXR~M)YSWEXlOUIg83?H3W7%D+E?~ zS|k+NGFJq;L1_mBo=VL!9*wra1BF)1>GKhW!w~uRkpEx^h8_SKE>yW9=nYCF5IQBu zU<_&;-_A(>f8$6XWJHiNOw2W4CiJW!M z_=7BXYSm%V^;$?bS7LA)@4S>dxRD#Bu~OS6810xw9{+7OINU_t!4(|lP&u`P#(K=` z{%chm1(@5;?P2)?l`r7rh~J>iG&@vjNe=X>pqwIAHLW4kcLNpO3)m!8tsA6#SitbQ z$RNzcZ6(_#lQJMrlgA4-XlGSq&exFIY<8N?j)NFQ;PU7mQD^P`#?a9h> zsgvV+H`nm13!S(j)C9rET+$Lwc33?zE|Z9WZbWUfb!W(Wx*SSwg57iWevSA#}VS3 zkLzxxRYn4qY#k9$*k%!K8|qYyE5f6Skh@+I7R*b(fD9_jA=dplII{$j`QlTUMH$I= zyoH}K5~_f|B#J_OVtOL2L3R~B#TDU-G%8n`D>s!$u;{T3A@o{@wIH^dN61cch=yC2 znxAViEFj~TQw*#3@KouP0sng~|6GdGf(qKgg-9crVd&d9}y0gv!sk@0Jz*B`7KDQ^vd)N0p9s-oFKv zCs+KZ^Iu;XQTQL7|B~`Qr*QWzH3R}s9nhv*%l>$@iS-y}HkNr0;MpnW-q%L;I|!8Y zDZ5m%@VaZbf2ND96u_l^mP42z7jA_Y{rf(g;3(X@0^C)65H8;@e* z#;AB`g$X%uu)7N!o0<60y56?9W$Cl$`$rhxpb`idM3h}xC?_aHH-31NW!wwCTk8D~ z#o(O%4PGJE8tuPt01V=O}jVtT()qC|jLQ_r>vAf!-T?zcKY;VH4W%|a7F z=_x@FRGd_}=^*>vfag~jL0|bW{Fb(7EW*nD)$FNv*wQlYw*z;i^ebEd)sA}0I_kIA zpd;j5R#vj+N=5B5H-1qwV+0Hr0Fr!(+&9mX%tf+7saIx|@4iQ56!RYdgX6{f*ol~ku4&V9Ffzy%ljAdrNs{R(Uojy4NnypJXsl zth+L!Y<##jbMRoaFw$MEFt^gBy0{ao;`sShQcWP9ec|`enik|d@Hwo*-s&pUin{hs zl;n(0(WAoRgFabjLLIVh_|OS|I`7)y3lLf3RZX9_1`OYhOM#fRb6KRHy&+p|hxma<J+^S_vn z{7xsL1;|d`E5|moCaUGpH5FYGV5^jgwtF{tC)qFkSV-<(eY`%9?xK}c`mfNZV;u5~*-z>iKIyp{e5az6B>fxN3Ssh7c9{(~k zb^ua2JE2GzpH5u8JiN5UBRK%6WwJdDk=R>?0L{)j_vd%EreD)3v9jfhhw4u^o^;up zbC?nDPa`YI`ZhFaAKkVb9iBM}CkI6_FqwXCm#u#75ZLNo{fHq1~FE5*z9|YHCWl z$mUGw-OnQuHge%KMWw6knt|w=><>*D<2;UvwEFZR6bvf4qWw+QWUEi7!{Q{fo7|EvVncH)}F@b%lNgjDDvyp+-pP;`y$ z?gy;wc>Y#Oo|cw{S$0@+JVU#cvvos3tO)RgY4R6y-reSJDF0aa?#{72O`8tXCxyT8lJlAyX!YQ`l z1XTqY%o|;ntHv3{;J5-Altqe)Yh=T6C)2QR^##w&XDMIVh>lS zhP!rj`8Hw*dBvo^sCp*IogO;Z+)zGij^Ov?Mn}>Okw)cq?Mv>ZX0l8dJjO6*#gKya``IuXDp!d&ldUGs6wV-3F$w0e3U( zcQ^l)?!*2EO^lr0@+$cAn&7J5Q-m25V1*JpOALAf78oMt6_HgNKF{ty><;tCC$m5v zM&#_k>Cn=5XkZ|vB44pa!Lm_kB;+SF`kY>gbs(YU*b~0^r0RfGB?kR8B7PgSq9Uxh za}mS3u3fh`C?hQSzOy-|<$1>li@PmCQk%3=$+Yj1h6<62u94_Um_~rA$Yc!!K#}Jj zy}5fCUaUdVnUi>8agiW^4W2_vX>-RkX*+*0_{YUmVAZWCP3RWRld88R;t3NKq|qEZ zkn*&^=mVl_eM7)iF*ek?!#H3xsD2&JKva9%1e>(k?-k_W^(G-CRr}|+wHsln$~?T1 zyQVq(lgeGZWsX&6tRhM61SiLF0PJobfzLw;@;ryzJ0t#sK1kR4k4H5tEdEe3>XWbQ zOv1b-JE{$W`Qx(+ta}5ShyJ|V%2!mE8+@+W_2s7vs6S|=i=C1NM|{N(p4lg0yzH>> zCx`-_yLvwQwdIOfMl__8riN2JLGB1+KP;aS{z-0v`HQ(1EQr{#R!=5Q0Sf4`Ut@;v zKw1{WZsV#0a!`Yb4%e+fAj*QT3WiCoKP5S&DDM(_C>+SG-J|pChA*UR#>P zl#CehY&Ku0F#W=Pj<@?@^PmN2Z8H)V*~c~O2Z1#?`Pm8!B*w$glzcOo89d!I z@GesfyelZlh0`DzzWgK2Ys2Rrq^PXyqt?VpFkDvf)ZedhPBJ2Mg%V*wltEsN6%;RJ zi%3fw%P!nXof2fB=Zn~bTbW=^*M&picf){*6ugrs$mIErv?P+?CLJ#iUg@E-^iV(?&ARs_!M0|QBSx_(rkiU7He=(H1KZ!tBpG2U)ZtMOo5-30gX>$C3 z5j{TvC!c`72orx7PIO?LG#UP1Y`KL`K-7OxiT(yqVgVXRQMJ&vl0h0i$He@&DU%eS-e6Zc*Tw5?o%~J-EBOySux)g}~+8=bpQ3pZ~AA zb*E~o)~dGEH9gO)>Fxze1+Pv9ueyc?Ob@@?E6M#_9v z;UAM@xm+yOJXz7epVx!YMU!9)FL7LIEZcv-vM#3EYt_LMjavY#g5aCxQvBYqybb@YI&bjqoORJagnOkG=IF1`anpR zDZ}{0v*4SRvX&ALrGG-#zL#EL2WBGsmuICtv)HpFSzjnE@QdBjYu>Q1ZylATgSEp{QLtatSmG**_G zyh)KN7p6}ony*&=z`{QIf{<1=n%d`))Gc>E{+8_}!;4YSOo9~5KrI%&kVK{)sId$y z!gNPSvEbU_^E6X5TEA5kG_ARnoHHKSyDCdnjfTcn@FQj`qoNbtNb{z}9H^1pCv>z$ zL3IF50M4bp1JL2|*`l&}L<|!xSEuHo{Tcu(8|^)i`e+cXb1~9kr|l~zYzw>h%H1tw?iI9 z*e=MtI7M^kK`z_BESW~wS-eVa&0&9&1|+$OPi@ft%y@+%C@$NT5hSdBOR=c*mQsd1 zhLKDGYR0BvE_H&jN6`AZ!3Sz>cpCm7wt;Nsdj=PZJ3(;c?3v`{egyd$)e5hNs*H;b z!2%;>ACC!X(5>C@VB!Z=!f?E5<6FIBX=Ym5t#E>3sJSJQT+gIiqUs(&gGCU$kN#q% z7%IBSE&rhP#JI&GYMJah*~{m(7jQwKrd4q`EGx#L%T$JWay<#1$(q3Bez&%#uQ z2cN^x&}=myAYn{3+jYI9@$P;sJVVg{83HA_u%E5XuZ0d~f;0*N~gNC7OQGd?F7_9pO z)OxPGn`Sq!F8$C?A#yicHaC=4(0=b79jmvL9?X;HeyCsN za`|Y`xt5^PlbkDWbZiNopfWr?p^&hH28LdcEbJl1N(m&JhPH@AfF)Am^<pH#*OQ;+q83Y0%T25kTzKW4Z?1G)+^QyN0D__q`N02?8v% zA4d;`5O!jC`Vz~i2#oya! z#NfLKSn@eGph9YD3Iu;DCa=C1zkS5|`!VM0y6keDt)RPy7!skRqD?4zcd1c9DX$Az zSHS8mle#K3@DhjYnTO@e=e?FV3`_x%&lp!6i(xJ=yu6SZ)-dv=C^gOMdfpXh2e0fI zlw2824)QWCkL{hpza!>?zUTH4!WShw)DQ2?_EhTctvKVQR9o2I^$z~Fb%$LWDe0^p z(c&66)4l&<_3C`+xj_d@%!N{Z?1!PXXDz^e)0>=#k|tJFOpRG+HShPAsK};eU;TT6 zfgY5xnps_uApe(3%{@Ksn9QeR)QW|7FA>ckpSq@vpKI^npcrIKC~eAEupW>MCNZ#X zm&S}acu8~q5zEE?fs_&oCFRG38w#fnU==LjSile`na3X@48fW>b>Vtl+{5JC+s#42 z!L@Y~?F+)9nUpPp#m;Np5GV4Th)b@EOM^}G?~BvWx44G^vyoo8EA5-UZ!VgW@q@a! zB}BpwL)jH|>vRn2dhQPrOnId6GN3CcpAZ@&2bc3o^Bk@Bq&|iXOwOhz31FBUgG=71 z>E4RyR8;!sFm|UhcatLS*r3pG9Z+h$6nFR|VHk3K)8qGB-`wmU6 z=lV^Nlerv!!b6kcpI}BQ^SAHU1=}&kBv0nzJ?m{TsB*E{aUrZ&rZDxX?*6=BJChzd z+$B1Nii6X43qA^Eb(&%}<0yjgZ7WpoEN2a1Q0b&s8H!5uiFlX;63%9P`6CVI;Q;Nx zyTLyG-&&$n7L=YV&N)|d$YG>DL_@;ASHM?$L>i(?L zF~bsT7<`+JqHY=zQULZsI2b!BT)RtBm6r?Ee>S)k)rioSHembOB8fYIu5(*{WNuar zw$`LSpYJAxI6OafCONAw@yVUOc>aO;Cz?MU1~fK@3YpLtw#?Rr0!KF1K6@mJx_G-kkp~ML2C>>LSKZW zZkel)<-(fdY|w~PkS-H9JJ&B}(^J9bW(vI&u#GO#713%aEg)9uRZ#QFM%J)YAc`62 zVa2@6Xk~K&ZaS|O+McqY0^+JQBZ5qay{TwPq>Z0TkTk#cmbhKH*>=Lo7LTCETeD5U zU|o~@^yZ$7=P*4=d9#N}QMKmD^iU{#s+y0PRcENigJTEuX1sBWozul1{piOjy~pCw zmby!qN&UxHHKM`Fs_FF+^qVM{^Q zwN4UNVYFp7E;>%C{V}lmrf$Ry`6n3&%h*cT&%)R(l42LgRk>r?{+n7)1)+H5u`NXySu4)3 zB9j`m5_wY%f=f*T+pzi4O;#xtB*r|k15&xI&`Kc(u&ho}ENVv6UDv^)bFswQpLKHI zSglJZ>vq1^4LT2R{PCKFD*M{g7#{OCm1U9#SWq5@k|E(z#d{s;CYRm>V`ZkL*vfK< z@%ieyoJ06{csMuQbU9_aC1>0e_$MpaU#6tGJ35AM*3qR?p9aN<2fkDBP|BjzgFlT* z$dEmVkJ?8kP=&oYt2|Ddto{F>f>;c}E}1 zgU;*HHaJ!=gASvdU^Uq_ieB;lacHfg@84}xVaRtxn>3A)Uuv2o(xh`nE}d%#W;+- zGTC(w6#%`MeLpbgCo5)TE;pU$f5rHPL&qQ||L#yVK_tF6ST&3y>Ts3gnY&S6JK5Jd z@o-K{cCb%;g#EU2OJ=C3C5lAOvo-%*oE$=}L4+Yx46KfN)bLB+6{o61nfIkHq@75bLsE8p=EbO4U# zspg7wkv*NjjX@C0a1LS&_VlT?2sfM!GFHyj3;8lx1M7E0*x^;U$!-+KRUQix*!sO_ z#6SX3$Mp^LsH#-0Zxfy5(D&+IyoMh$@qKkBWJSL~(AE5gqOmQR9@M9Ybyi6H!;**} zk1_ORwJ4)w>gmxkIw{WQ>amcx3~=t@`iSvNmPx#i5T}wMZfs_KQumXH=>Ru1R2J_N z+9v%|cvNG;axppC41(XxY3)ktb3l3&LV^T|oBSjpb1R%C&z#sX(~9V+oPC}iezn73 zJVaqm?IHrbX$PsWKXYtG9NbR@7r4{t?#hV(tP;0piV$OR54aQS7#8s=tdWfj;->^^ zlXS!rt@PFS7)u$PW306n+b>kRoLA0zIkd%+#djOHTP>jAV+}U4OrqCYfhLHG=5xxn zNvp39%|zhHss)5>modI5{TOz$r=Tcg>{=co1Q>V>iqzPdPheoj$RI^ZTtIWxZkH3) z=gSnp{Hinr9F0iy~hnk=R?U6k82a2bQnNTY(|7}lmWm!86w zm$Du6cK7_*qFsuda=+hfv)$oEK$Viep#P!f{m_RW_t8sro4h{)XRihzt$BYwIm{9~ zr;v$<;8=zX6P5IETHgGu4oLRq$h%Sw-UKA&gE8-t55nvByC$1lD<}L6JfzZa5yMjt z$vNaU>de_r`YVg6BE~XgMc%ZRGDD3v$(kB-V8qz>?R9|~AybX#bNn?oW}paSTMVC$ z>}Xi(NmB;I}6#v24$+_MkQda9Bxqwa^6~C=Ad?xDW9ATbXaNr}~6QPer zd4Pe0mE0Poogqn_F16K84vTT)LOj^CjT^B48AV}o7cQE=PN9sT<4ZC|Kpk&*&#f)t zOU}4blWED!x`#`Hq?m0u)`pSC*-$Er$stQF#z0Fqt`e~#&#;FKLR4T#TX~K*k%pA! z;uPaLEW=64be&74IKZ_UfY4@yid^T4(o$=YJKL#j`j=Vm#&nOjah%-56TG-HAod~h zwyiN*mK=R)kr{>Px6mBe8FgkkmPQxua2&Hav}q4)FxbPi#ulMuS^{obylIp139(1Jn>f{^H71J`O%DzU! z`eK63;qe1OnLvk7JSDZ3x^e@GlQ}Pid7Z8bSyR(z)uVkBXUW-G z;rD>AO`6^@o3e48!g!Kl0Lto~(Lln^^N-fw8<&d@POBXbe6j_4Yu$=GCA{5Z-zL%y zW-Y_Hb9NciuCLLrvGc5GSbuo)Iq8$iEh5n9{W^Wx=mRdr`Pmx?cA?OrN3E8Eo)o#Y zvlaV@o-cASuGja=QmntkFl#ZS@d`>!!3GtvL9i1rwyEHtyKi{2pNhZL!js{TpZEDh zMI7Y0uZ2eRIxOo2g@zBj)gYuOP4#8DkynRM>*Lh?>J42?IW=Q;!p@l2AJFGN7dQTb zO949Ft_DnQr8i~o*()?X&+czljH7NNvzr=c9v5;JmVSlcF*0Z&xUCiU>kU(Y`44C%;kmqEJLg2?K8x6uCSMcZ=%){ z&$O*JliO6aR0-{@5`VKNY0#_u@qs5#4-WTaYxQqFqh%W+t9PSJGm@Est}PU?eenq$ z_{yH<{*ybM_osG5dbjDx1T@BG)`YobMaZhqxM0=0O642WKbusbBRzY27ZMCC9SaN$ z{l8P_c0X*5&77?4EdI%%k0p)&gMEXSP%p><7jRVCiMB==5}a0HnCH1&=G7KDxs2qm zsL~@*m48@$0tFX3L_=O`i0Tn!6u%jWdJH{VJ<(XOdpy1cv02E)7pgMoy{|YEk)x-m zR%^Fp0a?Tq2Niip*q_Z+V+vYnR(yclU6f1LYTBnTTCF26u_N~+8aXaD>S(kX8At@cy&^_sjMRqbv8uc)U$;Xl3y2F8%Na=hG4O*=nOdiKla#$PwwNSyd!&R(% z`(0NsVuhCe2YSlP5TE!EUzw~~ak0n<)L}5A00(d+Kr=sGm9UT3eG!tgCVPwp6-_&H6-4Aaxd?>6~ZkA zUIHCnCaBSoxt^|eRAvZ6Pahcskw}lgas%&Pwb&y(#)XH;S{Xwg3xheVz0HSmv zTyR%fXqIFN92!Ww=PRi9B51ERNm^l35B1mBz4fUZzP|!OXkB76wxQVWG;65Ntg=An z3JnIIQ{T)FlQT*e@H1=8`)$z2fz)-cdBoKfICwEEH5(^r66Y^>7yOgAqqrlu1a6%- zKZzgx8wb?8aakCz;RGGV=Nc;kpy9zFZeV1nML+Ys;!HSby^upJPy|mVxWDkMfkp+7 z!5XL3(>t2U+X15yUc-ljXWuoHSk<_D$9-|T{qWMDm%9Rh}i1UoVRi0yukN^Aw ziJ(vZO0jLS@H`c-LKPsA-3^B`cKGf#5QLNBnB6O3p_^ckl;pT=Y6=IOMuao5?-p(h zslK`5akUObq)kXJoV?TS9^j-ps&Ll?W@R}Z<+qk|ox}sSrQb0s&N6rFXuZCF4|KrU z?)m1TvoFT5TO?{|50dEa1>_jurZ-qlHz1fua027C5_j*qAy0tC*&@ z>a@y@?z36)7bY>&m4v^6DbZ%py}^4upL5vV*y0e6oD2aYZ0g2%@K)vz!52(%UtG`0(A{|mj&2yYaewIqrtcoK=O5|<| z$P=@YQ>~#Kb|P>@+d-wl>f`Scy#OkVRnupq$w1f=(CvnIStvm-=9Snks4-&J!5a2Q;GKcx3wEug|y_&6=kbel0UU&ZuVkK0Z5PguIVZio~ zXZjq-zuw=Szzw`mjZ4$I3C=|gziX`3GbP=$yL!{l#7V0@dQ@P3JKo9o1&_b>+qn06 zOS(a>?Dg(}$($ei`O4eDvqW=a7R#$Qz)v!PD31!^s1lIF_mQ=%(taN;hv#YUIn<`~ zjkfsO@0R?D%fP!O5l_`7^4HxPj1}rgJ6i)!6YIMsIOiO4MXiALN#BMqsIQ^ZNJpB;HaS~a{+nzz$%TH!}J&Q7?QDN191Jup)+QAzZ< zh;$k4jS-sdBHHo6zvW*ji6i+e%PaF{Uaub)Sgd1(kn5#sutu}pPIJ0J;{xEIV>fS~ z9~eq#9W@h(w0XF{Pk3#8*EW|OYyxlFvl`KyV7`~%GVm?9?QF5y3Zf0j!;!M^+P3tR z_Sa*9$&ioggOrjsP97#d$VX9kf6xBnthm#Kw4x_}%O|CJ0(+fVw(y+Y>)#-*?~VOD z*lj7j{TI|HKY`GUKK~P=Y6Pe=iW>*=eBad`{k;^;gMBAdIo^wcTSiuhbl;77uyxTLHcha(Da8DnjYa*Nfm{*)<{e>w9;)nj zB3`Vjx5i_&u9wYH&`%zG2(>QV{*>8^&1v;a_~cUVuZ9&n^8U?XY)}Vu69bFHdJKJ} zOL{n8{avt}v83II6yj&*QwJ%19Y1+s%t-AR2Cz%DhCho6*7oGRQrp4AI7RWga4$`4 z>&Cv)XYhSb?Q>dY0$|v3DG_W{wlhK*SX?ug#pO z<_gze7uX)cs4zRR z=D92+;N{8~k{n}z1o)_}N_G?6l>FK%p{Fxg55M3|9-cQcasL2{PRnt*CE)I@9S7FG z=-Qg$)spc21_;>158F(FkJA0dP%~I#rwJ3{#%ME~L3p{&ajat5wOD6p4$JT>yERyK zjl&wK{nX#?aiNmQcu`=+P%U#rVe6E=A~45LX>%Z)>2pLe>y*hp=@g+tFsgBb^}=DT z>;}T0+Xlj`*nHemqCx*#S9S&XH;D^msSghQokNdv}|7nv-&CrfY?N`9RY<T=3^ z3aRAn)^L&;pu)3n^`F+a@;@Kb>0xsf{JMa}L$SEPR^p=fd(;$5n?S?S}j z8fuWZUL|Dl9P_9nQPxWdM#aIfS|wxRe}dGY#GiRE1UeusWSF4$7s z9p1e#GA15PM8=YWuzIt4HIb^aL{d$t!zSzEfBW)3T^Uf9`=PdqFdyMpfUn7jSNt)L zRNE~nkSuYWk4v%Etyh!c04j7!@>T?S($LC^{{QIzkUxayRn}|8ID3}9MwTYx|D(wK zLx(ES{D=Lh|58+;!>an<#DA<%vHb_}Ut0ci2{T?sJ%m6T#EGxO&YbgOM-vyNc47Ye zD4dW4i-P-;IOODcafevO{`T#{VI^h5i)Hqb5SAdwUbvFK0&~yaZ{E*(uuBe*p^-#MSuka?H2%U(PSuFA#*hag`Ef+FyEew6aL{ z!up;I72*MGK1(7y#ocEHMJ;6;L0;Yt!gbwa`tsA)Tbr!UdH^bn=ILHQb}+T|=i#gj ztOMs~UKYoj-D1zPFa}lpEEL5$LyxzfLN_HN56baWLCM{uts#3Pg8&eL{T*+!&S zx5TvQ?&0~Q{lP8o&sQ~iji6<)RH$}Mbm`tUsv6Z zk32n>_gX}RW1e$#coSaz;J9SOv+p<6*O6F`GhA&6!KVrG?`0_!k{TTqnx>JFu?6$H?p>dIa zNR5E!cTJDemtgryLvBv!pAQw`hTLyV0V2h__hu3LcZZuE<#`54B}_+I9d<9{5%g2- zBkxaYZ>#$TzH(N*Z;+tPqju#__vC;%rNJa?6DjZ@Mb&wAxfuUP|5%V^rhE{O=}6E+ z!9`3}-SAR;<>XzqMc835E?l z7qapvSMj~cc;c6QC3>#}h93{U-&T<6Nelcc@1?lc8Ou1oeMLZ#{G+7@&kR9m$3+bo z*hIy?rbWNtVf|eE!YUMQITRPa@T4oiF~7J@^mAzveLKsm84@2gkTsgW`FCRd@c8h-(Nyr~MoxnFF z15zQ1X}0S~;8qqZ*&iNxQ`8T)jBC33`89gk7nB#3inL3{_iiToEew1ycI1H6Smn-4 z!N)QBrr$=hrayFG`aOU`$K%!UrUotNGOytiX)c39EpzjP`xz~Q)5_Y!H6hLWm9|4| zrv1wgh;!nSxPm5#h>Wb@#Qxp41sR=2?YM>`x#V`BY&xT5m0f(d}{ub%Qr4V z_^a5}YuHhp=@@S(M8kBl8r=iA?Zz(%SFK}v_N#6=i#w=D)m+Y%}&Fj9+)*R4Xy4NBpry!r* zGXLy%(bVWM?6Wf~Witerq}3Sv;1zW~_siBBkO#;BH7{-+2C1)PqVfk=RHFM{ zqj2XpH3yo4acTjAwGA*Rd#c3Wuu?MQ)xXcFFL=wG&op8@6@t<@V(SPx%-pgckJlM7 zd9C1rfhT z)il(dI?$IEE&E18Syo%Db5(+E9T}NXE{m#%y|c%6magpt9^o&Ed=XtI?#$lo^8JB| z0|C7Yxwic2ks)-{`XVXnEwuScQ}kA2d4t3gxb@V3@h#Lq9p_aK)V$K)XR6xT*+}-- zkxfd8fT%ea6u@G!j=wowFZ!;ZSD;IFSXZEu{tf%jv1789?ydL~z`rm{n&2}>Td}r( zM+*RtM*bmXsrZswX`s#f{I#1b;P(S0-MDi2R|R5m&7O!bh6d_rW@_y8Q-Axzk1x%r6?AI0Rt4>6M(1@FyWD*xUt(?T-hBrZvvd66m1gxNKRHJEbSxP} z4WF+^)f$R@BwNRN@^M1I(Xv}MMr_D zTSH4rjS#w+4!e6pA%TlpFhx~<=iksC+>k(58Q&-VU5m|vBbtIPDw*=skGo&iT05DT zp)e&@OvDqt72D#{&!2?!94t8b>=k!67)``HP0U$n;%X?zBmC9^WCeaxk3$ZelN=Z+ zaklr*G!M%Ot!?8;aN^*pk?UomBOFML>GZX9CHL9eQeT5Oxgj3c1jV9$Z)%9r)tL$h zlptnTLBKS0UU{Dla|}=PmnRmgFt37rzcoLEUM&_T6#nAX5kJ;XPnk~*=VFUmCP>^B z+t-jQb&Wdl(;ZXNm6R!VrjHdk@wXykLj7$6{5ts5Yl^fU)6$p=ofI;Gq&mlxe=*|% zMfANr0(`-{hc~XPbH2Cql-~{dzP?;Hu(8wbS5qvk&ttz@<*g@xdr9Jb9hKZN z5CW&9q2W%M7C}l95|eZ8#4+tr)EF6hl~zaOS2~94h{x(QL7-v3=YIg<*`fqwFs+;=EC+(EX^AK zBB@y~{OgsTF9WA`cHnsE?tT91Nk3`{xd2yQNQ;Mt>2I8hP@kH6*w4@*8hfkesACQF zwGJ|*B77En}^{D^56tFxoMW4r8Y9yS;o3XCh=pawo6+{Aa3Qoy$nk z&#M(Q3^w9_zl818#NK&yxHYdMxnr&k>wIaQ7+pSJHx3Sq&)?;i*)^>p1=*yV2e!?h zA|0RzH*anMJ09`_EHc){FX2nzIw8SAbx+X$q+_8DG%vT8Yau16z=2V_Rc6#SoFgIF zw!!%IE=I&uk?riE99u*!nF)saxL>N2B6bl?2Ac_(^e(Gkf28^#S&*P}eQ9TmLIqvh zjV6Z2VQtk_Mt*NR%Xx9S55^U zzW^$TNCKb4E68*p@efeQf^XOEd(lm1kg%WHix5zn(*e&$mWInE^=dG)aw=lrS*Ga! zlwXvMqCqxvDOR92N9@&YQCY-K_br8ZuZHs7CbZp}Y`95-o%kfg@`nK~?Nh7NNLSTjg`*wipmK{8Qk$`tcspKoH@Lw} z3M0@+@#henE_KvEk^fZzuV=Sti+vH-Ysiurlj|I_ z%=q5k$b|iCcZ=Gsk8;$(sD(J>TMDu0zJ187d&YJF+v*Fb`vNt-FvFX6@%gRx7k3Pf zwa!r+XqoJy=%*!&Z^;kFxx@5M$X*EnMWIsN3LGk*UAPP%7uZ_^N&ph{M>+=n-zQi= z2{K9$CrF?J879JiT6()ax@@ix{;MeW@h1N7D-{*QEK5oFuSUrahLaD5e-6I<`wGJV zIml8F{?mIl|G^OU!GQX2>c_|n8&oMvPWXQsdHz2PzIdQ>S$>EBLJ++i4dK80ApWPN zND7dh90lS3EFk{dSQIU&T#l0Pe^wR#&5*+YI+x>vKoSIj3t@wVW_!tYstF@1s=6^bl3ObRefF_at$Nv8UvFLe# diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt new file mode 100644 index 00000000..2860c30b --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -0,0 +1,30 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: italic + item-2 at level 1: paragraph: bold + item-3 at level 1: paragraph: underline + item-4 at level 1: paragraph: hyperlink + item-5 at level 1: paragraph: italic and bold hyperlink + item-6 at level 1: inline: group group + item-7 at level 2: paragraph: Normal + item-8 at level 2: paragraph: italic + item-9 at level 2: paragraph: bold + item-10 at level 2: paragraph: underline + item-11 at level 2: paragraph: and + item-12 at level 2: paragraph: hyperlink + item-13 at level 2: paragraph: on the same line + item-14 at level 1: paragraph: + item-15 at level 1: list: group list + item-16 at level 2: list_item: Italic bullet 1 + item-17 at level 2: list_item: Bold bullet 2 + item-18 at level 2: list_item: Underline bullet 3 + item-19 at level 2: inline: group group + item-20 at level 3: list_item: Some + item-21 at level 3: list_item: italic + item-22 at level 3: list_item: bold + item-23 at level 3: list_item: underline + item-24 at level 2: list: group list + item-25 at level 3: inline: group group + item-26 at level 4: list_item: Nested + item-27 at level 4: list_item: italic + item-28 at level 4: list_item: bold + item-29 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json new file mode 100644 index 00000000..9ad75e89 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -0,0 +1,577 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "unit_test_formatting", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 16380079676357958448, + "filename": "unit_test_formatting.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/23" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "hyperlink", + "text": "hyperlink", + "hyperlink": "https:/github.com/DS4SD/docling" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "italic and bold hyperlink", + "text": "italic and bold hyperlink", + "formatting": { + "bold": true, + "italic": true, + "underline": false, + "strikethrough": false + }, + "hyperlink": "https:/github.com/DS4SD/docling" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Normal", + "text": "Normal" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "and", + "text": "and" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "hyperlink", + "text": "hyperlink", + "hyperlink": "https:/github.com/DS4SD/docling" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "on the same line", + "text": "on the same line" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Italic bullet 1", + "text": "Italic bullet 1", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Bold bullet 2", + "text": "Bold bullet 2", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Underline bullet 3", + "text": "Underline bullet 3", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Some", + "text": "Some", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Nested", + "text": "Nested", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md new file mode 100644 index 00000000..918e89e2 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md @@ -0,0 +1,17 @@ +*italic* + +**bold** + +underline + +[hyperlink](https:/github.com/DS4SD/docling) + +[***italic and bold hyperlink***](https:/github.com/DS4SD/docling) + +Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docling) on the same line + +- *Italic bullet 1* +- **Bold bullet 2** +- Underline bullet 3 +- Some *italic* **bold** underline + - Nested *italic* **bold** \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f9843c78..5c43ccf4 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -76,17 +76,19 @@ def test_e2e_docx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + assert verify_export( + pred_md, str(gt_path) + ".md", generate=GENERATE + ), "export to md" pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export( - pred_itxt, str(gt_path) + ".itxt" + pred_itxt, str(gt_path) + ".itxt", generate=GENERATE ), "export to indented-text" assert verify_document( - doc, str(gt_path) + ".json", GENERATE + doc, str(gt_path) + ".json", generate=GENERATE ), "document document" if docx_path.name == "word_tables.docx":