diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index a782f4b1..56c025fb 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -3,7 +3,7 @@ import re from copy import deepcopy from io import BytesIO from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Final, Optional, Union from docling_core.types.doc import ( DocItemLabel, @@ -17,9 +17,9 @@ from docling_core.types.doc import ( RichTableCell, TableCell, TableData, - TextItem, + TableItem, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, Script from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -36,7 +36,6 @@ from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.docx.drawingml.utils import ( get_docx_to_pdf_converter, - get_libreoffice_cmd, get_pil_from_dml_docx, ) from docling.backend.docx.latex.omml import oMath2Latex @@ -47,6 +46,18 @@ _log = logging.getLogger(__name__) class MsWordDocumentBackend(DeclarativeDocumentBackend): + _BLIP_NAMESPACES: Final = { + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", + "v": "urn:schemas-microsoft-com:vml", + "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", + "w10": "urn:schemas-microsoft-com:office:word", + "a14": "http://schemas.microsoft.com/office/drawing/2010/main", + } + @override def __init__( self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] @@ -58,6 +69,9 @@ class 
MsWordDocumentBackend(DeclarativeDocumentBackend): self.xml_namespaces = { "w": "http://schemas.microsoft.com/office/word/2003/wordml" } + self.blip_xpath_expr = etree.XPath( + ".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) # self.initialise(path_or_stream) # Word file: self.path_or_stream: Union[BytesIO, Path] = path_or_stream @@ -133,8 +147,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None - doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) - # doc, _ = doc_info + doc, _ = self._walk_linear(self.docx_obj.element.body, doc) + return doc else: raise RuntimeError( @@ -192,7 +206,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _walk_linear( self, body: BaseOxmlElement, - docx_obj: DocxDocument, doc: DoclingDocument, # parent: ) -> tuple[DoclingDocument, list[RefItem]]: @@ -200,20 +213,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for element in body: tag_name = etree.QName(element).localname # Check for Inline Images (blip elements) - namespaces = { - "a": "http://schemas.openxmlformats.org/drawingml/2006/main", - "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", - "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", - "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", - "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", - "v": "urn:schemas-microsoft-com:vml", - "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape", - "w10": "urn:schemas-microsoft-com:office:word", - "a14": "http://schemas.microsoft.com/office/drawing/2010/main", - } - xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces) - drawing_blip = xpath_expr(element) - drawingml_els = element.findall(".//w:drawing", namespaces=namespaces) + drawing_blip = self.blip_xpath_expr(element) 
+ drawingml_els = element.findall( + ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) # Check for textbox content - check multiple textbox formats # Only process if the element hasn't been processed before @@ -221,7 +224,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if element_id not in self.processed_textbox_elements: # Modern Word textboxes txbx_xpath = etree.XPath( - ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces + ".//w:txbxContent|.//v:textbox//w:p", + namespaces=MsWordDocumentBackend._BLIP_NAMESPACES, ) textbox_elements = txbx_xpath(element) @@ -230,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Additional checks for textboxes in DrawingML and VML formats alt_txbx_xpath = etree.XPath( ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t", - namespaces=namespaces, + namespaces=MsWordDocumentBackend._BLIP_NAMESPACES, ) textbox_elements = alt_txbx_xpath(element) @@ -238,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if not textbox_elements: shape_text_xpath = etree.XPath( ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t", - namespaces=namespaces, + namespaces=MsWordDocumentBackend._BLIP_NAMESPACES, ) shape_text_elements = shape_text_xpath(element) if shape_text_elements: @@ -272,26 +276,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): _log.debug( f"Found textbox content with {len(textbox_elements)} elements" ) - tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc) + tbc = self._handle_textbox_content(textbox_elements, doc) added_elements.extend(tbc) # Check for Tables - if element.tag.endswith("tbl"): + if tag_name == "tbl": try: - t = self._handle_tables(element, docx_obj, doc) + t = self._handle_tables(element, doc) added_elements.extend(t) except Exception: _log.debug("could not parse a table, broken docx table") # Check for Image elif drawing_blip: - pics = self._handle_pictures(docx_obj, drawing_blip, doc) + pics = 
self._handle_pictures(drawing_blip, doc) added_elements.extend(pics) # Check for Text after the Image if ( - tag_name in ["p"] - and element.find(".//w:t", namespaces=namespaces) is not None + tag_name == "p" + and element.find( + ".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) + is not None ): - te1 = self._handle_text_elements(element, docx_obj, doc) + te1 = self._handle_text_elements(element, doc) added_elements.extend(te1) # Check for DrawingML elements elif drawingml_els: @@ -314,18 +321,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: self._handle_drawingml(doc=doc, drawingml_els=drawingml_els) # Check for the sdt containers, like table of contents - elif tag_name in ["sdt"]: - sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) + elif tag_name == "sdt": + sdt_content = element.find( + ".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) if sdt_content is not None: # Iterate paragraphs, runs, or text inside . 
- paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) + paragraphs = sdt_content.findall( + ".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ) for p in paragraphs: - te = self._handle_text_elements(p, docx_obj, doc) + te = self._handle_text_elements(p, doc) added_elements.extend(te) # Check for Text - elif tag_name in ["p"]: + elif tag_name == "p": # "tcPr", "sectPr" - te = self._handle_text_elements(element, docx_obj, doc) + te = self._handle_text_elements(element, doc) added_elements.extend(te) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") @@ -384,16 +395,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for key in keys_to_reset: self.list_counters[key] = 0 - def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool: + def _is_numbered_list(self, numId: int, ilvl: int) -> bool: """Check if a list is numbered based on its numFmt value.""" try: # Access the numbering part of the document - if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"): + if not hasattr(self.docx_obj, "part") or not hasattr( + self.docx_obj.part, "package" + ): return False numbering_part = None # Find the numbering part - for part in docx_obj.part.package.parts: + for part in self.docx_obj.part.package.parts: if "numbering" in part.partname: numbering_part = part break @@ -523,15 +536,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _get_format_from_run(cls, run: Run) -> Optional[Formatting]: # The .bold and .italic properties are booleans, but .underline can be an enum # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean - has_bold = run.bold or False - has_italic = run.italic or False + is_bold = run.bold or False + is_italic = run.italic or False + is_strikethrough = run.font.strike or False # Convert any non-None underline value to True - has_underline = bool(run.underline is not None and run.underline) + is_underline = bool(run.underline is 
not None and run.underline) + is_sub = run.font.subscript or False + is_sup = run.font.superscript or False + script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE return Formatting( - bold=has_bold, - italic=has_italic, - underline=has_underline, + bold=is_bold, + italic=is_italic, + underline=is_underline, + strikethrough=is_strikethrough, + script=script, ) def _get_paragraph_elements(self, paragraph: Paragraph): @@ -724,7 +743,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _handle_textbox_content( self, textbox_elements: list, - docx_obj: DocxDocument, doc: DoclingDocument, ) -> list[RefItem]: elem_ref: list[RefItem] = [] @@ -766,7 +784,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Process all the paragraphs for p, position in all_paragraphs: # Create paragraph object to get text content - paragraph = Paragraph(p, docx_obj) + paragraph = Paragraph(p, self.docx_obj) text_content = paragraph.text # Create a unique identifier based on content and position @@ -782,7 +800,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Mark this paragraph as processed processed_paragraphs.add(paragraph_id) - elem_ref.extend(self._handle_text_elements(p, docx_obj, doc)) + elem_ref.extend(self._handle_text_elements(p, doc)) # Restore original parent self.parents[level] = original_parent @@ -854,11 +872,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _handle_text_elements( self, element: BaseOxmlElement, - docx_obj: DocxDocument, doc: DoclingDocument, ) -> list[RefItem]: elem_ref: list[RefItem] = [] - paragraph = Paragraph(element, docx_obj) + paragraph = Paragraph(element, self.docx_obj) paragraph_elements = self._get_paragraph_elements(paragraph) text, equations = self._handle_equations_in_text( element=element, text=paragraph.text @@ -884,7 +901,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): and p_style_id not in ["Title", "Heading"] ): # Check if this is actually a 
numbered list by examining the numFmt - is_numbered = self._is_numbered_list(docx_obj, numid, ilevel) + is_numbered = self._is_numbered_list(numid, ilevel) li = self._add_list_item( doc=doc, @@ -1239,14 +1256,35 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return elem_ref + @staticmethod + def _group_cell_elements( + group_name: str, + doc: DoclingDocument, + provs_in_cell: list[RefItem], + docling_table: TableItem, + ) -> RefItem: + group_element = doc.add_group( + label=GroupLabel.UNSPECIFIED, + name=group_name, + parent=docling_table, + ) + for prov in provs_in_cell: + group_element.children.append(prov) + pr_item = prov.resolve(doc) + item_parent = pr_item.parent.resolve(doc) + if pr_item.get_ref() in item_parent.children: + item_parent.children.remove(pr_item.get_ref()) + pr_item.parent = group_element.get_ref() + ref_for_rich_cell = group_element.get_ref() + return ref_for_rich_cell + def _handle_tables( self, element: BaseOxmlElement, - docx_obj: DocxDocument, doc: DoclingDocument, ) -> list[RefItem]: elem_ref: list[RefItem] = [] - table: Table = Table(element, docx_obj) + table: Table = Table(element, self.docx_obj) num_rows = len(table.rows) num_cols = len(table.columns) _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns") @@ -1255,7 +1293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): cell_element = table.rows[0].cells[0] # In case we have a table of only 1 cell, we consider it furniture # And proceed processing the content of the cell as though it's in the document body - self._walk_linear(cell_element._element, docx_obj, doc) + self._walk_linear(cell_element._element, doc) return elem_ref data = TableData(num_rows=num_rows, num_cols=num_cols) @@ -1300,52 +1338,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text = text.replace("", "$").replace("", "$") provs_in_cell: list[RefItem] = [] - _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc) - ref_for_rich_cell = 
provs_in_cell[0] - rich_table_cell = False + rich_table_cell: bool = self._is_rich_table_cell(cell) - def group_cell_elements( - group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem] - ) -> RefItem: - group_element = doc.add_group( - label=GroupLabel.UNSPECIFIED, - name=group_name, - parent=docling_table, - ) - for prov in provs_in_cell: - group_element.children.append(prov) - pr_item = prov.resolve(doc) - item_parent = pr_item.parent.resolve(doc) - if pr_item.get_ref() in item_parent.children: - item_parent.children.remove(pr_item.get_ref()) - pr_item.parent = group_element.get_ref() - ref_for_rich_cell = group_element.get_ref() - return ref_for_rich_cell + if rich_table_cell: + _, provs_in_cell = self._walk_linear(cell._element, doc) + _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}") - if len(provs_in_cell) > 1: + if len(provs_in_cell) > 0: # Cell has multiple elements, we need to group them rich_table_cell = True group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}" - ref_for_rich_cell = group_cell_elements( - group_name, doc, provs_in_cell + ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements( + group_name, doc, provs_in_cell, docling_table ) - elif len(provs_in_cell) == 1: - item_ref = provs_in_cell[0] - pr_item = item_ref.resolve(doc) - if isinstance(pr_item, TextItem): - # Cell has only one element and it's just a text - rich_table_cell = False - doc.delete_items(node_items=[pr_item]) - else: - rich_table_cell = True - group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}" - ref_for_rich_cell = group_cell_elements( - group_name, doc, provs_in_cell - ) - else: - rich_table_cell = False - if rich_table_cell: rich_cell = RichTableCell( text=text, @@ -1377,17 +1383,79 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): col_idx += cell.grid_span return elem_ref + def _is_rich_table_cell(self, cell: _Cell) -> bool: + """Determine 
whether a docx cell should be parsed as a Docling RichTableCell. + + A docx cell can hold rich content and be parsed with a Docling RichTableCell. + However, this requires walking through the lxml elements and creating + node items. If the cell holds only plain text, a TableCell, the parsing + is simpler and using a TableCell is preferred. + + Plain text means: + - The cell has only one paragraph + - The paragraph consists solely of runs with no run properties + (no need of Docling formatting). + - No other block-level elements are present inside the cell element. + + Args: + cell: A docx cell + + Returns: + Whether the docx cell should be parsed as RichTableCell + """ + tc = cell._tc + + # must contain only one paragraph + paragraphs = list( + tc.iterchildren( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p" + ) + ) + if len(paragraphs) > 1: + return True + + # no other content + allowed_tags = {"p", "tcPr"} # paragraph or table-cell properties + for child in tc: + tag = child.tag.split("}")[-1] + if tag not in allowed_tags: + return True + for elem in tc: + if self.blip_xpath_expr(elem): + return True + if elem.findall( + ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES + ): + return True + + # paragraph must contain runs with no run-properties + for para in paragraphs: + runs = list( + para.iterchildren( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r" + ) + ) + for rn in runs: + item: Run = Run(rn, self.docx_obj) + if item is not None: + fm = MsWordDocumentBackend._get_format_from_run(item) + if fm != Formatting(): + return True + + # All checks passed: plain text only + return False + def _handle_pictures( - self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument + self, drawing_blip: Any, doc: DoclingDocument ) -> list[RefItem]: def get_docx_image(drawing_blip: Any) -> Optional[bytes]: image_data: Optional[bytes] = None rId = drawing_blip[0].get( 
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" ) - if rId in docx_obj.part.rels: + if rId in self.docx_obj.part.rels: # Access the image part using the relationship ID - image_part = docx_obj.part.rels[rId].target_part + image_part = self.docx_obj.part.rels[rId].target_part image_data = image_part.blob # Get the binary image data return image_data diff --git a/tests/data/docx/docx_rich_cells.docx b/tests/data/docx/docx_rich_cells.docx new file mode 100644 index 00000000..a70febb1 Binary files /dev/null and b/tests/data/docx/docx_rich_cells.docx differ diff --git a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt new file mode 100644 index 00000000..ef3ed32e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.itxt @@ -0,0 +1,107 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section: group header-1 + item-3 at level 3: section_header: Table with rich cells + item-4 at level 4: table with [4x2] + item-5 at level 5: unspecified: group rich_cell_group_1_0_1 + item-6 at level 6: text: This is a list: + item-7 at level 6: list: group list + item-8 at level 7: list_item: A First + item-9 at level 7: list_item: A Second + item-10 at level 7: list_item: A Third + item-11 at level 5: unspecified: group rich_cell_group_1_1_1 + item-12 at level 6: text: This is a formatted list: + item-13 at level 6: list: group list + item-14 at level 7: list_item: + item-15 at level 8: inline: group group + item-16 at level 9: text: B + item-17 at level 9: text: First + item-18 at level 7: list_item: + item-19 at level 8: inline: group group + item-20 at level 9: text: B + item-21 at level 9: text: Second + item-22 at level 7: list_item: + item-23 at level 8: inline: group group + item-24 at level 9: text: B + item-25 at level 9: text: Third + item-26 at level 5: unspecified: group 
rich_cell_group_1_0_2 + item-27 at level 6: text: First Paragraph + +Second Paragraph + item-28 at level 6: text: Third paragraph before a numbered list + item-29 at level 6: list: group list + item-30 at level 7: list_item: Number one + item-31 at level 7: list_item: Number two + item-32 at level 7: list_item: Number three + item-33 at level 5: unspecified: group rich_cell_group_1_1_2 + item-34 at level 6: text: This is simple text with + item-35 at level 6: text: bold + item-36 at level 6: text: , + item-37 at level 6: text: strikethrough + item-38 at level 6: text: and + item-39 at level 6: text: italic + item-40 at level 6: text: formatting with x + item-41 at level 6: text: 2 + item-42 at level 6: text: and H + item-43 at level 6: text: 2 + item-44 at level 6: text: O + item-45 at level 5: unspecified: group rich_cell_group_1_0_3 + item-46 at level 6: text: This is a paragraph + item-47 at level 6: text: This is another paragraph + item-48 at level 4: inline: group group + item-49 at level 4: text: + item-50 at level 4: text: + item-51 at level 4: text: + item-52 at level 4: text: + item-53 at level 4: text: + item-54 at level 4: text: + item-55 at level 3: section_header: Table with nested table + item-56 at level 4: text: Before table + item-57 at level 4: table with [3x2] + item-58 at level 5: unspecified: group rich_cell_group_2_1_1 + item-59 at level 6: text: Simple cell with + item-60 at level 6: text: bold + item-61 at level 6: text: and + item-62 at level 6: text: italic + item-63 at level 6: text: text + item-64 at level 5: unspecified: group rich_cell_group_3_0_2 + item-65 at level 6: table with [2x3] + item-66 at level 7: unspecified: group rich_cell_group_3_0_1 + item-67 at level 8: text: Cell 1 + item-68 at level 7: unspecified: group rich_cell_group_3_1_1 + item-69 at level 8: text: Cell 2 + item-70 at level 7: unspecified: group rich_cell_group_3_2_1 + item-71 at level 8: text: Cell 3 + item-72 at level 6: text: + item-73 at level 5: 
unspecified: group rich_cell_group_4_1_2 + item-74 at level 6: text: Rich cell +A nested table + item-75 at level 6: table with [2x3] + item-76 at level 7: unspecified: group rich_cell_group_4_0_1 + item-77 at level 8: text: Cell 1 + item-78 at level 7: unspecified: group rich_cell_group_4_1_1 + item-79 at level 8: text: Cell 2 + item-80 at level 7: unspecified: group rich_cell_group_4_2_1 + item-81 at level 8: text: Cell 3 + item-82 at level 6: text: + item-83 at level 4: inline: group group + item-84 at level 4: inline: group group + item-85 at level 5: text: After table with + item-86 at level 5: text: bold + item-87 at level 5: text: , + item-88 at level 5: text: underline + item-89 at level 5: text: , + item-90 at level 5: text: strikethrough + item-91 at level 5: text: , and + item-92 at level 5: text: italic + item-93 at level 5: text: formatting + item-94 at level 4: text: + item-95 at level 3: section_header: Table with pictures + item-96 at level 4: text: + item-97 at level 4: table with [3x2] + item-98 at level 5: unspecified: group rich_cell_group_5_1_1 + item-99 at level 6: picture + item-100 at level 5: unspecified: group rich_cell_group_5_0_2 + item-101 at level 6: text: Text and picture + item-102 at level 6: picture + item-103 at level 4: text: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json new file mode 100644 index 00000000..424f8e89 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.json @@ -0,0 +1,2928 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.7.0", + "name": "docx_rich_cells", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 11736721488276662941, + "filename": "docx_rich_cells.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": 
"unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/65" + } + ], + "content_layer": "body", + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/2" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_0_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/8" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/texts/6" + }, + "children": [ + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/texts/9" + }, + "children": [ + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/texts/12" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { 
+ "$ref": "#/texts/14" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_1_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/groups/10" + }, + "children": [ + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/groups/9" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_0_2", + "label": "unspecified" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/12", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_1_2", + "label": "unspecified" + }, + { + "self_ref": "#/groups/13", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_0_3", + "label": "unspecified" + }, + { + "self_ref": "#/groups/14", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "content_layer": "body", + "name": "group", + "label": 
"inline" + }, + { + "self_ref": "#/groups/15", + "parent": { + "$ref": "#/tables/1" + }, + "children": [ + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + } + ], + "content_layer": "body", + "name": "rich_cell_group_2_1_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/16", + "parent": { + "$ref": "#/tables/2" + }, + "children": [ + { + "$ref": "#/texts/46" + } + ], + "content_layer": "body", + "name": "rich_cell_group_3_0_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/17", + "parent": { + "$ref": "#/tables/2" + }, + "children": [ + { + "$ref": "#/texts/47" + } + ], + "content_layer": "body", + "name": "rich_cell_group_3_1_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/18", + "parent": { + "$ref": "#/tables/2" + }, + "children": [ + { + "$ref": "#/texts/48" + } + ], + "content_layer": "body", + "name": "rich_cell_group_3_2_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/19", + "parent": { + "$ref": "#/tables/1" + }, + "children": [ + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/texts/49" + } + ], + "content_layer": "body", + "name": "rich_cell_group_3_0_2", + "label": "unspecified" + }, + { + "self_ref": "#/groups/20", + "parent": { + "$ref": "#/tables/3" + }, + "children": [ + { + "$ref": "#/texts/51" + } + ], + "content_layer": "body", + "name": "rich_cell_group_4_0_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/21", + "parent": { + "$ref": "#/tables/3" + }, + "children": [ + { + "$ref": "#/texts/52" + } + ], + "content_layer": "body", + "name": "rich_cell_group_4_1_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/22", + "parent": { + "$ref": "#/tables/3" + }, + "children": [ + { + "$ref": "#/texts/53" + } + ], + "content_layer": "body", + "name": "rich_cell_group_4_2_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/23", + "parent": { + "$ref": "#/tables/1" + 
}, + "children": [ + { + "$ref": "#/texts/50" + }, + { + "$ref": "#/tables/3" + }, + { + "$ref": "#/texts/54" + } + ], + "content_layer": "body", + "name": "rich_cell_group_4_1_2", + "label": "unspecified" + }, + { + "self_ref": "#/groups/24", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/55" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/texts/57" + }, + { + "$ref": "#/texts/58" + }, + { + "$ref": "#/texts/59" + }, + { + "$ref": "#/texts/60" + }, + { + "$ref": "#/texts/61" + }, + { + "$ref": "#/texts/62" + }, + { + "$ref": "#/texts/63" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/25", + "parent": { + "$ref": "#/tables/4" + }, + "children": [ + { + "$ref": "#/pictures/0" + } + ], + "content_layer": "body", + "name": "rich_cell_group_5_1_1", + "label": "unspecified" + }, + { + "self_ref": "#/groups/26", + "parent": { + "$ref": "#/tables/4" + }, + "children": [ + { + "$ref": "#/texts/67" + }, + { + "$ref": "#/pictures/1" + } + ], + "content_layer": "body", + "name": "rich_cell_group_5_0_2", + "label": "unspecified" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/groups/11" + }, + { + "$ref": "#/texts/33" + }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Table with rich cells", + "text": "Table with rich cells", + "level": 2 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "This is a list:", + "text": "This is a list:", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + 
} + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "A First", + "text": "A First", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "A Second", + "text": "A Second", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "A Third", + "text": "A Third", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "This is a formatted list:", + "text": "This is a formatted list:", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/groups/5" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "B", + "text": "B", + "formatting": { + "bold": false, + "italic": false, + "underline": 
false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "First", + "text": "First", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/groups/6" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "B", + "text": "B", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Second", + "text": "Second", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/groups/7" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "B", + "text": "B", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "content_layer": "body", + "label": 
"text", + "prov": [], + "orig": "Third", + "text": "Third", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "First Paragraph\n\nSecond Paragraph", + "text": "First Paragraph\n\nSecond Paragraph", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Third paragraph before a numbered list", + "text": "Third paragraph before a numbered list", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Number one", + "text": "Number one", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": true, + "marker": "1." + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Number two", + "text": "Number two", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": true, + "marker": "2." 
+ }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Number three", + "text": "Number three", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": true, + "marker": "3." + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "This is simple text with", + "text": "This is simple text with", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ",", + "text": ",", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "strikethrough", + "text": "strikethrough", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": true, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "and", + "text": "and", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + 
"strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "formatting with x", + "text": "formatting with x", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "2", + "text": "2", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "super" + } + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "and H", + "text": "and H", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "2", + "text": "2", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "sub" + } + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "O", + "text": "O", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, 
+ { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "This is a paragraph", + "text": "This is a paragraph", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "This is another paragraph", + "text": "This is another paragraph", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/groups/14" + 
}, + { + "$ref": "#/groups/24" + }, + { + "$ref": "#/texts/64" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Table with nested table", + "text": "Table with nested table", + "level": 2 + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Before table", + "text": "Before table", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Simple cell with", + "text": "Simple cell with", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "and", + "text": "and", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": 
"body", + "label": "text", + "prov": [], + "orig": "text", + "text": "text", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Cell 1", + "text": "Cell 1", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/groups/17" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Cell 2", + "text": "Cell 2", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/48", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Cell 3", + "text": "Cell 3", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/49", + "parent": { + "$ref": "#/groups/19" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/50", + "parent": { + "$ref": "#/groups/23" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Rich cell\nA nested table", + "text": "Rich cell\nA nested table", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/51", + "parent": { + "$ref": "#/groups/20" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Cell 1", + "text": "Cell 1", + "formatting": { + "bold": false, + "italic": false, + 
"underline": false, + "strikethrough": true, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/52", + "parent": { + "$ref": "#/groups/21" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Cell 2", + "text": "Cell 2", + "formatting": { + "bold": true, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/groups/22" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Cell 3", + "text": "Cell 3", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/groups/23" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "After table with", + "text": "After table with", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ",", + "text": ",", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/58", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + 
"content_layer": "body", + "label": "text", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/59", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ",", + "text": ",", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/60", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "strikethrough", + "text": "strikethrough", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": true, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/61", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": ", and", + "text": ", and", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/62", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/groups/24" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "formatting", + "text": "formatting", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "content_layer": "body", 
+ "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/66" + }, + { + "$ref": "#/tables/4" + }, + { + "$ref": "#/texts/68" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Table with pictures", + "text": "Table with pictures", + "level": 2 + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/texts/65" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/67", + "parent": { + "$ref": "#/groups/26" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Text and picture", + "text": "Text and picture", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/68", + "parent": { + "$ref": "#/texts/65" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/groups/25" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 48.0, + "height": 48.0 + }, + "uri": "" + }, + "annotations": [] + }, + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/groups/26" + }, + "children": [], + "content_layer": "body", + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 48.0, + "height": 48.0 + }, + "uri": "" + }, + "annotations": [] + } + ], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/groups/3" 
+ }, + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/groups/10" + }, + { + "$ref": "#/groups/12" + }, + { + "$ref": "#/groups/13" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": " Column A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Column B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "This is a list:\nA First\nA Second\nA Third ", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/3" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "This is a formatted list:\nB First\nB Second\nB Third", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/8" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "First Paragraph\n\nSecond Paragraph\n\nThird paragraph before a numbered list\nNumber one\nNumber two\nNumber three", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/10" + } + }, + { + "row_span": 1, + "col_span": 1, + 
"start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "This is simple text with bold, strikethrough and italic formatting with x2 and H2O", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/12" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "This is a paragraph\nThis is another paragraph ", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/13" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 4, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": " Column A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Column B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "This is a list:\nA First\nA Second\nA Third ", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + 
"end_col_offset_idx": 2, + "text": "This is a formatted list:\nB First\nB Second\nB Third", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "First Paragraph\n\nSecond Paragraph\n\nThird paragraph before a numbered list\nNumber one\nNumber two\nNumber three", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "This is simple text with bold, strikethrough and italic formatting with x2 and H2O", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "This is a paragraph\nThis is another paragraph ", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/groups/15" + }, + { + "$ref": "#/groups/19" + }, + { + "$ref": "#/groups/23" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, 
+ "end_col_offset_idx": 1, + "text": " Column A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Column B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Simple cell upper left", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Simple cell with bold and italic text", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/15" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/19" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Rich cell\nA nested table\n\n", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/23" + } + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": " Column A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { 
+ "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Column B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Simple cell upper left", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Simple cell with bold and italic text", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Rich cell\nA nested table\n\n", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/groups/19" + }, + "children": [ + { + "$ref": "#/groups/16" + }, + { + "$ref": "#/groups/17" + }, + { + "$ref": "#/groups/18" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "A", + "column_header": true, + 
"row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "C", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/16" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/17" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/18" + } + } + ], + "num_rows": 2, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "B", 
+ "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "C", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/groups/23" + }, + "children": [ + { + "$ref": "#/groups/20" + }, + { + "$ref": "#/groups/21" + }, + { + "$ref": "#/groups/22" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "B", + "column_header": 
true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "C", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/20" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/21" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/22" + } + } + ], + "num_rows": 2, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + 
"text": "C", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/4", + "parent": { + "$ref": "#/texts/65" + }, + "children": [ + { + "$ref": "#/groups/25" + }, + { + "$ref": "#/groups/26" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Column A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Column B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Only text", + "column_header": 
false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/25" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Text and picture\n", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/26" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Column A", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Column B", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Only text", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + 
"column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Text and picture\n", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md new file mode 100644 index 00000000..da8f8e82 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/docx_rich_cells.docx.md @@ -0,0 +1,25 @@ +### Table with rich cells + +| Column A | Column B | +|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------| +| This is a list: - A First - A Second - A Third | This is a formatted list: - B **First** - B *Second* - B Third | +| First Paragraph Second Paragraph Third paragraph before a numbered list 1. Number one 2. Number two 3. 
Number three | This is simple text with **bold** , ~~strikethrough~~ and *italic* formatting with x 2 and H 2 O | +| This is a paragraph This is another paragraph | | + +### Table with nested table + +Before table + +| Column A | Column B | +|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| Simple cell upper left | Simple cell with **bold** and *italic* text | +| | A | B | C | |----------|--------|------------| | *Cell 1* | Cell 2 | **Cell 3** | | Rich cell A nested table | A | B | C | |------------|--------------|--------| | ~~Cell 1~~ | ***Cell 2*** | Cell 3 | | + +After table with **bold** , underline , ~~strikethrough~~ , and *italic* formatting + +### Table with pictures + +| Column A | Column B | +|----------------------------------|----------------| +| Only text | | +| Text and picture | | \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 8959f8f9..385884a5 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -1,3 +1,4 @@ +import logging import os from pathlib import Path @@ -18,23 +19,109 @@ from docling.document_converter import DocumentConverter from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export +_log = logging.getLogger(__name__) + GENERATE = GEN_TEST_DATA IS_CI = bool(os.getenv("CI")) +@pytest.fixture(scope="module") +def docx_paths() -> list[Path]: + # Define the directory you want to search + directory = Path("./tests/data/docx/") + + # List all docx files in the directory and its subdirectories + docx_files = sorted(directory.rglob("*.docx")) + + return docx_files + + +def get_converter(): + converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) + + return converter + + +@pytest.fixture(scope="module") 
+def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]: + documents: list[dict[Path, DoclingDocument]] = [] + + converter = get_converter() + + for docx_path in docx_paths: + _log.debug(f"converting {docx_path}") + + gt_path = ( + docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name + ) + + conv_result: ConversionResult = converter.convert(docx_path) + + doc: DoclingDocument = conv_result.document + + assert doc, f"Failed to convert document from file {gt_path}" + documents.append((gt_path, doc)) + + return documents + + +def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument]]): + has_libreoffice = False + try: + cmd = get_libreoffice_cmd(raise_if_unavailable=True) + if cmd is not None: + has_libreoffice = True + except Exception: + pass + + for docx_path, doc in docx_paths: + if not IS_CI and not has_libreoffice and docx_path.name == "drawingml.docx": + print(f"Skipping {docx_path} because no Libreoffice is installed.") + continue + + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(docx_path) + ".md", generate=GENERATE), ( + f"export to markdown failed on {docx_path}" + ) + + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export(pred_itxt, str(docx_path) + ".itxt", generate=GENERATE), ( + f"export to indented-text failed on {docx_path}" + ) + + assert verify_document(doc, str(docx_path) + ".json", generate=GENERATE), ( + f"DoclingDocument verification failed on {docx_path}" + ) + + if docx_path.name == "word_tables.docx": + pred_html: str = doc.export_to_html() + assert verify_export( + pred_text=pred_html, + gtfile=str(docx_path) + ".html", + generate=GENERATE, + ), f"export to html failed on {docx_path}" + + +flaky_file = "textbox.docx" + + +def test_e2e_docx_conversions(documents): + target = [item for item in documents if item[0].name != flaky_file] + _test_e2e_docx_conversions_impl(target) + + 
@pytest.mark.xfail(strict=False) -def test_textbox_extraction(): - in_path = Path("tests/data/docx/textbox.docx") - in_doc = InputDocument( - path_or_stream=in_path, - format=InputFormat.DOCX, - backend=MsWordDocumentBackend, - ) - backend = MsWordDocumentBackend( - in_doc=in_doc, - path_or_stream=in_path, - ) - doc = backend.convert() +def test_textbox_conversion(documents): + target = [item for item in documents if item[0].name == flaky_file] + _test_e2e_docx_conversions_impl(target) + + +@pytest.mark.xfail(strict=False) +def test_textbox_extraction(documents): + name = "textbox.docx" + doc = next(item[1] for item in documents if item[0].name == name) # Verify if a particular textbox content is extracted textbox_found = False @@ -44,18 +131,9 @@ def test_textbox_extraction(): assert textbox_found -def test_heading_levels(): - in_path = Path("tests/data/docx/word_sample.docx") - in_doc = InputDocument( - path_or_stream=in_path, - format=InputFormat.DOCX, - backend=MsWordDocumentBackend, - ) - backend = MsWordDocumentBackend( - in_doc=in_doc, - path_or_stream=in_path, - ) - doc = backend.convert() +def test_heading_levels(documents): + name = "word_sample.docx" + doc = next(item[1] for item in documents if item[0].name == name) found_lvl_1 = found_lvl_2 = False for item, _ in doc.iterate_items(): @@ -69,104 +147,11 @@ def test_heading_levels(): assert found_lvl_1 and found_lvl_2 -def get_docx_paths(): - # Define the directory you want to search - directory = Path("./tests/data/docx/") +def test_text_after_image_anchors(documents): + """Test to analyse whether text gets parsed after image anchors.""" - # List all PDF files in the directory and its subdirectories - pdf_files = sorted(directory.rglob("*.docx")) - return pdf_files - - -def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) - - return converter - - -def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): - converter = get_converter() - - has_libreoffice = False - 
try: - cmd = get_libreoffice_cmd(raise_if_unavailable=True) - if cmd is not None: - has_libreoffice = True - except Exception: - pass - - for docx_path in docx_paths: - if ( - not IS_CI - and not has_libreoffice - and str(docx_path) in ("tests/data/docx/drawingml.docx",) - ): - print(f"Skipping {docx_path} because no Libreoffice is installed.") - continue - - gt_path = ( - docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name - ) - - conv_result: ConversionResult = converter.convert(docx_path) - - doc: DoclingDocument = conv_result.document - - pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( - f"export to markdown failed on {docx_path}" - ) - - pred_itxt: str = doc._export_to_indented_text( - max_text_len=70, explicit_tables=False - ) - assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( - f"export to indented-text failed on {docx_path}" - ) - - assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( - f"DoclingDocument verification failed on {docx_path}" - ) - - if docx_path.name == "word_tables.docx": - pred_html: str = doc.export_to_html() - assert verify_export( - pred_text=pred_html, - gtfile=str(gt_path) + ".html", - generate=GENERATE, - ), f"export to html failed on {docx_path}" - - -flaky_path = Path("tests/data/docx/textbox.docx") - - -def test_e2e_docx_conversions(): - _test_e2e_docx_conversions_impl( - docx_paths=[path for path in get_docx_paths() if path != flaky_path] - ) - - -@pytest.mark.xfail(strict=False) -def test_textbox_conversion(): - _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) - - -def test_text_after_image_anchors(): - """ - Test to analyse whether text gets parsed after image anchors. 
- """ - - in_path = Path("tests/data/docx/word_image_anchors.docx") - in_doc = InputDocument( - path_or_stream=in_path, - format=InputFormat.DOCX, - backend=MsWordDocumentBackend, - ) - backend = MsWordDocumentBackend( - in_doc=in_doc, - path_or_stream=in_path, - ) - doc = backend.convert() + name = "word_image_anchors.docx" + doc = next(item[1] for item in documents if item[0].name == name) found_text_after_anchor_1 = found_text_after_anchor_2 = ( found_text_after_anchor_3 @@ -188,3 +173,38 @@ def test_text_after_image_anchors(): and found_text_after_anchor_3 and found_text_after_anchor_4 ) + + +def test_is_rich_table_cell(docx_paths): + """Test the function is_rich_table_cell.""" + + name = "docx_rich_cells.docx" + path = next(item for item in docx_paths if item.name == name) + + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + filename=name, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=path, + ) + + gt_cells: list[bool] = [] + # table: Table with rich cells + gt_cells.extend([False, False, True, True, True, True, True, False]) + # table: Table with nested table + gt_cells.extend([False, False, False, True, True, True]) + # table: Table with pictures + gt_cells.extend([False, False, False, True, True, False]) + gt_it = iter(gt_cells) + + for idx_t, table in enumerate(backend.docx_obj.tables): + for idx_r, row in enumerate(table.rows): + for idx_c, cell in enumerate(row.cells): + assert next(gt_it) == backend._is_rich_table_cell(cell), ( + f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} " + f"with text: {cell.text}" + )