From 147c7a1bc96b376555bd6ef9037f8b8da0c71420 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 29 Jan 2025 08:32:52 -0500 Subject: [PATCH] feat: use `w:lastRenderedPageBreaks` if present to get approximate pagination Signed-off-by: David Huggins-Daines --- docling/backend/msword_backend.py | 154 +++- .../docling_v2/unit_test_headers.docx.json | 799 ++++++++++++++++-- .../unit_test_headers_numbered.docx.json | 799 ++++++++++++++++-- .../docling_v2/unit_test_lists.docx.json | 799 ++++++++++++++++-- .../docling_v2/word_sample.docx.json | 442 +++++++++- tests/test_backend_msword.py | 23 + 6 files changed, 2841 insertions(+), 175 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 4d4026e3..1659d94d 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -5,12 +5,15 @@ from pathlib import Path from typing import Any, Optional, Union from docling_core.types.doc import ( + BoundingBox, DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, ImageRef, NodeItem, + ProvenanceItem, + Size, TableCell, TableData, ) @@ -30,6 +33,8 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +NO_BBOX = BoundingBox(l=0, t=0, r=0, b=0) +NO_SIZE = Size(width=0, height=0) class MsWordDocumentBackend(DeclarativeDocumentBackend): @@ -57,6 +62,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level = 0 self.listIter = 0 + self.page_no = 0 + self.prev_hard_break = False self.history: dict[str, Any] = { "names": [None], @@ -85,7 +92,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): @classmethod @override def supports_pagination(cls) -> bool: - return False + # FIXME: This is only true for *some* Word documents, see `has_pagination` below. 
+ return True + + def has_pagination(self) -> bool: + """Can we supply pagination for this particular Word document?""" + if self.docx_obj is None: + return False + return ( + self.docx_obj.element.find( + ".//w:lastRenderedPageBreak", namespaces=self.docx_obj.element.nsmap + ) + is not None + ) @override def unload(self): @@ -161,6 +180,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): docx_obj: DocxDocument, doc: DoclingDocument, ) -> DoclingDocument: + if self.has_pagination(): + self.page_no = 1 + doc.add_page(page_no=self.page_no, size=NO_SIZE) + else: + self.page_no = 0 for element in body: tag_name = etree.QName(element).localname # Check for Inline Images (blip elements) @@ -193,6 +217,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif tag_name in ["p"]: # "tcPr", "sectPr" self.handle_text_elements(element, docx_obj, doc) + elif tag_name == "sectPr": + # Final section in the document + # Apply section information to this and all preceding pages + self.handle_section(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc @@ -260,6 +288,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return label, None + def handle_section(self, element, docx_obj, doc): + if self.page_no == 0: + # No pagination, no pages, no problems! 
+ return + pgsz = element.find("w:pgSz", element.nsmap) + if pgsz is None: + _log.warning("No page size information in section") + return + ns = pgsz.nsmap["w"] + width = pgsz.attrib[f"{{{ns}}}w"] + height = pgsz.attrib[f"{{{ns}}}h"] + size = Size(width=int(width) / 20, height=int(height) / 20) + # Do all pages created up to now + for page in doc.pages.values(): + if page.size is NO_SIZE: + page.size = size + def handle_text_elements( self, element: BaseOxmlElement, @@ -267,10 +312,43 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, ) -> None: paragraph = Paragraph(element, docx_obj) - + sectpr = element.find(".//w:sectPr", element.nsmap) + # Apply section information to this and all preceding pages + if sectpr: + self.handle_section(element, docx_obj, doc) + start_page = self.page_no + if start_page: + # Somewhat complex logic - sometimes we have only hard + # breaks, sometimes we have only soft breaks, sometimes we + # have both (in adjacent paragraphs). + hard_break = element.findall(".//w:br[@w:type='page']", element.nsmap) + soft_break = element.findall(".//w:lastRenderedPageBreak", element.nsmap) + _log.debug( + "paragraph (hard breaks %r, soft breaks %r, prev_hard_break %r): %s", + hard_break, + soft_break, + self.prev_hard_break, + paragraph.text, + ) + if hard_break: + self.prev_hard_break = True + self.page_no += 1 + doc.add_page(page_no=self.page_no, size=NO_SIZE) + elif soft_break and not self.prev_hard_break: + self.page_no += 1 + doc.add_page(page_no=self.page_no, size=NO_SIZE) if paragraph.text is None: return + # If this paragraph has text then cancel a pending hard break + if paragraph.text: + self.prev_hard_break = False text = paragraph.text.strip() + if start_page: + prov = ProvenanceItem( + page_no=start_page, bbox=NO_BBOX, charspan=(0, len(text)) + ) + else: + prov = None # Common styles for bullet and numbered lists. 
# "List Bullet", "List Number", "List Paragraph" @@ -295,6 +373,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ilevel, text, is_numbered, + prov, ) self.update_history(p_style_id, p_level, numid, ilevel) return @@ -318,10 +397,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for key in range(len(self.parents)): self.parents[key] = None self.parents[0] = doc.add_text( - parent=None, label=DocItemLabel.TITLE, text=text + parent=None, label=DocItemLabel.TITLE, text=text, prov=prov ) elif "Heading" in p_style_id: - self.add_header(doc, p_level, text) + self.add_header(doc, p_level, text, prov) elif p_style_id in [ "Paragraph", @@ -335,7 +414,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ]: level = self.get_level() doc.add_text( - label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text + label=DocItemLabel.PARAGRAPH, + parent=self.parents[level - 1], + text=text, + prov=prov, ) else: @@ -343,14 +425,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # hence we treat all other labels as pure text level = self.get_level() doc.add_text( - label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text + label=DocItemLabel.PARAGRAPH, + parent=self.parents[level - 1], + text=text, + prov=prov, ) self.update_history(p_style_id, p_level, numid, ilevel) return def add_header( - self, doc: DoclingDocument, curr_level: Optional[int], text: str + self, + doc: DoclingDocument, + curr_level: Optional[int], + text: str, + prov: Union[ProvenanceItem, None] = None, ) -> None: level = self.get_level() if isinstance(curr_level, int): @@ -372,12 +461,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[curr_level - 1], text=text, level=curr_level, + prov=prov, ) else: self.parents[self.level] = doc.add_heading( parent=self.parents[self.level - 1], text=text, level=1, + prov=prov, ) return @@ -388,6 +479,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ilevel: int, 
text: str, is_numbered: bool = False, + prov: Union[ProvenanceItem, None] = None, ) -> None: enum_marker = "" @@ -410,6 +502,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): enumerated=is_numbered, parent=self.parents[level], text=text, + prov=prov, ) elif ( @@ -446,6 +539,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): enumerated=is_numbered, parent=self.parents[self.level_at_new_list + ilevel], text=text, + prov=prov, ) elif ( @@ -468,6 +562,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): enumerated=is_numbered, parent=self.parents[self.level_at_new_list + ilevel], text=text, + prov=prov, ) self.listIter = 0 @@ -482,6 +577,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): enumerated=is_numbered, parent=self.parents[level - 1], text=text, + prov=prov, ) return @@ -505,6 +601,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): data = TableData(num_rows=num_rows, num_cols=num_cols) cell_set: set[CT_Tc] = set() + start_page = self.page_no + text_len = 0 for row_idx, row in enumerate(table.rows): _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells") col_idx = 0 @@ -531,6 +629,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) _log.debug(f" spanned before row {spanned_idx}") + # If this cell has text then cancel a pending hard break + if cell.text: + self.prev_hard_break = False + text_len += len(cell.text) + table_cell = TableCell( text=cell.text, row_span=spanned_idx - row_idx, @@ -545,8 +648,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): data.table_cells.append(table_cell) col_idx += cell.grid_span + # NOTE: Page numbers will be very inaccurate since a + # table can definitely be split across pages (but + # individual TableCells have no provenance and thus no + # page number) + if start_page: + soft_break = row._element.findall( + ".//w:lastRenderedPageBreak", row._element.nsmap + ) + _log.debug("row (page breaks %r): %s", soft_break, row) + if soft_break 
and not self.prev_hard_break: + self.page_no += 1 + doc.add_page(page_no=self.page_no, size=NO_SIZE) + + if start_page: + prov = ProvenanceItem( + page_no=start_page, bbox=NO_BBOX, charspan=(0, text_len) + ) + else: + prov = None level = self.get_level() - doc.add_table(data=data, parent=self.parents[level - 1]) + doc.add_table(data=data, parent=self.parents[level - 1], prov=prov) return def handle_pictures( @@ -563,20 +685,36 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return image_data level = self.get_level() + prov = None # Open the BytesIO object with PIL to create an Image try: image_data = get_docx_image(drawing_blip) image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) + if self.page_no: + width, height = pil_image.size + prov = ProvenanceItem( + page_no=self.page_no, + bbox=BoundingBox(l=0, t=0, r=width, b=height), + charspan=(0, 0), + ) doc.add_picture( parent=self.parents[level - 1], image=ImageRef.from_pil(image=pil_image, dpi=72), caption=None, + prov=prov, ) except (UnidentifiedImageError, OSError) as e: _log.warning("Warning: image cannot be loaded by Pillow") + if self.page_no: + prov = ProvenanceItem( + page_no=self.page_no, + bbox=NO_BBOX, + charspan=(0, 0), + ) doc.add_picture( parent=self.parents[level - 1], caption=None, + prov=prov, ) return diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json index c76d241a..c7adf998 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json @@ -56,7 +56,22 @@ } ], "label": "title", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Test Document", "text": "Test Document" }, @@ -67,7 +82,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 
1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -100,7 +130,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ], "orig": "Section 1", "text": "Section 1", "level": 1 @@ -112,7 +157,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -123,7 +183,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 1.1", "text": "Paragraph 1.1" }, @@ -134,7 +209,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -145,7 +235,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 1.2", "text": "Paragraph 1.2" }, @@ -156,7 +261,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -183,7 +303,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": 
"TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Section 1.1", "text": "Section 1.1", "level": 2 @@ -195,7 +330,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -206,7 +356,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.1", "text": "Paragraph 1.1.1" }, @@ -217,7 +382,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -228,7 +408,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.2", "text": "Paragraph 1.1.2" }, @@ -239,7 +434,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -269,7 +479,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Section 1.2", "text": "Section 1.2", "level": 2 @@ -281,7 +506,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + 
"charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -292,7 +532,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.1", "text": "Paragraph 1.1.1" }, @@ -303,7 +558,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -314,7 +584,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.2", "text": "Paragraph 1.1.2" }, @@ -325,7 +610,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -355,7 +655,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Section 1.2.3", "text": "Section 1.2.3", "level": 3 @@ -367,7 +682,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -378,7 +708,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 
1.2.3.1", "text": "Paragraph 1.2.3.1" }, @@ -389,7 +734,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -400,7 +760,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1" }, @@ -411,7 +786,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -422,7 +812,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -455,7 +860,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ], "orig": "Section 2", "text": "Section 2", "level": 1 @@ -467,7 +887,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -478,7 +913,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 2.1", "text": "Paragraph 2.1" }, @@ -489,7 +939,22 @@ }, 
"children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -500,7 +965,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 2.2", "text": "Paragraph 2.2" }, @@ -511,7 +991,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -538,7 +1033,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Section 2.1.1", "text": "Section 2.1.1", "level": 3 @@ -550,7 +1060,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -561,7 +1086,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1" }, @@ -572,7 +1112,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -583,7 +1138,22 @@ }, "children": [], "label": "paragraph", - "prov": 
[], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1" }, @@ -594,7 +1164,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -624,7 +1209,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Section 2.1", "text": "Section 2.1", "level": 2 @@ -636,7 +1236,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -647,7 +1262,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 2.1.1", "text": "Paragraph 2.1.1" }, @@ -658,7 +1288,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -669,7 +1314,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 2.1.2", "text": "Paragraph 2.1.2" }, @@ -680,7 +1340,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + 
{ + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -691,7 +1366,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" } @@ -699,5 +1389,20 @@ "pictures": [], "tables": [], "key_value_items": [], - "pages": {} + "pages": { + "1": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 2 + } + } } \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json index 38a25d33..a281c042 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json @@ -150,7 +150,22 @@ } ], "label": "title", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Test Document", "text": "Test Document" }, @@ -161,7 +176,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -172,7 +202,22 @@ }, "children": [], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ], "orig": "Section 1", "text": "Section 1", "level": 1 @@ -184,7 +229,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + 
"prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -195,7 +255,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 1.1", "text": "Paragraph 1.1" }, @@ -206,7 +281,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -217,7 +307,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 1.2", "text": "Paragraph 1.2" }, @@ -228,7 +333,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -255,7 +375,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Section 1.1", "text": "Section 1.1", "level": 2 @@ -267,7 +402,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -278,7 +428,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + 
"r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.1", "text": "Paragraph 1.1.1" }, @@ -289,7 +454,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -300,7 +480,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.2", "text": "Paragraph 1.1.2" }, @@ -311,7 +506,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -341,7 +551,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Section 1.2", "text": "Section 1.2", "level": 2 @@ -353,7 +578,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -364,7 +604,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.1", "text": "Paragraph 1.1.1" }, @@ -375,7 +630,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + 
"coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -386,7 +656,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 1.1.2", "text": "Paragraph 1.1.2" }, @@ -397,7 +682,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -427,7 +727,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Section 1.2.3", "text": "Section 1.2.3", "level": 3 @@ -439,7 +754,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -450,7 +780,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1" }, @@ -461,7 +806,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -472,7 +832,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, 
+ 17 + ] + } + ], "orig": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1" }, @@ -483,7 +858,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -494,7 +884,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -505,7 +910,22 @@ }, "children": [], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ], "orig": "Section 2", "text": "Section 2", "level": 1 @@ -517,7 +937,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -528,7 +963,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 2.1", "text": "Paragraph 2.1" }, @@ -539,7 +989,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -550,7 +1015,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Paragraph 2.2", 
"text": "Paragraph 2.2" }, @@ -561,7 +1041,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -588,7 +1083,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Section 2.1.1", "text": "Section 2.1.1", "level": 3 @@ -600,7 +1110,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -611,7 +1136,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1" }, @@ -622,7 +1162,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -633,7 +1188,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1" }, @@ -644,7 +1214,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -674,7 
+1259,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Section 2.1", "text": "Section 2.1", "level": 2 @@ -686,7 +1286,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -697,7 +1312,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 2.1.1", "text": "Paragraph 2.1.1" }, @@ -708,7 +1338,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -719,7 +1364,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 2.1.2", "text": "Paragraph 2.1.2" }, @@ -730,7 +1390,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -741,7 +1416,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" } @@ -749,5 +1439,20 @@ "pictures": [], "tables": [], 
"key_value_items": [], - "pages": {} + "pages": { + "1": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 2 + } + } } \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json index 1410586c..14fa3ec7 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json @@ -309,7 +309,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Test Document", "text": "Test Document", "level": 1 @@ -321,7 +336,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -332,7 +362,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -343,7 +388,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 2.1.1", "text": "Paragraph 2.1.1" }, @@ -354,7 +414,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -365,7 +440,22 @@ }, "children": [], "label": "paragraph", - 
"prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Paragraph 2.1.2", "text": "Paragraph 2.1.2" }, @@ -376,7 +466,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -394,7 +499,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Test 1:", "text": "Test 1:", "level": 3 @@ -406,7 +526,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 1", "text": "List item 1", "enumerated": false, @@ -419,7 +554,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 2", "text": "List item 2", "enumerated": false, @@ -432,7 +582,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 3", "text": "List item 3", "enumerated": false, @@ -445,7 +610,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -463,7 +643,22 @@ } ], "label": 
"section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Test 2:", "text": "Test 2:", "level": 3 @@ -475,7 +670,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item a", "text": "List item a", "enumerated": false, @@ -488,7 +698,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item b", "text": "List item b", "enumerated": false, @@ -501,7 +726,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item c", "text": "List item c", "enumerated": false, @@ -514,7 +754,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -532,7 +787,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Test 3:", "text": "Test 3:", "level": 3 @@ -544,7 +814,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 1", "text": "List item 1", 
"enumerated": false, @@ -557,7 +842,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 2", "text": "List item 2", "enumerated": false, @@ -570,7 +870,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.1", "text": "List item 1.1", "enumerated": false, @@ -583,7 +898,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.2", "text": "List item 1.2", "enumerated": false, @@ -596,7 +926,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.3", "text": "List item 1.3", "enumerated": false, @@ -609,7 +954,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 3", "text": "List item 3", "enumerated": false, @@ -622,7 +982,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -640,7 +1015,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + 
"coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Test 4:", "text": "Test 4:", "level": 3 @@ -652,7 +1042,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 1", "text": "List item 1", "enumerated": false, @@ -665,7 +1070,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.1", "text": "List item 1.1", "enumerated": false, @@ -678,7 +1098,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 2", "text": "List item 2", "enumerated": false, @@ -691,7 +1126,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -709,7 +1159,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Test 5:", "text": "Test 5:", "level": 3 @@ -721,7 +1186,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 1", "text": "List item 1", "enumerated": false, @@ -734,7 +1214,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + 
"page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.1", "text": "List item 1.1", "enumerated": false, @@ -747,7 +1242,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "List item 1.1.1", "text": "List item 1.1.1", "enumerated": false, @@ -760,7 +1270,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 3", "text": "List item 3", "enumerated": false, @@ -773,7 +1298,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -797,7 +1337,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Test 6:", "text": "Test 6:", "level": 3 @@ -809,7 +1364,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 1", "text": "List item 1", "enumerated": false, @@ -822,7 +1392,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 2", "text": "List item 2", "enumerated": 
false, @@ -835,7 +1420,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.1", "text": "List item 1.1", "enumerated": false, @@ -848,7 +1448,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "List item 1.2", "text": "List item 1.2", "enumerated": false, @@ -861,7 +1476,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "List item 1.2.1", "text": "List item 1.2.1", "enumerated": false, @@ -874,7 +1504,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "List item 3", "text": "List item 3", "enumerated": false, @@ -887,7 +1532,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -898,7 +1558,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -909,7 +1584,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + 
"charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" } @@ -917,5 +1607,20 @@ "pictures": [], "tables": [], "key_value_items": [], - "pages": {} + "pages": { + "1": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 2 + } + } } \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 8c6e6298..5daf1af5 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -93,7 +93,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], "orig": "Summer activities", "text": "Summer activities" }, @@ -117,7 +132,22 @@ } ], "label": "title", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 20 + ] + } + ], "orig": "Swimming in the lake", "text": "Swimming in the lake" }, @@ -128,7 +158,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 4 + ] + } + ], "orig": "Duck", "text": "Duck" }, @@ -139,7 +184,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 33 + ] + } + ], "orig": "Figure 1: This is a cute duckling", "text": "Figure 1: This is a cute duckling" }, @@ -169,7 +229,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + 
"coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 11 + ] + } + ], "orig": "Let\u2019s swim!", "text": "Let\u2019s swim!", "level": 1 @@ -181,7 +256,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 77 + ] + } + ], "orig": "To get started with swimming, first lay down in a water and try not to drown:", "text": "To get started with swimming, first lay down in a water and try not to drown:" }, @@ -192,7 +282,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 29 + ] + } + ], "orig": "You can relax and look around", "text": "You can relax and look around", "enumerated": false, @@ -205,7 +310,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 12 + ] + } + ], "orig": "Paddle about", "text": "Paddle about", "enumerated": false, @@ -218,7 +338,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 19 + ] + } + ], "orig": "Enjoy summer warmth", "text": "Enjoy summer warmth", "enumerated": false, @@ -231,7 +366,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 19 + ] + } + ], "orig": "Also, don\u2019t forget:", "text": "Also, don\u2019t forget:" }, @@ -242,7 +392,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 
0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Wear sunglasses", "text": "Wear sunglasses", "enumerated": false, @@ -255,7 +420,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 27 + ] + } + ], "orig": "Don\u2019t forget to drink water", "text": "Don\u2019t forget to drink water", "enumerated": false, @@ -268,7 +448,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ], "orig": "Use sun cream", "text": "Use sun cream", "enumerated": false, @@ -281,7 +476,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], "orig": "Hmm, what else\u2026", "text": "Hmm, what else\u2026" }, @@ -314,7 +524,22 @@ } ], "label": "section_header", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ], "orig": "Let\u2019s eat", "text": "Let\u2019s eat", "level": 2 @@ -326,7 +551,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 85 + ] + } + ], "orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice", "text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice" }, @@ -337,7 +577,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 
0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 20 + ] + } + ], "orig": "I like to eat leaves", "text": "I like to eat leaves" }, @@ -348,7 +603,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 61 + ] + } + ], "orig": "Here are some interesting things a respectful duck could eat:", "text": "Here are some interesting things a respectful duck could eat:" }, @@ -359,7 +629,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "orig": "", "text": "" }, @@ -370,7 +655,22 @@ }, "children": [], "label": "paragraph", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 38 + ] + } + ], "orig": "And let\u2019s add another list in the end:", "text": "And let\u2019s add another list in the end:" }, @@ -381,7 +681,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 6 + ] + } + ], "orig": "Leaves", "text": "Leaves", "enumerated": false, @@ -394,7 +709,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 7 + ] + } + ], "orig": "Berries", "text": "Berries", "enumerated": false, @@ -407,7 +737,22 @@ }, "children": [], "label": "list_item", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, 
+ 5 + ] + } + ], "orig": "Grain", "text": "Grain", "enumerated": false, @@ -422,7 +767,22 @@ }, "children": [], "label": "picture", - "prov": [], + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 397.0, + "b": 397.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -446,7 +806,22 @@ }, "children": [], "label": "table", - "prov": [], + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 0.0, + "b": 0.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 120 + ] + } + ], "captions": [], "references": [], "footnotes": [], @@ -757,5 +1132,20 @@ } ], "key_value_items": [], - "pages": {} + "pages": { + "1": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 1 + }, + "2": { + "size": { + "width": 612.0, + "height": 792.0 + }, + "page_no": 2 + } + } } \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 86bd837d..0d093a5d 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -6,9 +6,11 @@ from docling.backend.msword_backend import MsWordDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ( ConversionResult, + DocItemLabel, DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -40,6 +42,27 @@ def test_heading_levels(): assert found_lvl_1 and found_lvl_2 +def test_page_breaks(): + for name in "unit_test_headers.docx", "unit_test_lists.docx", "word_sample.docx": + in_path = Path("tests/data/docx") / name + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + assert backend.has_pagination() + # These all have two pages + assert len(doc.pages) == 2 + for item, _ in 
doc.iterate_items(): + assert item.prov + assert item.prov[0].page_no + + def get_docx_paths(): # Define the directory you want to search