mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
feat: use w:lastRenderedPageBreaks
if present to get approximate pagination
Signed-off-by: David Huggins-Daines <dhdaines@logisphere.ca>
This commit is contained in:
parent
b5da4080c9
commit
147c7a1bc9
@ -5,12 +5,15 @@ from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
NodeItem,
|
||||
ProvenanceItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
@ -30,6 +33,8 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
NO_BBOX = BoundingBox(l=0, t=0, r=0, b=0)
|
||||
NO_SIZE = Size(width=0, height=0)
|
||||
|
||||
|
||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
@ -57,6 +62,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
self.level = 0
|
||||
self.listIter = 0
|
||||
self.page_no = 0
|
||||
self.prev_hard_break = False
|
||||
|
||||
self.history: dict[str, Any] = {
|
||||
"names": [None],
|
||||
@ -85,7 +92,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
@classmethod
|
||||
@override
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
# FIXME: This is only true for *some* Word documents, see `has_pagination` below.
|
||||
return True
|
||||
|
||||
def has_pagination(self) -> bool:
    """Report whether pagination can be supplied for this Word document.

    Returns ``False`` when no document is loaded (``self.docx_obj`` is
    ``None``).  Otherwise returns ``True`` exactly when the document's XML
    tree contains at least one ``w:lastRenderedPageBreak`` element.
    """
    document = self.docx_obj
    if document is None:
        return False
    # One hit anywhere in the tree is enough, so a single ``find`` suffices.
    marker = document.element.find(
        ".//w:lastRenderedPageBreak", namespaces=document.element.nsmap
    )
    return marker is not None
|
||||
|
||||
@override
|
||||
def unload(self):
|
||||
@ -161,6 +180,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> DoclingDocument:
|
||||
if self.has_pagination():
|
||||
self.page_no = 1
|
||||
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||
else:
|
||||
self.page_no = 0
|
||||
for element in body:
|
||||
tag_name = etree.QName(element).localname
|
||||
# Check for Inline Images (blip elements)
|
||||
@ -193,6 +217,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif tag_name in ["p"]:
|
||||
# "tcPr", "sectPr"
|
||||
self.handle_text_elements(element, docx_obj, doc)
|
||||
elif tag_name == "sectPr":
|
||||
# Final section in the document
|
||||
# Apply section information to this and all preceding pages
|
||||
self.handle_section(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
return doc
|
||||
@ -260,6 +288,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
return label, None
|
||||
|
||||
def handle_section(self, element, docx_obj, doc) -> None:
    """Apply a section's page size to every page that has no size yet.

    ``element`` may be a ``w:sectPr`` element or an element that contains
    one, such as a paragraph whose properties end a section.  The ``w:w``
    and ``w:h`` attributes of the first ``w:pgSz`` found are divided by 20
    before being stored; WordprocessingML expresses them in twentieths of a
    point.

    Nothing happens when pagination is disabled (``self.page_no == 0``) or
    when no usable page size can be read; the latter case logs a warning
    instead of raising.  ``docx_obj`` is not used here; the signature
    mirrors ``handle_text_elements``.
    """
    if self.page_no == 0:
        # No pagination, no pages, no problems!
        return
    # Search descendants as well as direct children: callers may pass the
    # paragraph that wraps the section properties rather than the w:sectPr
    # element itself, and a direct-child lookup would then always miss.
    pgsz = element.find(".//w:pgSz", element.nsmap)
    if pgsz is None:
        _log.warning("No page size information in section")
        return
    ns = pgsz.nsmap["w"]
    try:
        width = int(pgsz.attrib[f"{{{ns}}}w"])
        height = int(pgsz.attrib[f"{{{ns}}}h"])
    except (KeyError, ValueError) as exc:
        # A malformed section must not abort the whole conversion.
        _log.warning("Unusable page size information in section: %r", exc)
        return
    size = Size(width=width / 20, height=height / 20)
    # Do all pages created up to now
    for page in doc.pages.values():
        # Compare by value, not identity, so the placeholder is still
        # recognised if the Size object was copied along the way.
        if page.size == NO_SIZE:
            page.size = size
|
||||
|
||||
def handle_text_elements(
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
@ -267,10 +312,43 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
sectpr = element.find(".//w:sectPr", element.nsmap)
|
||||
# Apply section information to this and all preceding pages
|
||||
if sectpr:
|
||||
self.handle_section(element, docx_obj, doc)
|
||||
start_page = self.page_no
|
||||
if start_page:
|
||||
# Somewhat complex logic - sometimes we have only hard
|
||||
# breaks, sometimes we have only soft breaks, sometimes we
|
||||
# have both (in adjacent paragraphs).
|
||||
hard_break = element.findall(".//w:br[@w:type='page']", element.nsmap)
|
||||
soft_break = element.findall(".//w:lastRenderedPageBreak", element.nsmap)
|
||||
_log.debug(
|
||||
"paragraph (hard breaks %r, soft breaks %r, prev_hard_break %r): %s",
|
||||
hard_break,
|
||||
soft_break,
|
||||
self.prev_hard_break,
|
||||
paragraph.text,
|
||||
)
|
||||
if hard_break:
|
||||
self.prev_hard_break = True
|
||||
self.page_no += 1
|
||||
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||
elif soft_break and not self.prev_hard_break:
|
||||
self.page_no += 1
|
||||
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||
if paragraph.text is None:
|
||||
return
|
||||
# If this paragraph has text then cancel a pending hard break
|
||||
if paragraph.text:
|
||||
self.prev_hard_break = False
|
||||
text = paragraph.text.strip()
|
||||
if start_page:
|
||||
prov = ProvenanceItem(
|
||||
page_no=start_page, bbox=NO_BBOX, charspan=(0, len(text))
|
||||
)
|
||||
else:
|
||||
prov = None
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
# "List Bullet", "List Number", "List Paragraph"
|
||||
@ -295,6 +373,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
ilevel,
|
||||
text,
|
||||
is_numbered,
|
||||
prov,
|
||||
)
|
||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||
return
|
||||
@ -318,10 +397,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
for key in range(len(self.parents)):
|
||||
self.parents[key] = None
|
||||
self.parents[0] = doc.add_text(
|
||||
parent=None, label=DocItemLabel.TITLE, text=text
|
||||
parent=None, label=DocItemLabel.TITLE, text=text, prov=prov
|
||||
)
|
||||
elif "Heading" in p_style_id:
|
||||
self.add_header(doc, p_level, text)
|
||||
self.add_header(doc, p_level, text, prov)
|
||||
|
||||
elif p_style_id in [
|
||||
"Paragraph",
|
||||
@ -335,7 +414,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
]:
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.parents[level - 1],
|
||||
text=text,
|
||||
prov=prov,
|
||||
)
|
||||
|
||||
else:
|
||||
@ -343,14 +425,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# hence we treat all other labels as pure text
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self.parents[level - 1],
|
||||
text=text,
|
||||
prov=prov,
|
||||
)
|
||||
|
||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||
return
|
||||
|
||||
def add_header(
|
||||
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
prov: Union[ProvenanceItem, None] = None,
|
||||
) -> None:
|
||||
level = self.get_level()
|
||||
if isinstance(curr_level, int):
|
||||
@ -372,12 +461,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[curr_level - 1],
|
||||
text=text,
|
||||
level=curr_level,
|
||||
prov=prov,
|
||||
)
|
||||
else:
|
||||
self.parents[self.level] = doc.add_heading(
|
||||
parent=self.parents[self.level - 1],
|
||||
text=text,
|
||||
level=1,
|
||||
prov=prov,
|
||||
)
|
||||
return
|
||||
|
||||
@ -388,6 +479,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
ilevel: int,
|
||||
text: str,
|
||||
is_numbered: bool = False,
|
||||
prov: Union[ProvenanceItem, None] = None,
|
||||
) -> None:
|
||||
enum_marker = ""
|
||||
|
||||
@ -410,6 +502,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
prov=prov,
|
||||
)
|
||||
|
||||
elif (
|
||||
@ -446,6 +539,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
prov=prov,
|
||||
)
|
||||
|
||||
elif (
|
||||
@ -468,6 +562,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
prov=prov,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
@ -482,6 +577,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[level - 1],
|
||||
text=text,
|
||||
prov=prov,
|
||||
)
|
||||
return
|
||||
|
||||
@ -505,6 +601,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||
cell_set: set[CT_Tc] = set()
|
||||
start_page = self.page_no
|
||||
text_len = 0
|
||||
for row_idx, row in enumerate(table.rows):
|
||||
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
||||
col_idx = 0
|
||||
@ -531,6 +629,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
_log.debug(f" spanned before row {spanned_idx}")
|
||||
|
||||
# If this cell has text then cancel a pending hard break
|
||||
if cell.text:
|
||||
self.prev_hard_break = False
|
||||
text_len += len(cell.text)
|
||||
|
||||
table_cell = TableCell(
|
||||
text=cell.text,
|
||||
row_span=spanned_idx - row_idx,
|
||||
@ -545,8 +648,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
data.table_cells.append(table_cell)
|
||||
col_idx += cell.grid_span
|
||||
|
||||
# NOTE: Page numbers will be very inaccurate since a
|
||||
# table can definitely be split across pages (but
|
||||
# individual TableCells have no provenance and thus no
|
||||
# page number)
|
||||
if start_page:
|
||||
soft_break = row._element.findall(
|
||||
".//w:lastRenderedPageBreak", row._element.nsmap
|
||||
)
|
||||
_log.debug("row (page breaks %r): %s", soft_break, row)
|
||||
if soft_break and not self.prev_hard_break:
|
||||
self.page_no += 1
|
||||
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||
|
||||
if start_page:
|
||||
prov = ProvenanceItem(
|
||||
page_no=start_page, bbox=NO_BBOX, charspan=(0, text_len)
|
||||
)
|
||||
else:
|
||||
prov = None
|
||||
level = self.get_level()
|
||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||
doc.add_table(data=data, parent=self.parents[level - 1], prov=prov)
|
||||
return
|
||||
|
||||
def handle_pictures(
|
||||
@ -563,20 +685,36 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
return image_data
|
||||
|
||||
level = self.get_level()
|
||||
prov = None
|
||||
# Open the BytesIO object with PIL to create an Image
|
||||
try:
|
||||
image_data = get_docx_image(drawing_blip)
|
||||
image_bytes = BytesIO(image_data)
|
||||
pil_image = Image.open(image_bytes)
|
||||
if self.page_no:
|
||||
width, height = pil_image.size
|
||||
prov = ProvenanceItem(
|
||||
page_no=self.page_no,
|
||||
bbox=BoundingBox(l=0, t=0, r=width, b=height),
|
||||
charspan=(0, 0),
|
||||
)
|
||||
doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
prov=prov,
|
||||
)
|
||||
except (UnidentifiedImageError, OSError) as e:
|
||||
_log.warning("Warning: image cannot be loaded by Pillow")
|
||||
if self.page_no:
|
||||
prov = ProvenanceItem(
|
||||
page_no=self.page_no,
|
||||
bbox=NO_BBOX,
|
||||
charspan=(0, 0),
|
||||
)
|
||||
doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
caption=None,
|
||||
prov=prov,
|
||||
)
|
||||
return
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -93,7 +93,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
17
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Summer activities",
|
||||
"text": "Summer activities"
|
||||
},
|
||||
@ -117,7 +132,22 @@
|
||||
}
|
||||
],
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
20
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Swimming in the lake",
|
||||
"text": "Swimming in the lake"
|
||||
},
|
||||
@ -128,7 +158,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
4
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Duck",
|
||||
"text": "Duck"
|
||||
},
|
||||
@ -139,7 +184,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
33
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Figure 1: This is a cute duckling",
|
||||
"text": "Figure 1: This is a cute duckling"
|
||||
},
|
||||
@ -169,7 +229,22 @@
|
||||
}
|
||||
],
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
11
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Let\u2019s swim!",
|
||||
"text": "Let\u2019s swim!",
|
||||
"level": 1
|
||||
@ -181,7 +256,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
77
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||
"text": "To get started with swimming, first lay down in a water and try not to drown:"
|
||||
},
|
||||
@ -192,7 +282,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
29
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "You can relax and look around",
|
||||
"text": "You can relax and look around",
|
||||
"enumerated": false,
|
||||
@ -205,7 +310,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Paddle about",
|
||||
"text": "Paddle about",
|
||||
"enumerated": false,
|
||||
@ -218,7 +338,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
19
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Enjoy summer warmth",
|
||||
"text": "Enjoy summer warmth",
|
||||
"enumerated": false,
|
||||
@ -231,7 +366,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
19
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Also, don\u2019t forget:",
|
||||
"text": "Also, don\u2019t forget:"
|
||||
},
|
||||
@ -242,7 +392,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
15
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Wear sunglasses",
|
||||
"text": "Wear sunglasses",
|
||||
"enumerated": false,
|
||||
@ -255,7 +420,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
27
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Don\u2019t forget to drink water",
|
||||
"text": "Don\u2019t forget to drink water",
|
||||
"enumerated": false,
|
||||
@ -268,7 +448,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Use sun cream",
|
||||
"text": "Use sun cream",
|
||||
"enumerated": false,
|
||||
@ -281,7 +476,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
15
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Hmm, what else\u2026",
|
||||
"text": "Hmm, what else\u2026"
|
||||
},
|
||||
@ -314,7 +524,22 @@
|
||||
}
|
||||
],
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
9
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Let\u2019s eat",
|
||||
"text": "Let\u2019s eat",
|
||||
"level": 2
|
||||
@ -326,7 +551,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
85
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice",
|
||||
"text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice"
|
||||
},
|
||||
@ -337,7 +577,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
20
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "I like to eat leaves",
|
||||
"text": "I like to eat leaves"
|
||||
},
|
||||
@ -348,7 +603,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
61
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Here are some interesting things a respectful duck could eat:",
|
||||
"text": "Here are some interesting things a respectful duck could eat:"
|
||||
},
|
||||
@ -359,7 +629,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
@ -370,7 +655,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
38
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "And let\u2019s add another list in the end:",
|
||||
"text": "And let\u2019s add another list in the end:"
|
||||
},
|
||||
@ -381,7 +681,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
6
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Leaves",
|
||||
"text": "Leaves",
|
||||
"enumerated": false,
|
||||
@ -394,7 +709,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Berries",
|
||||
"text": "Berries",
|
||||
"enumerated": false,
|
||||
@ -407,7 +737,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
5
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Grain",
|
||||
"text": "Grain",
|
||||
"enumerated": false,
|
||||
@ -422,7 +767,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 397.0,
|
||||
"b": 397.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -446,7 +806,22 @@
|
||||
},
|
||||
"children": [],
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 2,
|
||||
"bbox": {
|
||||
"l": 0.0,
|
||||
"t": 0.0,
|
||||
"r": 0.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
120
|
||||
]
|
||||
}
|
||||
],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
@ -757,5 +1132,20 @@
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"pages": {}
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"page_no": 1
|
||||
},
|
||||
"2": {
|
||||
"size": {
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"page_no": 2
|
||||
}
|
||||
}
|
||||
}
|
@ -6,9 +6,11 @@ from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
@ -40,6 +42,27 @@ def test_heading_levels():
|
||||
assert found_lvl_1 and found_lvl_2
|
||||
|
||||
|
||||
def test_page_breaks():
    """Check page count and per-item page provenance on paginated samples."""
    docx_dir = Path("tests/data/docx")
    sample_names = (
        "unit_test_headers.docx",
        "unit_test_lists.docx",
        "word_sample.docx",
    )
    for sample in sample_names:
        source = docx_dir / sample
        backend = MsWordDocumentBackend(
            in_doc=InputDocument(
                path_or_stream=source,
                format=InputFormat.DOCX,
                backend=MsWordDocumentBackend,
            ),
            path_or_stream=source,
        )
        converted = backend.convert()
        assert backend.has_pagination()
        # These all have two pages
        assert len(converted.pages) == 2
        # Every item must carry provenance with a non-zero page number.
        for node, _level in converted.iterate_items():
            assert node.prov
            assert node.prov[0].page_no
|
||||
|
||||
|
||||
def get_docx_paths():
|
||||
|
||||
# Define the directory you want to search
|
||||
|
Loading…
Reference in New Issue
Block a user