mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
feat: use w:lastRenderedPageBreaks
if present to get approximate pagination
Signed-off-by: David Huggins-Daines <dhdaines@logisphere.ca>
This commit is contained in:
parent
b5da4080c9
commit
147c7a1bc9
@ -5,12 +5,15 @@ from pathlib import Path
|
|||||||
from typing import Any, Optional, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
|
BoundingBox,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
ImageRef,
|
ImageRef,
|
||||||
NodeItem,
|
NodeItem,
|
||||||
|
ProvenanceItem,
|
||||||
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
@ -30,6 +33,8 @@ from docling.datamodel.base_models import InputFormat
|
|||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
NO_BBOX = BoundingBox(l=0, t=0, r=0, b=0)
|
||||||
|
NO_SIZE = Size(width=0, height=0)
|
||||||
|
|
||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
@ -57,6 +62,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
self.level = 0
|
self.level = 0
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
self.page_no = 0
|
||||||
|
self.prev_hard_break = False
|
||||||
|
|
||||||
self.history: dict[str, Any] = {
|
self.history: dict[str, Any] = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
@ -85,7 +92,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
@classmethod
|
@classmethod
|
||||||
@override
|
@override
|
||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
|
# FIXME: This is only true for *some* Word documents, see `has_pagination` below.
|
||||||
|
return True
|
||||||
|
|
||||||
|
def has_pagination(self) -> bool:
    """Report whether this particular Word document carries page-break hints.

    Returns ``False`` when no document is loaded.  Otherwise returns ``True``
    only if at least one ``w:lastRenderedPageBreak`` element exists anywhere
    below the document's root element.
    """
    if self.docx_obj is None:
        return False
    root = self.docx_obj.element
    # A single rendered-page-break marker is enough to attempt pagination.
    marker = root.find(".//w:lastRenderedPageBreak", namespaces=root.nsmap)
    return marker is not None
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def unload(self):
|
def unload(self):
|
||||||
@ -161,6 +180,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
docx_obj: DocxDocument,
|
docx_obj: DocxDocument,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
|
if self.has_pagination():
|
||||||
|
self.page_no = 1
|
||||||
|
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||||
|
else:
|
||||||
|
self.page_no = 0
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
# Check for Inline Images (blip elements)
|
# Check for Inline Images (blip elements)
|
||||||
@ -193,6 +217,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elif tag_name in ["p"]:
|
elif tag_name in ["p"]:
|
||||||
# "tcPr", "sectPr"
|
# "tcPr", "sectPr"
|
||||||
self.handle_text_elements(element, docx_obj, doc)
|
self.handle_text_elements(element, docx_obj, doc)
|
||||||
|
elif tag_name == "sectPr":
|
||||||
|
# Final section in the document
|
||||||
|
# Apply section information to this and all preceding pages
|
||||||
|
self.handle_section(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
return doc
|
return doc
|
||||||
@ -260,6 +288,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
|
def handle_section(self, element, docx_obj, doc):
    """Apply a section's page size to every page that has no size yet.

    ``element`` may be the ``w:sectPr`` element itself or an element that
    contains one (for example a ``w:p`` that closes a section).  ``w:pgSz``
    is therefore looked up among all descendants, not only among direct
    children.

    Does nothing when pagination is disabled (``self.page_no == 0``) or when
    no usable page size is found.  ``docx_obj`` is not used here.
    """
    if self.page_no == 0:
        # No pagination, no pages, no problems!
        return
    # Descendant search: a direct-child lookup misses w:pgSz when the caller
    # hands in the enclosing paragraph instead of the w:sectPr element.
    pgsz = element.find(".//w:pgSz", element.nsmap)
    if pgsz is None:
        _log.warning("No page size information in section")
        return
    ns = pgsz.nsmap["w"]
    width = pgsz.attrib.get(f"{{{ns}}}w")
    height = pgsz.attrib.get(f"{{{ns}}}h")
    if width is None or height is None:
        # Malformed section: skip it instead of raising KeyError mid-conversion.
        _log.warning("Incomplete page size information in section")
        return
    # w:pgSz values are twentieths of a point; dividing by 20 yields points.
    size = Size(width=int(width) / 20, height=int(height) / 20)
    # Do all pages created up to now that still carry the placeholder size.
    for page in doc.pages.values():
        if page.size is NO_SIZE:
            page.size = size
|
||||||
|
|
||||||
def handle_text_elements(
|
def handle_text_elements(
|
||||||
self,
|
self,
|
||||||
element: BaseOxmlElement,
|
element: BaseOxmlElement,
|
||||||
@ -267,10 +312,43 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
) -> None:
|
) -> None:
|
||||||
paragraph = Paragraph(element, docx_obj)
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
sectpr = element.find(".//w:sectPr", element.nsmap)
|
||||||
|
# Apply section information to this and all preceding pages
|
||||||
|
if sectpr:
|
||||||
|
self.handle_section(element, docx_obj, doc)
|
||||||
|
start_page = self.page_no
|
||||||
|
if start_page:
|
||||||
|
# Somewhat complex logic - sometimes we have only hard
|
||||||
|
# breaks, sometimes we have only soft breaks, sometimes we
|
||||||
|
# have both (in adjacent paragraphs).
|
||||||
|
hard_break = element.findall(".//w:br[@w:type='page']", element.nsmap)
|
||||||
|
soft_break = element.findall(".//w:lastRenderedPageBreak", element.nsmap)
|
||||||
|
_log.debug(
|
||||||
|
"paragraph (hard breaks %r, soft breaks %r, prev_hard_break %r): %s",
|
||||||
|
hard_break,
|
||||||
|
soft_break,
|
||||||
|
self.prev_hard_break,
|
||||||
|
paragraph.text,
|
||||||
|
)
|
||||||
|
if hard_break:
|
||||||
|
self.prev_hard_break = True
|
||||||
|
self.page_no += 1
|
||||||
|
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||||
|
elif soft_break and not self.prev_hard_break:
|
||||||
|
self.page_no += 1
|
||||||
|
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
return
|
return
|
||||||
|
# If this paragraph has text then cancel a pending hard break
|
||||||
|
if paragraph.text:
|
||||||
|
self.prev_hard_break = False
|
||||||
text = paragraph.text.strip()
|
text = paragraph.text.strip()
|
||||||
|
if start_page:
|
||||||
|
prov = ProvenanceItem(
|
||||||
|
page_no=start_page, bbox=NO_BBOX, charspan=(0, len(text))
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prov = None
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
@ -295,6 +373,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
ilevel,
|
ilevel,
|
||||||
text,
|
text,
|
||||||
is_numbered,
|
is_numbered,
|
||||||
|
prov,
|
||||||
)
|
)
|
||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
@ -318,10 +397,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for key in range(len(self.parents)):
|
for key in range(len(self.parents)):
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
self.parents[0] = doc.add_text(
|
self.parents[0] = doc.add_text(
|
||||||
parent=None, label=DocItemLabel.TITLE, text=text
|
parent=None, label=DocItemLabel.TITLE, text=text, prov=prov
|
||||||
)
|
)
|
||||||
elif "Heading" in p_style_id:
|
elif "Heading" in p_style_id:
|
||||||
self.add_header(doc, p_level, text)
|
self.add_header(doc, p_level, text, prov)
|
||||||
|
|
||||||
elif p_style_id in [
|
elif p_style_id in [
|
||||||
"Paragraph",
|
"Paragraph",
|
||||||
@ -335,7 +414,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
]:
|
]:
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
text=text,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -343,14 +425,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# hence we treat all other labels as pure text
|
# hence we treat all other labels as pure text
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
text=text,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_header(
|
def add_header(
|
||||||
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
self,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
curr_level: Optional[int],
|
||||||
|
text: str,
|
||||||
|
prov: Union[ProvenanceItem, None] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
if isinstance(curr_level, int):
|
if isinstance(curr_level, int):
|
||||||
@ -372,12 +461,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent=self.parents[curr_level - 1],
|
parent=self.parents[curr_level - 1],
|
||||||
text=text,
|
text=text,
|
||||||
level=curr_level,
|
level=curr_level,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.parents[self.level] = doc.add_heading(
|
self.parents[self.level] = doc.add_heading(
|
||||||
parent=self.parents[self.level - 1],
|
parent=self.parents[self.level - 1],
|
||||||
text=text,
|
text=text,
|
||||||
level=1,
|
level=1,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -388,6 +479,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
ilevel: int,
|
ilevel: int,
|
||||||
text: str,
|
text: str,
|
||||||
is_numbered: bool = False,
|
is_numbered: bool = False,
|
||||||
|
prov: Union[ProvenanceItem, None] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
|
|
||||||
@ -410,6 +502,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[level],
|
parent=self.parents[level],
|
||||||
text=text,
|
text=text,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
@ -446,6 +539,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
text=text,
|
text=text,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
@ -468,6 +562,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
text=text,
|
text=text,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
@ -482,6 +577,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
text=text,
|
text=text,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -505,6 +601,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||||
cell_set: set[CT_Tc] = set()
|
cell_set: set[CT_Tc] = set()
|
||||||
|
start_page = self.page_no
|
||||||
|
text_len = 0
|
||||||
for row_idx, row in enumerate(table.rows):
|
for row_idx, row in enumerate(table.rows):
|
||||||
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
||||||
col_idx = 0
|
col_idx = 0
|
||||||
@ -531,6 +629,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
_log.debug(f" spanned before row {spanned_idx}")
|
_log.debug(f" spanned before row {spanned_idx}")
|
||||||
|
|
||||||
|
# If this cell has text then cancel a pending hard break
|
||||||
|
if cell.text:
|
||||||
|
self.prev_hard_break = False
|
||||||
|
text_len += len(cell.text)
|
||||||
|
|
||||||
table_cell = TableCell(
|
table_cell = TableCell(
|
||||||
text=cell.text,
|
text=cell.text,
|
||||||
row_span=spanned_idx - row_idx,
|
row_span=spanned_idx - row_idx,
|
||||||
@ -545,8 +648,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
data.table_cells.append(table_cell)
|
data.table_cells.append(table_cell)
|
||||||
col_idx += cell.grid_span
|
col_idx += cell.grid_span
|
||||||
|
|
||||||
|
# NOTE: Page numbers will be very inaccurate since a
|
||||||
|
# table can definitely be split across pages (but
|
||||||
|
# individual TableCells have no provenance and thus no
|
||||||
|
# page number)
|
||||||
|
if start_page:
|
||||||
|
soft_break = row._element.findall(
|
||||||
|
".//w:lastRenderedPageBreak", row._element.nsmap
|
||||||
|
)
|
||||||
|
_log.debug("row (page breaks %r): %s", soft_break, row)
|
||||||
|
if soft_break and not self.prev_hard_break:
|
||||||
|
self.page_no += 1
|
||||||
|
doc.add_page(page_no=self.page_no, size=NO_SIZE)
|
||||||
|
|
||||||
|
if start_page:
|
||||||
|
prov = ProvenanceItem(
|
||||||
|
page_no=start_page, bbox=NO_BBOX, charspan=(0, text_len)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prov = None
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
doc.add_table(data=data, parent=self.parents[level - 1], prov=prov)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_pictures(
|
def handle_pictures(
|
||||||
@ -563,20 +685,36 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return image_data
|
return image_data
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
|
prov = None
|
||||||
# Open the BytesIO object with PIL to create an Image
|
# Open the BytesIO object with PIL to create an Image
|
||||||
try:
|
try:
|
||||||
image_data = get_docx_image(drawing_blip)
|
image_data = get_docx_image(drawing_blip)
|
||||||
image_bytes = BytesIO(image_data)
|
image_bytes = BytesIO(image_data)
|
||||||
pil_image = Image.open(image_bytes)
|
pil_image = Image.open(image_bytes)
|
||||||
|
if self.page_no:
|
||||||
|
width, height = pil_image.size
|
||||||
|
prov = ProvenanceItem(
|
||||||
|
page_no=self.page_no,
|
||||||
|
bbox=BoundingBox(l=0, t=0, r=width, b=height),
|
||||||
|
charspan=(0, 0),
|
||||||
|
)
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
caption=None,
|
caption=None,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
except (UnidentifiedImageError, OSError) as e:
|
except (UnidentifiedImageError, OSError) as e:
|
||||||
_log.warning("Warning: image cannot be loaded by Pillow")
|
_log.warning("Warning: image cannot be loaded by Pillow")
|
||||||
|
if self.page_no:
|
||||||
|
prov = ProvenanceItem(
|
||||||
|
page_no=self.page_no,
|
||||||
|
bbox=NO_BBOX,
|
||||||
|
charspan=(0, 0),
|
||||||
|
)
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
caption=None,
|
caption=None,
|
||||||
|
prov=prov,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -93,7 +93,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
17
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Summer activities",
|
"orig": "Summer activities",
|
||||||
"text": "Summer activities"
|
"text": "Summer activities"
|
||||||
},
|
},
|
||||||
@ -117,7 +132,22 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"label": "title",
|
"label": "title",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
20
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Swimming in the lake",
|
"orig": "Swimming in the lake",
|
||||||
"text": "Swimming in the lake"
|
"text": "Swimming in the lake"
|
||||||
},
|
},
|
||||||
@ -128,7 +158,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
4
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Duck",
|
"orig": "Duck",
|
||||||
"text": "Duck"
|
"text": "Duck"
|
||||||
},
|
},
|
||||||
@ -139,7 +184,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
33
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Figure 1: This is a cute duckling",
|
"orig": "Figure 1: This is a cute duckling",
|
||||||
"text": "Figure 1: This is a cute duckling"
|
"text": "Figure 1: This is a cute duckling"
|
||||||
},
|
},
|
||||||
@ -169,7 +229,22 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"label": "section_header",
|
"label": "section_header",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
11
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Let\u2019s swim!",
|
"orig": "Let\u2019s swim!",
|
||||||
"text": "Let\u2019s swim!",
|
"text": "Let\u2019s swim!",
|
||||||
"level": 1
|
"level": 1
|
||||||
@ -181,7 +256,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
77
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||||
"text": "To get started with swimming, first lay down in a water and try not to drown:"
|
"text": "To get started with swimming, first lay down in a water and try not to drown:"
|
||||||
},
|
},
|
||||||
@ -192,7 +282,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
29
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "You can relax and look around",
|
"orig": "You can relax and look around",
|
||||||
"text": "You can relax and look around",
|
"text": "You can relax and look around",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -205,7 +310,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
12
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Paddle about",
|
"orig": "Paddle about",
|
||||||
"text": "Paddle about",
|
"text": "Paddle about",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -218,7 +338,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
19
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Enjoy summer warmth",
|
"orig": "Enjoy summer warmth",
|
||||||
"text": "Enjoy summer warmth",
|
"text": "Enjoy summer warmth",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -231,7 +366,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
19
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Also, don\u2019t forget:",
|
"orig": "Also, don\u2019t forget:",
|
||||||
"text": "Also, don\u2019t forget:"
|
"text": "Also, don\u2019t forget:"
|
||||||
},
|
},
|
||||||
@ -242,7 +392,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
15
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Wear sunglasses",
|
"orig": "Wear sunglasses",
|
||||||
"text": "Wear sunglasses",
|
"text": "Wear sunglasses",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -255,7 +420,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
27
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Don\u2019t forget to drink water",
|
"orig": "Don\u2019t forget to drink water",
|
||||||
"text": "Don\u2019t forget to drink water",
|
"text": "Don\u2019t forget to drink water",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -268,7 +448,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
13
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Use sun cream",
|
"orig": "Use sun cream",
|
||||||
"text": "Use sun cream",
|
"text": "Use sun cream",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -281,7 +476,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
15
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Hmm, what else\u2026",
|
"orig": "Hmm, what else\u2026",
|
||||||
"text": "Hmm, what else\u2026"
|
"text": "Hmm, what else\u2026"
|
||||||
},
|
},
|
||||||
@ -314,7 +524,22 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"label": "section_header",
|
"label": "section_header",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
9
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Let\u2019s eat",
|
"orig": "Let\u2019s eat",
|
||||||
"text": "Let\u2019s eat",
|
"text": "Let\u2019s eat",
|
||||||
"level": 2
|
"level": 2
|
||||||
@ -326,7 +551,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
85
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice",
|
"orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice",
|
||||||
"text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice"
|
"text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice"
|
||||||
},
|
},
|
||||||
@ -337,7 +577,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
20
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "I like to eat leaves",
|
"orig": "I like to eat leaves",
|
||||||
"text": "I like to eat leaves"
|
"text": "I like to eat leaves"
|
||||||
},
|
},
|
||||||
@ -348,7 +603,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
61
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Here are some interesting things a respectful duck could eat:",
|
"orig": "Here are some interesting things a respectful duck could eat:",
|
||||||
"text": "Here are some interesting things a respectful duck could eat:"
|
"text": "Here are some interesting things a respectful duck could eat:"
|
||||||
},
|
},
|
||||||
@ -359,7 +629,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "",
|
"orig": "",
|
||||||
"text": ""
|
"text": ""
|
||||||
},
|
},
|
||||||
@ -370,7 +655,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
38
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "And let\u2019s add another list in the end:",
|
"orig": "And let\u2019s add another list in the end:",
|
||||||
"text": "And let\u2019s add another list in the end:"
|
"text": "And let\u2019s add another list in the end:"
|
||||||
},
|
},
|
||||||
@ -381,7 +681,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
6
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Leaves",
|
"orig": "Leaves",
|
||||||
"text": "Leaves",
|
"text": "Leaves",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -394,7 +709,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
7
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Berries",
|
"orig": "Berries",
|
||||||
"text": "Berries",
|
"text": "Berries",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -407,7 +737,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
5
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"orig": "Grain",
|
"orig": "Grain",
|
||||||
"text": "Grain",
|
"text": "Grain",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
@ -422,7 +767,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "picture",
|
"label": "picture",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 397.0,
|
||||||
|
"b": 397.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -446,7 +806,22 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"prov": [],
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 2,
|
||||||
|
"bbox": {
|
||||||
|
"l": 0.0,
|
||||||
|
"t": 0.0,
|
||||||
|
"r": 0.0,
|
||||||
|
"b": 0.0,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
120
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"captions": [],
|
"captions": [],
|
||||||
"references": [],
|
"references": [],
|
||||||
"footnotes": [],
|
"footnotes": [],
|
||||||
@ -757,5 +1132,20 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"key_value_items": [],
|
"key_value_items": [],
|
||||||
"pages": {}
|
"pages": {
|
||||||
|
"1": {
|
||||||
|
"size": {
|
||||||
|
"width": 612.0,
|
||||||
|
"height": 792.0
|
||||||
|
},
|
||||||
|
"page_no": 1
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"size": {
|
||||||
|
"width": 612.0,
|
||||||
|
"height": 792.0
|
||||||
|
},
|
||||||
|
"page_no": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
@ -6,9 +6,11 @@ from docling.backend.msword_backend import MsWordDocumentBackend
|
|||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
|
TextItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
@ -40,6 +42,27 @@ def test_heading_levels():
|
|||||||
assert found_lvl_1 and found_lvl_2
|
assert found_lvl_1 and found_lvl_2
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_breaks():
    """Each sample document converts to two pages with page provenance on every item."""
    docx_dir = Path("tests/data/docx")
    sample_names = (
        "unit_test_headers.docx",
        "unit_test_lists.docx",
        "word_sample.docx",
    )
    for sample in sample_names:
        source = docx_dir / sample
        input_doc = InputDocument(
            path_or_stream=source,
            format=InputFormat.DOCX,
            backend=MsWordDocumentBackend,
        )
        backend = MsWordDocumentBackend(in_doc=input_doc, path_or_stream=source)
        converted = backend.convert()
        assert backend.has_pagination()
        # These all have two pages
        assert len(converted.pages) == 2
        # Every item must carry provenance with a non-zero page number.
        for node, _level in converted.iterate_items():
            assert node.prov
            assert node.prov[0].page_no
|
||||||
|
|
||||||
|
|
||||||
def get_docx_paths():
|
def get_docx_paths():
|
||||||
|
|
||||||
# Define the directory you want to search
|
# Define the directory you want to search
|
||||||
|
Loading…
Reference in New Issue
Block a user