fix(docx): slow table parsing (#2553)

* chore(docx): remove unnecessary import

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(docx): simplify parsing of simple tables

Simplify the parsing of tables with just text (no rich cells).
Move nested function group_cell_elements out of _handle_tables for readability.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): reuse method for finding inline pictures

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): format strikethrough text

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* tests(docx): use fixtures to avoid converting same file multiple times

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(docx): remove unnecessary argument docx_obj in functions

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* tests(docx): add test for rich table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): small improvements in backend and its unit tests

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(docx): parse superscript and subscript formatted text

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-11-06 05:25:53 +01:00
committed by GitHub
parent 0ba8d5d9e3
commit ef623ffcee
6 changed files with 3366 additions and 218 deletions

View File

@@ -3,7 +3,7 @@ import re
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, Final, Optional, Union
from docling_core.types.doc import (
DocItemLabel,
@@ -17,9 +17,9 @@ from docling_core.types.doc import (
RichTableCell,
TableCell,
TableData,
TextItem,
TableItem,
)
from docling_core.types.doc.document import Formatting
from docling_core.types.doc.document import Formatting, Script
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@@ -36,7 +36,6 @@ from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.docx.drawingml.utils import (
get_docx_to_pdf_converter,
get_libreoffice_cmd,
get_pil_from_dml_docx,
)
from docling.backend.docx.latex.omml import oMath2Latex
@@ -47,6 +46,18 @@ _log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend):
_BLIP_NAMESPACES: Final = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
"v": "urn:schemas-microsoft-com:vml",
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
"w10": "urn:schemas-microsoft-com:office:word",
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
}
@override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
@@ -58,6 +69,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.xml_namespaces = {
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
}
self.blip_xpath_expr = etree.XPath(
".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
)
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
@@ -133,8 +147,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
# doc, _ = doc_info
doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
return doc
else:
raise RuntimeError(
@@ -192,7 +206,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _walk_linear(
self,
body: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
# parent:
) -> tuple[DoclingDocument, list[RefItem]]:
@@ -200,20 +213,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
for element in body:
tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements)
namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
"v": "urn:schemas-microsoft-com:vml",
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
"w10": "urn:schemas-microsoft-com:office:word",
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
}
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element)
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
drawing_blip = self.blip_xpath_expr(element)
drawingml_els = element.findall(
".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
)
# Check for textbox content - check multiple textbox formats
# Only process if the element hasn't been processed before
@@ -221,7 +224,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if element_id not in self.processed_textbox_elements:
# Modern Word textboxes
txbx_xpath = etree.XPath(
".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
".//w:txbxContent|.//v:textbox//w:p",
namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
)
textbox_elements = txbx_xpath(element)
@@ -230,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Additional checks for textboxes in DrawingML and VML formats
alt_txbx_xpath = etree.XPath(
".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
namespaces=namespaces,
namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
)
textbox_elements = alt_txbx_xpath(element)
@@ -238,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if not textbox_elements:
shape_text_xpath = etree.XPath(
".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
namespaces=namespaces,
namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
)
shape_text_elements = shape_text_xpath(element)
if shape_text_elements:
@@ -272,26 +276,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug(
f"Found textbox content with {len(textbox_elements)} elements"
)
tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
tbc = self._handle_textbox_content(textbox_elements, doc)
added_elements.extend(tbc)
# Check for Tables
if element.tag.endswith("tbl"):
if tag_name == "tbl":
try:
t = self._handle_tables(element, docx_obj, doc)
t = self._handle_tables(element, doc)
added_elements.extend(t)
except Exception:
_log.debug("could not parse a table, broken docx table")
# Check for Image
elif drawing_blip:
pics = self._handle_pictures(docx_obj, drawing_blip, doc)
pics = self._handle_pictures(drawing_blip, doc)
added_elements.extend(pics)
# Check for Text after the Image
if (
tag_name in ["p"]
and element.find(".//w:t", namespaces=namespaces) is not None
tag_name == "p"
and element.find(
".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
)
is not None
):
te1 = self._handle_text_elements(element, docx_obj, doc)
te1 = self._handle_text_elements(element, doc)
added_elements.extend(te1)
# Check for DrawingML elements
elif drawingml_els:
@@ -314,18 +321,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
elif tag_name == "sdt":
sdt_content = element.find(
".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
)
if sdt_content is not None:
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
paragraphs = sdt_content.findall(
".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
)
for p in paragraphs:
te = self._handle_text_elements(p, docx_obj, doc)
te = self._handle_text_elements(p, doc)
added_elements.extend(te)
# Check for Text
elif tag_name in ["p"]:
elif tag_name == "p":
# "tcPr", "sectPr"
te = self._handle_text_elements(element, docx_obj, doc)
te = self._handle_text_elements(element, doc)
added_elements.extend(te)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -384,16 +395,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
for key in keys_to_reset:
self.list_counters[key] = 0
def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
"""Check if a list is numbered based on its numFmt value."""
try:
# Access the numbering part of the document
if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
if not hasattr(self.docx_obj, "part") or not hasattr(
self.docx_obj.part, "package"
):
return False
numbering_part = None
# Find the numbering part
for part in docx_obj.part.package.parts:
for part in self.docx_obj.part.package.parts:
if "numbering" in part.partname:
numbering_part = part
break
@@ -523,15 +536,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
# The .bold and .italic properties are booleans, but .underline can be an enum
# like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
has_bold = run.bold or False
has_italic = run.italic or False
is_bold = run.bold or False
is_italic = run.italic or False
is_strikethrough = run.font.strike or False
# Convert any non-None underline value to True
has_underline = bool(run.underline is not None and run.underline)
is_underline = bool(run.underline is not None and run.underline)
is_sub = run.font.subscript or False
is_sup = run.font.superscript or False
script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE
return Formatting(
bold=has_bold,
italic=has_italic,
underline=has_underline,
bold=is_bold,
italic=is_italic,
underline=is_underline,
strikethrough=is_strikethrough,
script=script,
)
def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -724,7 +743,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _handle_textbox_content(
self,
textbox_elements: list,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> list[RefItem]:
elem_ref: list[RefItem] = []
@@ -766,7 +784,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Process all the paragraphs
for p, position in all_paragraphs:
# Create paragraph object to get text content
paragraph = Paragraph(p, docx_obj)
paragraph = Paragraph(p, self.docx_obj)
text_content = paragraph.text
# Create a unique identifier based on content and position
@@ -782,7 +800,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Mark this paragraph as processed
processed_paragraphs.add(paragraph_id)
elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
elem_ref.extend(self._handle_text_elements(p, doc))
# Restore original parent
self.parents[level] = original_parent
@@ -854,11 +872,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _handle_text_elements(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> list[RefItem]:
elem_ref: list[RefItem] = []
paragraph = Paragraph(element, docx_obj)
paragraph = Paragraph(element, self.docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
element=element, text=paragraph.text
@@ -884,7 +901,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and p_style_id not in ["Title", "Heading"]
):
# Check if this is actually a numbered list by examining the numFmt
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
is_numbered = self._is_numbered_list(numid, ilevel)
li = self._add_list_item(
doc=doc,
@@ -1239,14 +1256,35 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return elem_ref
@staticmethod
def _group_cell_elements(
    group_name: str,
    doc: DoclingDocument,
    provs_in_cell: list[RefItem],
    docling_table: TableItem,
) -> RefItem:
    """Re-parent the items of one table cell under a new group node.

    A group labelled ``GroupLabel.UNSPECIFIED`` is added to ``doc`` as a child
    of ``docling_table``. Every reference in ``provs_in_cell`` is appended to
    the group's children, detached from the children of its current parent
    (if listed there) and given the group as its new parent.

    Args:
        group_name: Name assigned to the new group.
        doc: Document that owns the items and receives the group.
        provs_in_cell: References to the items found in the cell.
        docling_table: Table item that becomes the parent of the group.

    Returns:
        A reference to the newly created group.
    """
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=group_name,
        parent=docling_table,
    )
    for item_ref in provs_in_cell:
        cell_group.children.append(item_ref)
        node = item_ref.resolve(doc)
        previous_parent = node.parent.resolve(doc)
        # Detach the item from wherever the linear walk attached it
        if node.get_ref() in previous_parent.children:
            previous_parent.children.remove(node.get_ref())
        node.parent = cell_group.get_ref()
    return cell_group.get_ref()
def _handle_tables(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> list[RefItem]:
elem_ref: list[RefItem] = []
table: Table = Table(element, docx_obj)
table: Table = Table(element, self.docx_obj)
num_rows = len(table.rows)
num_cols = len(table.columns)
_log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
@@ -1255,7 +1293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
cell_element = table.rows[0].cells[0]
# In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body
self._walk_linear(cell_element._element, docx_obj, doc)
self._walk_linear(cell_element._element, doc)
return elem_ref
data = TableData(num_rows=num_rows, num_cols=num_cols)
@@ -1300,52 +1338,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text = text.replace("<eq>", "$").replace("</eq>", "$")
provs_in_cell: list[RefItem] = []
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
ref_for_rich_cell = provs_in_cell[0]
rich_table_cell = False
rich_table_cell: bool = self._is_rich_table_cell(cell)
def group_cell_elements(
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
) -> RefItem:
group_element = doc.add_group(
label=GroupLabel.UNSPECIFIED,
name=group_name,
parent=docling_table,
)
for prov in provs_in_cell:
group_element.children.append(prov)
pr_item = prov.resolve(doc)
item_parent = pr_item.parent.resolve(doc)
if pr_item.get_ref() in item_parent.children:
item_parent.children.remove(pr_item.get_ref())
pr_item.parent = group_element.get_ref()
ref_for_rich_cell = group_element.get_ref()
return ref_for_rich_cell
if rich_table_cell:
_, provs_in_cell = self._walk_linear(cell._element, doc)
_log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}")
if len(provs_in_cell) > 1:
if len(provs_in_cell) > 0:
# Cell has multiple elements, we need to group them
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = group_cell_elements(
group_name, doc, provs_in_cell
ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
group_name, doc, provs_in_cell, docling_table
)
elif len(provs_in_cell) == 1:
item_ref = provs_in_cell[0]
pr_item = item_ref.resolve(doc)
if isinstance(pr_item, TextItem):
# Cell has only one element and it's just a text
rich_table_cell = False
doc.delete_items(node_items=[pr_item])
else:
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = group_cell_elements(
group_name, doc, provs_in_cell
)
else:
rich_table_cell = False
if rich_table_cell:
rich_cell = RichTableCell(
text=text,
@@ -1377,17 +1383,79 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
col_idx += cell.grid_span
return elem_ref
def _is_rich_table_cell(self, cell: _Cell) -> bool:
    """Determine whether a docx cell should be parsed as a Docling RichTableCell.

    A docx cell can hold rich content and be parsed with a Docling RichTableCell.
    However, this requires walking through the lxml elements and creating
    node items. If the cell holds only plain text, the parsing is simpler and
    using a TableCell is preferred.

    A cell is considered plain text when all of the following hold:
        - It has at most one direct paragraph child.
        - Its direct children are only paragraphs or table-cell properties.
        - None of its children contains an inline picture (blip) or a drawing.
        - Every run that is a direct child of the paragraph yields the default
          ``Formatting()`` (no Docling formatting needed).

    Args:
        cell: A docx cell

    Returns:
        Whether the docx cell should be parsed as RichTableCell
    """
    namespaces = MsWordDocumentBackend._BLIP_NAMESPACES
    # Clark-notation prefix for WordprocessingML tags, e.g. "{uri}p"
    w_prefix = "{" + namespaces["w"] + "}"
    tc = cell._tc

    # must contain only one paragraph
    paragraphs = list(tc.iterchildren(w_prefix + "p"))
    if len(paragraphs) > 1:
        return True

    # no other content: only paragraph or table-cell properties, and no
    # pictures or drawings nested anywhere below them
    allowed_tags = {"p", "tcPr"}
    for child in tc:
        if child.tag.split("}")[-1] not in allowed_tags:
            return True
        if self.blip_xpath_expr(child):
            return True
        if child.find(".//w:drawing", namespaces=namespaces) is not None:
            return True

    # paragraph must contain runs that need no formatting
    plain_format = Formatting()
    for para in paragraphs:
        for rn in para.iterchildren(w_prefix + "r"):
            run = Run(rn, self.docx_obj)
            if MsWordDocumentBackend._get_format_from_run(run) != plain_format:
                return True

    # All checks passed: plain text only
    return False
def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
self, drawing_blip: Any, doc: DoclingDocument
) -> list[RefItem]:
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
image_data: Optional[bytes] = None
rId = drawing_blip[0].get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
if rId in docx_obj.part.rels:
if rId in self.docx_obj.part.rels:
# Access the image part using the relationship ID
image_part = docx_obj.part.rels[rId].target_part
image_part = self.docx_obj.part.rels[rId].target_part
image_data = image_part.blob # Get the binary image data
return image_data

BIN
tests/data/docx/docx_rich_cells.docx vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,107 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0
item-2 at level 2: section: group header-1
item-3 at level 3: section_header: Table with rich cells
item-4 at level 4: table with [4x2]
item-5 at level 5: unspecified: group rich_cell_group_1_0_1
item-6 at level 6: text: This is a list:
item-7 at level 6: list: group list
item-8 at level 7: list_item: A First
item-9 at level 7: list_item: A Second
item-10 at level 7: list_item: A Third
item-11 at level 5: unspecified: group rich_cell_group_1_1_1
item-12 at level 6: text: This is a formatted list:
item-13 at level 6: list: group list
item-14 at level 7: list_item:
item-15 at level 8: inline: group group
item-16 at level 9: text: B
item-17 at level 9: text: First
item-18 at level 7: list_item:
item-19 at level 8: inline: group group
item-20 at level 9: text: B
item-21 at level 9: text: Second
item-22 at level 7: list_item:
item-23 at level 8: inline: group group
item-24 at level 9: text: B
item-25 at level 9: text: Third
item-26 at level 5: unspecified: group rich_cell_group_1_0_2
item-27 at level 6: text: First Paragraph
Second Paragraph
item-28 at level 6: text: Third paragraph before a numbered list
item-29 at level 6: list: group list
item-30 at level 7: list_item: Number one
item-31 at level 7: list_item: Number two
item-32 at level 7: list_item: Number three
item-33 at level 5: unspecified: group rich_cell_group_1_1_2
item-34 at level 6: text: This is simple text with
item-35 at level 6: text: bold
item-36 at level 6: text: ,
item-37 at level 6: text: strikethrough
item-38 at level 6: text: and
item-39 at level 6: text: italic
item-40 at level 6: text: formatting with x
item-41 at level 6: text: 2
item-42 at level 6: text: and H
item-43 at level 6: text: 2
item-44 at level 6: text: O
item-45 at level 5: unspecified: group rich_cell_group_1_0_3
item-46 at level 6: text: This is a paragraph
item-47 at level 6: text: This is another paragraph
item-48 at level 4: inline: group group
item-49 at level 4: text:
item-50 at level 4: text:
item-51 at level 4: text:
item-52 at level 4: text:
item-53 at level 4: text:
item-54 at level 4: text:
item-55 at level 3: section_header: Table with nested table
item-56 at level 4: text: Before table
item-57 at level 4: table with [3x2]
item-58 at level 5: unspecified: group rich_cell_group_2_1_1
item-59 at level 6: text: Simple cell with
item-60 at level 6: text: bold
item-61 at level 6: text: and
item-62 at level 6: text: italic
item-63 at level 6: text: text
item-64 at level 5: unspecified: group rich_cell_group_3_0_2
item-65 at level 6: table with [2x3]
item-66 at level 7: unspecified: group rich_cell_group_3_0_1
item-67 at level 8: text: Cell 1
item-68 at level 7: unspecified: group rich_cell_group_3_1_1
item-69 at level 8: text: Cell 2
item-70 at level 7: unspecified: group rich_cell_group_3_2_1
item-71 at level 8: text: Cell 3
item-72 at level 6: text:
item-73 at level 5: unspecified: group rich_cell_group_4_1_2
item-74 at level 6: text: Rich cell
A nested table
item-75 at level 6: table with [2x3]
item-76 at level 7: unspecified: group rich_cell_group_4_0_1
item-77 at level 8: text: Cell 1
item-78 at level 7: unspecified: group rich_cell_group_4_1_1
item-79 at level 8: text: Cell 2
item-80 at level 7: unspecified: group rich_cell_group_4_2_1
item-81 at level 8: text: Cell 3
item-82 at level 6: text:
item-83 at level 4: inline: group group
item-84 at level 4: inline: group group
item-85 at level 5: text: After table with
item-86 at level 5: text: bold
item-87 at level 5: text: ,
item-88 at level 5: text: underline
item-89 at level 5: text: ,
item-90 at level 5: text: strikethrough
item-91 at level 5: text: , and
item-92 at level 5: text: italic
item-93 at level 5: text: formatting
item-94 at level 4: text:
item-95 at level 3: section_header: Table with pictures
item-96 at level 4: text:
item-97 at level 4: table with [3x2]
item-98 at level 5: unspecified: group rich_cell_group_5_1_1
item-99 at level 6: picture
item-100 at level 5: unspecified: group rich_cell_group_5_0_2
item-101 at level 6: text: Text and picture
item-102 at level 6: picture
item-103 at level 4: text:

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,25 @@
### Table with rich cells
| Column A | Column B |
|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------|
| This is a list: - A First - A Second - A Third | This is a formatted list: - B **First** - B *Second* - B Third |
| First Paragraph Second Paragraph Third paragraph before a numbered list 1. Number one 2. Number two 3. Number three | This is simple text with **bold** , ~~strikethrough~~ and *italic* formatting with x 2 and H 2 O |
| This is a paragraph This is another paragraph | |
### Table with nested table
Before table
| Column A | Column B |
|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
| Simple cell upper left | Simple cell with **bold** and *italic* text |
| | A | B | C | |----------|--------|------------| | *Cell 1* | Cell 2 | **Cell 3** | | Rich cell A nested table | A | B | C | |------------|--------------|--------| | ~~Cell 1~~ | ***Cell 2*** | Cell 3 | |
After table with **bold** , underline , ~~strikethrough~~ , and *italic* formatting
### Table with pictures
| Column A | Column B |
|----------------------------------|----------------|
| Only text | <!-- image --> |
| Text and picture <!-- image --> | |

View File

@@ -1,3 +1,4 @@
import logging
import os
from pathlib import Path
@@ -18,23 +19,109 @@ from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
_log = logging.getLogger(__name__)
GENERATE = GEN_TEST_DATA
IS_CI = bool(os.getenv("CI"))
@pytest.fixture(scope="module")
def docx_paths() -> list[Path]:
    """Return the sorted paths of all docx files below the test data directory."""
    # rglob also descends into subdirectories
    return sorted(Path("./tests/data/docx/").rglob("*.docx"))
def get_converter():
    """Build a document converter that only accepts the DOCX input format."""
    return DocumentConverter(allowed_formats=[InputFormat.DOCX])
@pytest.fixture(scope="module")
def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]:
    """Convert every docx test file once per module.

    Args:
        docx_paths: Paths of the docx files to convert.

    Returns:
        One ``(ground-truth base path, converted document)`` pair per input
        file. The ground-truth base path is
        ``<grandparent of the docx file>/groundtruth/docling_v2/<docx file name>``.
    """
    converted: list[tuple[Path, DoclingDocument]] = []
    converter = get_converter()
    for docx_path in docx_paths:
        _log.debug(f"converting {docx_path}")

        gt_path = (
            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
        )

        conv_result: ConversionResult = converter.convert(docx_path)
        doc: DoclingDocument = conv_result.document

        # Report the input file, which is what failed to convert
        assert doc, f"Failed to convert document from file {docx_path}"
        converted.append((gt_path, doc))

    return converted
def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument]]):
    """Check the exports of converted documents against their ground-truth files.

    Args:
        docx_paths: Pairs of (ground-truth base path, converted document). The
            export-specific suffix (".md", ".itxt", ".json", ".html") is
            appended to the base path to locate each ground-truth file.
    """
    # Any failure while looking up LibreOffice counts as "not installed"
    try:
        has_libreoffice = get_libreoffice_cmd(raise_if_unavailable=True) is not None
    except Exception:
        has_libreoffice = False

    # Outside CI, drawingml.docx is skipped when LibreOffice is missing
    may_skip_drawingml = not (IS_CI or has_libreoffice)

    for docx_path, doc in docx_paths:
        if may_skip_drawingml and docx_path.name == "drawingml.docx":
            print(f"Skipping {docx_path} because no Libreoffice is installed.")
            continue

        gt_base = str(docx_path)

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, gt_base + ".md", generate=GENERATE), (
            f"export to markdown failed on {docx_path}"
        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(pred_itxt, gt_base + ".itxt", generate=GENERATE), (
            f"export to indented-text failed on {docx_path}"
        )

        assert verify_document(doc, gt_base + ".json", generate=GENERATE), (
            f"DoclingDocument verification failed on {docx_path}"
        )

        # The HTML export is only verified for word_tables.docx
        if docx_path.name != "word_tables.docx":
            continue
        pred_html: str = doc.export_to_html()
        assert verify_export(
            pred_text=pred_html,
            gtfile=gt_base + ".html",
            generate=GENERATE,
        ), f"export to html failed on {docx_path}"
flaky_file = "textbox.docx"
def test_e2e_docx_conversions(documents):
    """Run the end-to-end export checks on every document except the flaky one."""
    stable_docs = [entry for entry in documents if entry[0].name != flaky_file]
    _test_e2e_docx_conversions_impl(stable_docs)
@pytest.mark.xfail(strict=False)
def test_textbox_extraction():
in_path = Path("tests/data/docx/textbox.docx")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.DOCX,
backend=MsWordDocumentBackend,
)
backend = MsWordDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
def test_textbox_conversion(documents):
    """Run the end-to-end export checks on the flaky document only."""
    flaky_docs = [entry for entry in documents if entry[0].name == flaky_file]
    _test_e2e_docx_conversions_impl(flaky_docs)
@pytest.mark.xfail(strict=False)
def test_textbox_extraction(documents):
name = "textbox.docx"
doc = next(item[1] for item in documents if item[0].name == name)
# Verify if a particular textbox content is extracted
textbox_found = False
@@ -44,18 +131,9 @@ def test_textbox_extraction():
assert textbox_found
def test_heading_levels():
in_path = Path("tests/data/docx/word_sample.docx")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.DOCX,
backend=MsWordDocumentBackend,
)
backend = MsWordDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
def test_heading_levels(documents):
name = "word_sample.docx"
doc = next(item[1] for item in documents if item[0].name == name)
found_lvl_1 = found_lvl_2 = False
for item, _ in doc.iterate_items():
@@ -69,104 +147,11 @@ def test_heading_levels():
assert found_lvl_1 and found_lvl_2
def get_docx_paths():
# Define the directory you want to search
directory = Path("./tests/data/docx/")
def test_text_after_image_anchors(documents):
"""Test to analyse whether text gets parsed after image anchors."""
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.docx"))
return pdf_files
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
return converter
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
converter = get_converter()
has_libreoffice = False
try:
cmd = get_libreoffice_cmd(raise_if_unavailable=True)
if cmd is not None:
has_libreoffice = True
except Exception:
pass
for docx_path in docx_paths:
if (
not IS_CI
and not has_libreoffice
and str(docx_path) in ("tests/data/docx/drawingml.docx",)
):
print(f"Skipping {docx_path} because no Libreoffice is installed.")
continue
gt_path = (
docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
)
conv_result: ConversionResult = converter.convert(docx_path)
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
f"export to markdown failed on {docx_path}"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
f"export to indented-text failed on {docx_path}"
)
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
f"DoclingDocument verification failed on {docx_path}"
)
if docx_path.name == "word_tables.docx":
pred_html: str = doc.export_to_html()
assert verify_export(
pred_text=pred_html,
gtfile=str(gt_path) + ".html",
generate=GENERATE,
), f"export to html failed on {docx_path}"
flaky_path = Path("tests/data/docx/textbox.docx")
def test_e2e_docx_conversions():
_test_e2e_docx_conversions_impl(
docx_paths=[path for path in get_docx_paths() if path != flaky_path]
)
@pytest.mark.xfail(strict=False)
def test_textbox_conversion():
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
def test_text_after_image_anchors():
"""
Test to analyse whether text gets parsed after image anchors.
"""
in_path = Path("tests/data/docx/word_image_anchors.docx")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.DOCX,
backend=MsWordDocumentBackend,
)
backend = MsWordDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
name = "word_image_anchors.docx"
doc = next(item[1] for item in documents if item[0].name == name)
found_text_after_anchor_1 = found_text_after_anchor_2 = (
found_text_after_anchor_3
@@ -188,3 +173,38 @@ def test_text_after_image_anchors():
and found_text_after_anchor_3
and found_text_after_anchor_4
)
def test_is_rich_table_cell(docx_paths):
    """Test the function is_rich_table_cell.

    The cells of every table returned by the document are visited row by row
    and compared with a flat list of expected results. The test also fails
    when the number of visited cells differs from the number of expected
    values.
    """
    name = "docx_rich_cells.docx"
    path = next(item for item in docx_paths if item.name == name)

    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
        filename=name,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=path,
    )

    gt_cells: list[bool] = []
    # table: Table with rich cells
    gt_cells.extend([False, False, True, True, True, True, True, False])
    # table: Table with nested table
    gt_cells.extend([False, False, False, True, True, True])
    # table: Table with pictures
    gt_cells.extend([False, False, False, True, True, False])

    gt_it = iter(gt_cells)
    for idx_t, table in enumerate(backend.docx_obj.tables):
        for idx_r, row in enumerate(table.rows):
            for idx_c, cell in enumerate(row.cells):
                # A default avoids a bare StopIteration on unexpected cells
                expected = next(gt_it, None)
                assert expected is not None, (
                    f"No ground truth for cell in table {idx_t}, row {idx_r}, "
                    f"col {idx_c}"
                )
                assert expected == backend._is_rich_table_cell(cell), (
                    f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
                    f"with text: {cell.text}"
                )

    # Every expected value must have been matched against a cell
    assert next(gt_it, None) is None, (
        "Ground truth lists more cells than the document contains"
    )