mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: Rich tables support for HTML backend (#2324)
* Rich tables support for HTML backend Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Decoupling JATS backend from HTML backend, ways of creating tables changed significantly Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * updated and added tests Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Refactored parse_table_data in html_backend into few smaller functions Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Changing scope of few functions in html_backend.py, making them static, when possible Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fix for HTML tables that have tbody and/or thead, now these tables are also properly supported Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
@@ -17,8 +17,11 @@ from docling_core.types.doc import (
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
TableCell,
|
||||
TableData,
|
||||
TableItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer, Formatting, Script
|
||||
@@ -276,10 +279,175 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# reset context
|
||||
self.ctx = _Context()
|
||||
self._walk(content, doc)
|
||||
|
||||
return doc
|
||||
|
||||
def _walk(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
@staticmethod
def group_cell_elements(
    group_name: str,
    doc: DoclingDocument,
    provs_in_cell: list[RefItem],
    docling_table: TableItem,
) -> RefItem:
    """Re-parent the items of one table cell under a new group.

    A group labelled ``GroupLabel.UNSPECIFIED`` is added to ``doc`` as a child
    of ``docling_table``. Every referenced item is appended to the group's
    children, removed from the children of its previous parent (when listed
    there) and has its ``parent`` pointed at the group.

    Args:
        group_name: Name given to the new group.
        doc: Document that owns the items and receives the group.
        provs_in_cell: References to the items created for the cell.
        docling_table: Table item that becomes the parent of the group.

    Returns:
        A reference to the newly created group.
    """
    cell_group = doc.add_group(
        name=group_name,
        label=GroupLabel.UNSPECIFIED,
        parent=docling_table,
    )
    for child_ref in provs_in_cell:
        cell_group.children.append(child_ref)

        child = child_ref.resolve(doc)
        former_parent = child.parent.resolve(doc)
        # Detach from the old parent so the item is not listed twice.
        own_ref = child.get_ref()
        if own_ref in former_parent.children:
            former_parent.children.remove(own_ref)
        child.parent = cell_group.get_ref()

    return cell_group.get_ref()
|
||||
|
||||
@staticmethod
def process_rich_table_cells(
    provs_in_cell: list[RefItem],
    group_name: str,
    doc: DoclingDocument,
    docling_table: TableItem,
) -> tuple[bool, RefItem]:
    """Decide whether a cell is "rich" and produce the reference to use for it.

    A cell whose only item is a ``TextItem`` is treated as a plain cell: that
    item is deleted from ``doc`` and ``(False, <ref of that item>)`` is
    returned. In every other case (several items, or a single non-text item)
    the items are wrapped with ``group_cell_elements`` and
    ``(True, <ref of the group>)`` is returned.

    Args:
        provs_in_cell: References to the items created for the cell; must not
            be empty (the first element is read unconditionally).
        group_name: Name for the group created around rich content.
        doc: Document that owns the items.
        docling_table: Table item the group is attached to.

    Returns:
        A ``(is_rich, ref)`` tuple as described above.
    """
    first_ref = provs_in_cell[0]

    if len(provs_in_cell) == 1:
        sole_item = first_ref.resolve(doc)
        if isinstance(sole_item, TextItem):
            # Cell has only one element and it's just a text
            doc.delete_items(node_items=[sole_item])
            return False, first_ref

    # Multiple elements, or a single non-text element: group them
    group_ref = HTMLDocumentBackend.group_cell_elements(
        group_name, doc, provs_in_cell, docling_table
    )
    return True, group_ref
|
||||
|
||||
def parse_table_data(
    self,
    element: Tag,
    doc: DoclingDocument,
    docling_table: TableItem,
    num_rows: int,
    num_cols: int,
) -> Optional[TableData]:
    """Walk an HTML table and register its cells on ``docling_table``.

    Direct-child ``<thead>``/``<tbody>`` wrappers are unwrapped, then every
    direct-child ``<tr>`` is visited. Each cell's sub-tree is parsed with
    ``self._walk``; the cell is then added through ``doc.add_table_cell`` as a
    ``RichTableCell`` when ``process_rich_table_cells`` reports rich content
    and as a plain ``TableCell`` otherwise.

    Args:
        element: The table tag; it is modified (wrappers unwrapped,
            ``inline-formula`` tags replaced by their ``$$...$$`` text).
        doc: Document that receives the parsed cell content.
        docling_table: Table item the cells are attached to.
        num_rows: Row count used for the occupancy grid.
        num_cols: Column count used for the occupancy grid.

    Returns:
        The ``TableData`` created here. No cells are appended to it in this
        method; cells are added via ``doc.add_table_cell`` instead.
    """
    wrappers = cast(list[Tag], element.find_all(["thead", "tbody"], recursive=False))
    for wrapper in wrappers:
        wrapper.unwrap()

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
    # Occupancy grid: a slot is None until a cell (or a span of one) covers it.
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    start_row_span = 0
    row_idx = -1

    # Non-recursive lookups so that rows/cells of nested tables are not picked up
    for table_row in element("tr", recursive=False):
        if not isinstance(table_row, Tag):
            continue
        row_cells = table_row(["td", "th"], recursive=False)

        # Classify the row: any <td> rules out both header kinds; a <th> with
        # a row span of 1 rules out the row-header case.
        col_header = True
        row_header = True
        for probe in row_cells:
            if not isinstance(probe, Tag):
                continue
            _, probe_row_span = HTMLDocumentBackend._get_cell_spans(probe)
            if probe.name == "td":
                col_header = False
                row_header = False
            elif probe_row_span == 1:
                row_header = False

        if row_header:
            start_row_span += 1
        else:
            row_idx += 1
            start_row_span = 0
        # Grid row on which the cells of this <tr> start.
        first_row = start_row_span + row_idx

        col_idx = 0
        for cell_tag in row_cells:
            if not isinstance(cell_tag, Tag):
                continue

            # extract inline formulas
            for formula in cell_tag("inline-formula"):
                pieces = formula.text.split("$$")
                if len(pieces) == 3:
                    formula.replace_with(NavigableString(f"$${pieces[1]}$$"))

            # Parse table cell sub-tree for Rich Cells content
            cell_refs: list[RefItem] = self._walk(cell_tag, doc)

            is_rich = False
            rich_ref = None
            if len(cell_refs) > 0:
                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
                is_rich, rich_ref = HTMLDocumentBackend.process_rich_table_cells(
                    cell_refs, group_name, doc, docling_table
                )

            # Extracting text
            text = self.get_text(cell_tag).strip()
            col_span, row_span = self._get_cell_spans(cell_tag)
            if row_header:
                row_span -= 1

            # Skip slots already covered by earlier cells or spans
            while col_idx < num_cols and grid[first_row][col_idx] is not None:
                col_idx += 1

            # Mark the covered slots, clipped to the grid bounds
            for grid_row in range(first_row, min(first_row + row_span, num_rows)):
                for grid_col in range(col_idx, min(col_idx + col_span, num_cols)):
                    grid[grid_row][grid_col] = text

            is_row_header_cell = (not col_header) and cell_tag.name == "th"
            if is_rich:
                rich_cell = RichTableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=first_row,
                    end_row_offset_idx=first_row + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
                    row_header=is_row_header_cell,
                    ref=rich_ref,  # points to an artificial group around children
                )
                doc.add_table_cell(table_item=docling_table, cell=rich_cell)
            else:
                simple_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=first_row,
                    end_row_offset_idx=first_row + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
                    row_header=is_row_header_cell,
                )
                doc.add_table_cell(table_item=docling_table, cell=simple_cell)

    return data
|
||||
|
||||
def _walk(self, element: Tag, doc: DoclingDocument) -> list[RefItem]:
|
||||
"""Parse an XML tag by recursively walking its content.
|
||||
|
||||
While walking, the method buffers inline text across tags like <b> or <span>,
|
||||
@@ -289,17 +457,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: The XML tag to parse.
|
||||
doc: The Docling document to be updated with the parsed content.
|
||||
"""
|
||||
added_refs: list[RefItem] = []
|
||||
buffer: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
def flush_buffer():
|
||||
if not buffer:
|
||||
return
|
||||
return added_refs
|
||||
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
|
||||
parts = annotated_text_list.split_by_newline()
|
||||
buffer.clear()
|
||||
|
||||
if not "".join([el.text for el in annotated_text_list]):
|
||||
return
|
||||
return added_refs
|
||||
|
||||
for annotated_text_list in parts:
|
||||
with self._use_inline_group(annotated_text_list, doc):
|
||||
@@ -309,15 +478,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
annotated_text.text.strip()
|
||||
)
|
||||
if annotated_text.code:
|
||||
doc.add_code(
|
||||
docling_code2 = doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=seg_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_refs.append(docling_code2.get_ref())
|
||||
else:
|
||||
doc.add_text(
|
||||
docling_text2 = doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
text=seg_clean,
|
||||
@@ -325,25 +495,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_refs.append(docling_text2.get_ref())
|
||||
|
||||
for node in element.contents:
|
||||
if isinstance(node, Tag):
|
||||
name = node.name.lower()
|
||||
if name == "img":
|
||||
flush_buffer()
|
||||
self._emit_image(node, doc)
|
||||
im_ref3 = self._emit_image(node, doc)
|
||||
added_refs.append(im_ref3)
|
||||
elif name in _FORMAT_TAG_MAP:
|
||||
with self._use_format([name]):
|
||||
self._walk(node, doc)
|
||||
wk = self._walk(node, doc)
|
||||
added_refs.extend(wk)
|
||||
elif name == "a":
|
||||
with self._use_hyperlink(node):
|
||||
self._walk(node, doc)
|
||||
wk2 = self._walk(node, doc)
|
||||
added_refs.extend(wk2)
|
||||
elif name in _BLOCK_TAGS:
|
||||
flush_buffer()
|
||||
self._handle_block(node, doc)
|
||||
blk = self._handle_block(node, doc)
|
||||
added_refs.extend(blk)
|
||||
elif node.find(_BLOCK_TAGS):
|
||||
flush_buffer()
|
||||
self._walk(node, doc)
|
||||
wk3 = self._walk(node, doc)
|
||||
added_refs.extend(wk3)
|
||||
else:
|
||||
buffer.extend(
|
||||
self._extract_text_and_hyperlink_recursively(
|
||||
@@ -363,6 +539,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
flush_buffer()
|
||||
return added_refs
|
||||
|
||||
@staticmethod
|
||||
def _collect_parent_format_tags(item: PageElement) -> list[str]:
|
||||
@@ -581,7 +758,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level -= 1
|
||||
self.content_layer = current_layer
|
||||
|
||||
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
|
||||
added_ref = []
|
||||
tag_name = tag.name.lower()
|
||||
# set default content layer to BODY as soon as we encounter a heading
|
||||
self.content_layer = ContentLayer.BODY
|
||||
@@ -596,12 +774,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for key in self.parents.keys():
|
||||
self.parents[key] = None
|
||||
self.level = 0
|
||||
self.parents[self.level + 1] = doc.add_title(
|
||||
docling_title = self.parents[self.level + 1] = doc.add_title(
|
||||
text_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_ref = [docling_title.get_ref()]
|
||||
# the other levels need to be lowered by 1 if a title was set
|
||||
else:
|
||||
level -= 1
|
||||
@@ -623,7 +802,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
_log.debug(f"Remove the tail of level {key}")
|
||||
self.parents[key] = None
|
||||
self.level = level
|
||||
self.parents[self.level + 1] = doc.add_heading(
|
||||
docling_heading = self.parents[self.level + 1] = doc.add_heading(
|
||||
parent=self.parents[self.level],
|
||||
text=text_clean,
|
||||
orig=annotated_text.text,
|
||||
@@ -632,12 +811,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_ref = [docling_heading.get_ref()]
|
||||
self.level += 1
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
self._emit_image(img_tag, doc)
|
||||
im_ref = self._emit_image(img_tag, doc)
|
||||
added_ref.append(im_ref)
|
||||
return added_ref
|
||||
|
||||
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
|
||||
tag_name = tag.name.lower()
|
||||
start: Optional[int] = None
|
||||
name: str = ""
|
||||
@@ -765,20 +947,50 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
return list_group.get_ref()
|
||||
|
||||
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
@staticmethod
def get_html_table_row_col(tag: Tag) -> tuple[int, int]:
    """Count the rows and columns of an HTML table, taking spans into account.

    Direct-child ``<thead>``/``<tbody>`` wrappers of ``tag`` are unwrapped
    first, so ``tag`` is modified. Only direct-child ``<tr>`` rows and their
    direct-child ``<td>``/``<th>`` cells are inspected, which keeps rows and
    cells of nested tables out of the count.

    A row contributes to the row count as soon as it contains a ``<td>`` or a
    cell whose row span is 1; rows without such a cell (including rows with
    no cells) are not counted. The column count is the largest sum of column
    spans found in any row.

    Args:
        tag: The table tag to measure.

    Returns:
        A ``(num_rows, num_cols)`` tuple.
    """
    for section in cast(list[Tag], tag.find_all(["thead", "tbody"], recursive=False)):
        section.unwrap()

    # Find the number of rows and columns (taking into account spans)
    num_rows: int = 0
    num_cols: int = 0
    for row in tag("tr", recursive=False):
        if not isinstance(row, Tag):
            continue
        col_count = 0
        is_row_header = True
        for cell in row(["td", "th"], recursive=False):
            # Guard the cell itself; checking `row` here would be a no-op
            # because `row` was already verified above.
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    return num_rows, num_cols
|
||||
|
||||
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
|
||||
added_refs = []
|
||||
tag_name = tag.name.lower()
|
||||
|
||||
if tag_name == "figure":
|
||||
img_tag = tag.find("img")
|
||||
if isinstance(img_tag, Tag):
|
||||
self._emit_image(img_tag, doc)
|
||||
im_ref = self._emit_image(img_tag, doc)
|
||||
added_refs.append(im_ref)
|
||||
|
||||
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
||||
self._handle_heading(tag, doc)
|
||||
heading_refs = self._handle_heading(tag, doc)
|
||||
added_refs.extend(heading_refs)
|
||||
|
||||
elif tag_name in {"ul", "ol"}:
|
||||
self._handle_list(tag, doc)
|
||||
list_ref = self._handle_list(tag, doc)
|
||||
added_refs.append(list_ref)
|
||||
|
||||
elif tag_name in {"p", "address", "summary"}:
|
||||
text_list = self._extract_text_and_hyperlink_recursively(
|
||||
@@ -791,15 +1003,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if seg := annotated_text.text.strip():
|
||||
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
||||
if annotated_text.code:
|
||||
doc.add_code(
|
||||
docling_code = doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=seg_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_refs.append(docling_code.get_ref())
|
||||
else:
|
||||
doc.add_text(
|
||||
docling_text = doc.add_text(
|
||||
parent=self.parents[self.level],
|
||||
label=DocItemLabel.TEXT,
|
||||
text=seg_clean,
|
||||
@@ -807,22 +1020,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_refs.append(docling_text.get_ref())
|
||||
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
self._emit_image(img_tag, doc)
|
||||
|
||||
elif tag_name == "table":
|
||||
data = HTMLDocumentBackend.parse_table_data(tag)
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
self._emit_image(tag, doc)
|
||||
if data is not None:
|
||||
doc.add_table(
|
||||
data=data,
|
||||
num_rows, num_cols = self.get_html_table_row_col(tag)
|
||||
data_e = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||
docling_table = doc.add_table(
|
||||
data=data_e,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
added_refs.append(docling_table.get_ref())
|
||||
self.parse_table_data(tag, doc, docling_table, num_rows, num_cols)
|
||||
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref2 = self._emit_image(tag, doc)
|
||||
added_refs.append(im_ref2)
|
||||
|
||||
elif tag_name in {"pre"}:
|
||||
# handle monospace code snippets (pre).
|
||||
@@ -835,13 +1053,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text_clean = HTMLDocumentBackend._clean_unicode(
|
||||
annotated_text.text.strip()
|
||||
)
|
||||
doc.add_code(
|
||||
docling_code2 = doc.add_code(
|
||||
parent=self.parents[self.level],
|
||||
text=text_clean,
|
||||
content_layer=self.content_layer,
|
||||
formatting=annotated_text.formatting,
|
||||
hyperlink=annotated_text.hyperlink,
|
||||
)
|
||||
added_refs.append(docling_code2.get_ref())
|
||||
|
||||
elif tag_name == "footer":
|
||||
with self._use_footer(tag, doc):
|
||||
@@ -850,8 +1069,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif tag_name == "details":
|
||||
with self._use_details(tag, doc):
|
||||
self._walk(tag, doc)
|
||||
return added_refs
|
||||
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
|
||||
figure = img_tag.find_parent("figure")
|
||||
caption: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
@@ -894,11 +1114,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
hyperlink=caption_anno_text.hyperlink,
|
||||
)
|
||||
|
||||
doc.add_picture(
|
||||
docling_pic = doc.add_picture(
|
||||
caption=caption_item,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
return docling_pic.get_ref()
|
||||
|
||||
@staticmethod
|
||||
def get_text(item: PageElement) -> str:
|
||||
@@ -996,106 +1217,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
return int_spans
|
||||
|
||||
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
    """Convert an HTML table tag into ``TableData``.

    The table is scanned twice: once to size it (rows and columns, taking
    spans into account) and once to create a ``TableCell`` per ``<td>``/``<th>``.

    Args:
        element: The table tag. ``inline-formula`` descendants of its cells
            are replaced by their ``$$...$$`` text, so the tag is modified.

    Returns:
        The populated ``TableData``, or ``None`` when ``element`` contains a
        nested table.
    """
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"]):
            # Guard the cell itself; checking `row` here would be a no-op
            # because `row` was already verified above.
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a slot is None until a cell (or a span of one) covers it.
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # Skip slots already covered by earlier cells or spans
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            # Mark the covered slots, ignoring anything outside the grid
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
|
||||
|
||||
@@ -2,9 +2,9 @@ import logging
|
||||
import traceback
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union
|
||||
from typing import Final, Optional, Union, cast
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
@@ -12,6 +12,8 @@ from docling_core.types.doc import (
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from lxml import etree
|
||||
@@ -535,6 +537,110 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return
|
||||
|
||||
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
    """Convert an HTML table tag into ``TableData``.

    The table is scanned twice: once to size it (rows and columns, taking
    spans into account) and once to create a ``TableCell`` per ``<td>``/``<th>``.

    Args:
        element: The table tag. ``inline-formula`` descendants of its cells
            are replaced by their ``$$...$$`` text, so the tag is modified.

    Returns:
        The populated ``TableData``, or ``None`` when ``element`` contains a
        nested table.
    """
    # TODO, see how to implement proper support for rich tables from HTML backend
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"]):
            # Guard the cell itself; checking `row` here would be a no-op
            # because `row` was already verified above.
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a slot is None until a cell (or a span of one) covers it.
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # Skip slots already covered by earlier cells or spans
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            # Mark the covered slots, ignoring anything outside the grid
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
|
||||
|
||||
def _add_table(
|
||||
self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
|
||||
) -> None:
|
||||
@@ -543,8 +649,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
if not isinstance(table_tag, Tag):
|
||||
return
|
||||
|
||||
data = HTMLDocumentBackend.parse_table_data(table_tag)
|
||||
|
||||
data = JatsDocumentBackend.parse_table_data(table_tag)
|
||||
# TODO: format label vs caption once styling is supported
|
||||
label = table_xml_component["label"]
|
||||
caption = table_xml_component["caption"]
|
||||
@@ -554,7 +659,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
if table_text
|
||||
else None
|
||||
)
|
||||
|
||||
if data is not None:
|
||||
doc.add_table(data=data, parent=parent, caption=table_caption)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_01",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_02",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_03",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
@@ -346,7 +346,8 @@
|
||||
"text": "Header 1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -358,7 +359,8 @@
|
||||
"text": "Header 2",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -370,7 +372,8 @@
|
||||
"text": "Header 3",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -382,7 +385,8 @@
|
||||
"text": "Row 1, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -394,7 +398,8 @@
|
||||
"text": "Row 1, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -406,7 +411,8 @@
|
||||
"text": "Row 1, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -418,7 +424,8 @@
|
||||
"text": "Row 2, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -430,7 +437,8 @@
|
||||
"text": "Row 2, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -442,7 +450,8 @@
|
||||
"text": "Row 2, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -454,7 +463,8 @@
|
||||
"text": "Row 3, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -466,7 +476,8 @@
|
||||
"text": "Row 3, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -478,7 +489,8 @@
|
||||
"text": "Row 3, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 4,
|
||||
@@ -495,7 +507,8 @@
|
||||
"text": "Header 1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -507,7 +520,8 @@
|
||||
"text": "Header 2",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -519,7 +533,8 @@
|
||||
"text": "Header 3",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -533,7 +548,8 @@
|
||||
"text": "Row 1, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -545,7 +561,8 @@
|
||||
"text": "Row 1, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -557,7 +574,8 @@
|
||||
"text": "Row 1, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -571,7 +589,8 @@
|
||||
"text": "Row 2, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -583,7 +602,8 @@
|
||||
"text": "Row 2, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -595,7 +615,8 @@
|
||||
"text": "Row 2, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -609,7 +630,8 @@
|
||||
"text": "Row 3, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -621,7 +643,8 @@
|
||||
"text": "Row 3, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -633,7 +656,8 @@
|
||||
"text": "Row 3, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_04",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
@@ -70,7 +70,8 @@
|
||||
"text": "Header 1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -82,7 +83,8 @@
|
||||
"text": "Header 2 & 3 (colspan)",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 2,
|
||||
@@ -94,7 +96,8 @@
|
||||
"text": "Row 1 & 2, Col 1 (rowspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -106,7 +109,8 @@
|
||||
"text": "Row 1, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -118,7 +122,8 @@
|
||||
"text": "Row 1, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -130,7 +135,8 @@
|
||||
"text": "Row 2, Col 2 & 3 (colspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -142,7 +148,8 @@
|
||||
"text": "Row 3, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -154,7 +161,8 @@
|
||||
"text": "Row 3, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -166,7 +174,8 @@
|
||||
"text": "Row 3, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 4,
|
||||
@@ -183,7 +192,8 @@
|
||||
"text": "Header 1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -195,7 +205,8 @@
|
||||
"text": "Header 2 & 3 (colspan)",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -207,7 +218,8 @@
|
||||
"text": "Header 2 & 3 (colspan)",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -221,7 +233,8 @@
|
||||
"text": "Row 1 & 2, Col 1 (rowspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -233,7 +246,8 @@
|
||||
"text": "Row 1, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -245,7 +259,8 @@
|
||||
"text": "Row 1, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -259,7 +274,8 @@
|
||||
"text": "Row 1 & 2, Col 1 (rowspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -271,7 +287,8 @@
|
||||
"text": "Row 2, Col 2 & 3 (colspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -283,7 +300,8 @@
|
||||
"text": "Row 2, Col 2 & 3 (colspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -297,7 +315,8 @@
|
||||
"text": "Row 3, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -309,7 +328,8 @@
|
||||
"text": "Row 3, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -321,7 +341,8 @@
|
||||
"text": "Row 3, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_05",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
@@ -70,7 +70,8 @@
|
||||
"text": "Header 1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -82,7 +83,8 @@
|
||||
"text": "Header 2 & 3 (colspan)",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 2,
|
||||
@@ -94,7 +96,8 @@
|
||||
"text": "Row 1 & 2, Col 1 (rowspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -106,7 +109,8 @@
|
||||
"text": "Row 1, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -118,7 +122,8 @@
|
||||
"text": "Row 1, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -130,7 +135,8 @@
|
||||
"text": "Row 2, Col 2 & 3 (colspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -142,7 +148,8 @@
|
||||
"text": "Row 3, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -154,7 +161,8 @@
|
||||
"text": "Row 3, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -166,7 +174,8 @@
|
||||
"text": "Row 3, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 4,
|
||||
@@ -183,7 +192,8 @@
|
||||
"text": "Header 1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -195,7 +205,8 @@
|
||||
"text": "Header 2 & 3 (colspan)",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -207,7 +218,8 @@
|
||||
"text": "Header 2 & 3 (colspan)",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -221,7 +233,8 @@
|
||||
"text": "Row 1 & 2, Col 1 (rowspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -233,7 +246,8 @@
|
||||
"text": "Row 1, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -245,7 +259,8 @@
|
||||
"text": "Row 1, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -259,7 +274,8 @@
|
||||
"text": "Row 1 & 2, Col 1 (rowspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -271,7 +287,8 @@
|
||||
"text": "Row 2, Col 2 & 3 (colspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -283,7 +300,8 @@
|
||||
"text": "Row 2, Col 2 & 3 (colspan)",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -297,7 +315,8 @@
|
||||
"text": "Row 3, Col 1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -309,7 +328,8 @@
|
||||
"text": "Row 3, Col 2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -321,7 +341,8 @@
|
||||
"text": "Row 3, Col 3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_06",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_07",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "example_09",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "formatting",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "html_code_snippets",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "hyperlink_01",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "hyperlink_02",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "hyperlink_03",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "hyperlink_04",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "hyperlink_05",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
5
tests/data/groundtruth/docling_v2/table_01.html.itxt
vendored
Normal file
5
tests/data/groundtruth/docling_v2/table_01.html.itxt
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Header
|
||||
item-2 at level 2: text: This is the first paragraph.
|
||||
item-3 at level 2: table with [2x2]
|
||||
item-4 at level 2: text: After table
|
||||
213
tests/data/groundtruth/docling_v2/table_01.html.json
vendored
Normal file
213
tests/data/groundtruth/docling_v2/table_01.html.json
vendored
Normal file
@@ -0,0 +1,213 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "table_01",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 8899613932804813807,
|
||||
"filename": "table_01.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Header",
|
||||
"text": "Header"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is the first paragraph.",
|
||||
"text": "This is the first paragraph."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After table",
|
||||
"text": "After table"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "1...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "1...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
9
tests/data/groundtruth/docling_v2/table_01.html.md
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_01.html.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Header
|
||||
|
||||
This is the first paragraph.
|
||||
|
||||
| A | B |
|
||||
|------|------|
|
||||
| 1... | 2... |
|
||||
|
||||
After table
|
||||
9
tests/data/groundtruth/docling_v2/table_02.html.itxt
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_02.html.itxt
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Header
|
||||
item-2 at level 2: text: This is the first paragraph.
|
||||
item-3 at level 2: table with [2x2]
|
||||
item-4 at level 3: unspecified: group rich_cell_group_1_0_1
|
||||
item-5 at level 4: text: First Paragraph
|
||||
item-6 at level 4: text: Second Paragraph
|
||||
item-7 at level 4: text: Third Paragraph
|
||||
item-8 at level 2: text: After table
|
||||
277
tests/data/groundtruth/docling_v2/table_02.html.json
vendored
Normal file
277
tests/data/groundtruth/docling_v2/table_02.html.json
vendored
Normal file
@@ -0,0 +1,277 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "table_02",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 13259165361873975426,
|
||||
"filename": "table_02.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_1_0_1",
|
||||
"label": "unspecified"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Header",
|
||||
"text": "Header"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is the first paragraph.",
|
||||
"text": "This is the first paragraph."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "First Paragraph",
|
||||
"text": "First Paragraph"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Second Paragraph",
|
||||
"text": "Second Paragraph"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Third Paragraph",
|
||||
"text": "Third Paragraph"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After table",
|
||||
"text": "After table"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "First Paragraph\nSecond Paragraph\nThird Paragraph",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "First Paragraph\nSecond Paragraph\nThird Paragraph",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
9
tests/data/groundtruth/docling_v2/table_02.html.md
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_02.html.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Header
|
||||
|
||||
This is the first paragraph.
|
||||
|
||||
| A | B |
|
||||
|----------------------------------------------------|------|
|
||||
| First Paragraph Second Paragraph Third Paragraph | 2... |
|
||||
|
||||
After table
|
||||
10
tests/data/groundtruth/docling_v2/table_03.html.itxt
vendored
Normal file
10
tests/data/groundtruth/docling_v2/table_03.html.itxt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Header
|
||||
item-2 at level 2: text: This is the first paragraph.
|
||||
item-3 at level 2: table with [2x2]
|
||||
item-4 at level 3: unspecified: group rich_cell_group_1_0_1
|
||||
item-5 at level 4: list: group list
|
||||
item-6 at level 5: list_item: First item
|
||||
item-7 at level 5: list_item: Second item
|
||||
item-8 at level 5: list_item: Third item
|
||||
item-9 at level 2: text: After table
|
||||
297
tests/data/groundtruth/docling_v2/table_03.html.json
vendored
Normal file
297
tests/data/groundtruth/docling_v2/table_03.html.json
vendored
Normal file
@@ -0,0 +1,297 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "table_03",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 3098300110939273317,
|
||||
"filename": "table_03.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_1_0_1",
|
||||
"label": "unspecified"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Header",
|
||||
"text": "Header"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is the first paragraph.",
|
||||
"text": "This is the first paragraph."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "First item",
|
||||
"text": "First item",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Second item",
|
||||
"text": "Second item",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Third item",
|
||||
"text": "Third item",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After table",
|
||||
"text": "After table"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "First item Second item Third item",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "First item Second item Third item",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
9
tests/data/groundtruth/docling_v2/table_03.html.md
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_03.html.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Header
|
||||
|
||||
This is the first paragraph.
|
||||
|
||||
| A | B |
|
||||
|-----------------------------------------|------|
|
||||
| - First item - Second item - Third item | 2... |
|
||||
|
||||
After table
|
||||
11
tests/data/groundtruth/docling_v2/table_04.html.itxt
vendored
Normal file
11
tests/data/groundtruth/docling_v2/table_04.html.itxt
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Header
|
||||
item-2 at level 2: text: This is the first paragraph.
|
||||
item-3 at level 2: table with [2x2]
|
||||
item-4 at level 3: unspecified: group rich_cell_group_1_0_1
|
||||
item-5 at level 4: text: Some text before list
|
||||
item-6 at level 4: list: group list
|
||||
item-7 at level 5: list_item: First item
|
||||
item-8 at level 5: list_item: Second item
|
||||
item-9 at level 5: list_item: Third item
|
||||
item-10 at level 2: text: After table
|
||||
312
tests/data/groundtruth/docling_v2/table_04.html.json
vendored
Normal file
312
tests/data/groundtruth/docling_v2/table_04.html.json
vendored
Normal file
@@ -0,0 +1,312 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "table_04",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 2569676747034209441,
|
||||
"filename": "table_04.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_1_0_1",
|
||||
"label": "unspecified"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Header",
|
||||
"text": "Header"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is the first paragraph.",
|
||||
"text": "This is the first paragraph."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Some text before list",
|
||||
"text": "Some text before list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "First item",
|
||||
"text": "First item",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Second item",
|
||||
"text": "Second item",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Third item",
|
||||
"text": "Third item",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After table",
|
||||
"text": "After table"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Some text before list\n \nFirst item Second item Third item",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Some text before list\n \nFirst item Second item Third item",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
9
tests/data/groundtruth/docling_v2/table_04.html.md
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_04.html.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Header
|
||||
|
||||
This is the first paragraph.
|
||||
|
||||
| A | B |
|
||||
|----------------------------------------------------------------|------|
|
||||
| Some text before list - First item - Second item - Third item | 2... |
|
||||
|
||||
After table
|
||||
7
tests/data/groundtruth/docling_v2/table_05.html.itxt
vendored
Normal file
7
tests/data/groundtruth/docling_v2/table_05.html.itxt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Header
|
||||
item-2 at level 2: text: This is the first paragraph.
|
||||
item-3 at level 2: table with [2x2]
|
||||
item-4 at level 3: unspecified: group rich_cell_group_2_0_1
|
||||
item-5 at level 4: table with [2x3]
|
||||
item-6 at level 2: text: After table
|
||||
417
tests/data/groundtruth/docling_v2/table_05.html.json
vendored
Normal file
417
tests/data/groundtruth/docling_v2/table_05.html.json
vendored
Normal file
@@ -0,0 +1,417 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "table_05",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 12827430000043968589,
|
||||
"filename": "table_05.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/tables/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_2_0_1",
|
||||
"label": "unspecified"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Header",
|
||||
"text": "Header"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is the first paragraph.",
|
||||
"text": "This is the first paragraph."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After table",
|
||||
"text": "After table"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1B1C1\n\n\nD1E1F1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1B1C1\n\n\nD1E1F1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/tables/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "C1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "D1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "F1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 3,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "C1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "D1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "F1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
9
tests/data/groundtruth/docling_v2/table_05.html.md
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_05.html.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Header
|
||||
|
||||
This is the first paragraph.
|
||||
|
||||
| A | B |
|
||||
|----------------------------------------------------------------------|------|
|
||||
| | A1 | B1 | C1 | |------|------|------| | D1 | E1 | F1 | | 2... |
|
||||
|
||||
After table
|
||||
11
tests/data/groundtruth/docling_v2/table_06.html.itxt
vendored
Normal file
11
tests/data/groundtruth/docling_v2/table_06.html.itxt
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Header
|
||||
item-2 at level 2: text: This is the first paragraph.
|
||||
item-3 at level 2: table with [2x2]
|
||||
item-4 at level 3: unspecified: group rich_cell_group_4_0_1
|
||||
item-5 at level 4: table with [2x3]
|
||||
item-6 at level 5: unspecified: group rich_cell_group_4_0_1
|
||||
item-7 at level 6: table with [4x2]
|
||||
item-8 at level 7: unspecified: group rich_cell_group_4_0_2
|
||||
item-9 at level 8: table with [2x2]
|
||||
item-10 at level 2: text: After table
|
||||
827
tests/data/groundtruth/docling_v2/table_06.html.json
vendored
Normal file
827
tests/data/groundtruth/docling_v2/table_06.html.json
vendored
Normal file
@@ -0,0 +1,827 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "table_06",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 3950001743588145047,
|
||||
"filename": "table_06.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/tables/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/tables/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_4_0_2",
|
||||
"label": "unspecified"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/tables/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/tables/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_4_0_1",
|
||||
"label": "unspecified"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/tables/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "rich_cell_group_4_0_1",
|
||||
"label": "unspecified"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Header",
|
||||
"text": "Header"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is the first paragraph.",
|
||||
"text": "This is the first paragraph."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After table",
|
||||
"text": "After table"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1B1C1\n\n\nD1\n\n\n\nIII\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII\n\n\n\nF1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1B1C1\n\n\nD1\n\n\n\nIII\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII\n\n\n\nF1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "2...",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/tables/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "C1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "D1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "III\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "F1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 3,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "A1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "B1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "C1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "D1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "III\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "F1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/tables/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "I",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "II",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "III",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "IV",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "V",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E1E2\n\n\nE3E4",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false,
|
||||
"ref": {
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "VII",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "VIII",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 4,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "I",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "II",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "III",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "IV",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "V",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E1E2\n\n\nE3E4",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "VII",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "VIII",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/tables/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "E1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "E3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E4",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "E1",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E2",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "E3",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "E4",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
9
tests/data/groundtruth/docling_v2/table_06.html.md
vendored
Normal file
9
tests/data/groundtruth/docling_v2/table_06.html.md
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Header
|
||||
|
||||
This is the first paragraph.
|
||||
|
||||
| A | B |
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|
|
||||
| | A1 | B1 | C1 | |------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| | D1 | | I | II | |-----|-------------------------------------------------| | III | IV | | V | | E1 | E2 | |------|------| | E3 | E4 | | | VII | VIII | | F1 | | 2... |
|
||||
|
||||
After table
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "unit_test_01",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
|
||||
2477
tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
vendored
2477
tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
vendored
File diff suppressed because it is too large
Load Diff
8435
tests/data/groundtruth/docling_v2/wiki_duck.html.json
vendored
8435
tests/data/groundtruth/docling_v2/wiki_duck.html.json
vendored
File diff suppressed because it is too large
Load Diff
@@ -232,14 +232,10 @@ This article is about the bird. For duck as a food, see [Duck as food](/wiki/Duc
|
||||
|
||||
"Duckling" redirects here. For other uses, see [Duckling (disambiguation)](/wiki/Duckling_(disambiguation)) .
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
|
||||
| Duck | Duck |
|
||||
|--------------------------------|--------------------------------|
|
||||
|----------------------------------------------------------|---------------------------|
|
||||
| | |
|
||||
| Bufflehead (Bucephala albeola) | Bufflehead (Bucephala albeola) |
|
||||
| [Bufflehead](/wiki/Bufflehead) *Bucephala albeola* ( ) | |
|
||||
| Scientific classification | Scientific classification |
|
||||
| Domain: | Eukaryota |
|
||||
| Kingdom: | Animalia |
|
||||
@@ -251,6 +247,10 @@ This article is about the bird. For duck as a food, see [Duck as food](/wiki/Duc
|
||||
| Subfamilies | Subfamilies |
|
||||
| See text | See text |
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
|
||||
**Duck** is the common name for numerous species of [waterfowl](/wiki/Waterfowl) in the [family](/wiki/Family_(biology)) [Anatidae](/wiki/Anatidae) . Ducks are generally smaller and shorter-necked than [swans](/wiki/Swan) and [geese](/wiki/Goose) , which are members of the same family. Divided among several subfamilies, they are a [form taxon](/wiki/Form_taxon) ; they do not represent a [monophyletic group](/wiki/Monophyletic_group) (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly [aquatic birds](/wiki/Aquatic_bird) , and may be found in both fresh water and sea water.
|
||||
|
||||
Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as [loons](/wiki/Loon) or divers, [grebes](/wiki/Grebe) , [gallinules](/wiki/Gallinule) and [coots](/wiki/Coot) .
|
||||
@@ -509,12 +509,12 @@ The 1992 Disney film [*The Mighty Ducks*](/wiki/The_Mighty_Ducks_(film)) , starr
|
||||
- [Ducks on postage stamps](http://www.stampsbook.org/subject/Duck.html) [Archived](https://web.archive.org/web/20130513022903/http://www.stampsbook.org/subject/Duck.html) 2013-05-13 at the [Wayback Machine](/wiki/Wayback_Machine)
|
||||
- [*Ducks at a Distance, by Rob Hines*](https://gutenberg.org/ebooks/18884) at [Project Gutenberg](/wiki/Project_Gutenberg) - A modern illustrated guide to identification of US waterfowl
|
||||
|
||||
<!-- image -->
|
||||
|
||||
| Authority control databases | Authority control databases |
|
||||
|-------------------------------|---------------------------------------------------|
|
||||
| National | United States France BnF data Japan Latvia Israel |
|
||||
| Other | IdRef |
|
||||
|-------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| National | - [United States](https://id.loc.gov/authorities/sh85039879) - [France](https://catalogue.bnf.fr/ark:/12148/cb119761481) - [BnF data](https://data.bnf.fr/ark:/12148/cb119761481) - [Japan](https://id.ndl.go.jp/auth/ndlna/00564819) - [Latvia](https://kopkatalogs.lv/F?func=direct&local_base=lnc10&doc_number=000090751&P_CON_LNG=ENG) - [Israel](http://olduli.nli.org.il/F/?func=find-b&local_base=NLX10&find_code=UID&request=987007565486205171) |
|
||||
| Other | - [IdRef](https://www.idref.fr/027796124) |
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Retrieved from " [https://en.wikipedia.org/w/index.php?title=Duck&oldid=1246843351](https://en.wikipedia.org/w/index.php?title=Duck&oldid=1246843351) "
|
||||
|
||||
|
||||
24
tests/data/html/table_01.html
vendored
Normal file
24
tests/data/html/table_01.html
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table, th, td {border: 1px solid black; border-collapse: collapse;}
|
||||
td {padding:30px;}
|
||||
table {margin: 30px;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Header</h1>
|
||||
<p>This is the first paragraph.</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td>B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>1...</td>
|
||||
<td>2...</td>
|
||||
</tr>
|
||||
</table>
|
||||
After table
|
||||
</body>
|
||||
</html>
|
||||
24
tests/data/html/table_02.html
vendored
Normal file
24
tests/data/html/table_02.html
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table, th, td {border: 1px solid black; border-collapse: collapse;}
|
||||
td {padding:30px;}
|
||||
table {margin: 30px;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Header</h1>
|
||||
<p>This is the first paragraph.</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td>B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>First Paragraph<br>Second Paragraph<br>Third Paragraph</td>
|
||||
<td>2...</td>
|
||||
</tr>
|
||||
</table>
|
||||
After table
|
||||
</body>
|
||||
</html>
|
||||
28
tests/data/html/table_03.html
vendored
Normal file
28
tests/data/html/table_03.html
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table, th, td {border: 1px solid black; border-collapse: collapse;}
|
||||
td {padding:30px;}
|
||||
table {margin: 30px;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Header</h1>
|
||||
<p>This is the first paragraph.</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td>B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<ul>
|
||||
<li>First item</li><li>Second item</li><li>Third item</li>
|
||||
</ul>
|
||||
</td>
|
||||
<td>2...</td>
|
||||
</tr>
|
||||
</table>
|
||||
After table
|
||||
</body>
|
||||
</html>
|
||||
29
tests/data/html/table_04.html
vendored
Normal file
29
tests/data/html/table_04.html
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table, th, td {border: 1px solid black; border-collapse: collapse;}
|
||||
td {padding:30px;}
|
||||
table {margin: 30px;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Header</h1>
|
||||
<p>This is the first paragraph.</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td>B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
Some text before list
|
||||
<ul>
|
||||
<li>First item</li><li>Second item</li><li>Third item</li>
|
||||
</ul>
|
||||
</td>
|
||||
<td>2...</td>
|
||||
</tr>
|
||||
</table>
|
||||
After table
|
||||
</body>
|
||||
</html>
|
||||
33
tests/data/html/table_05.html
vendored
Normal file
33
tests/data/html/table_05.html
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table, th, td {border: 1px solid black; border-collapse: collapse;}
|
||||
td {padding:30px;}
|
||||
table {margin: 30px;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Header</h1>
|
||||
<p>This is the first paragraph.</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td>B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A1</td><td>B1</td><td>C1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>D1</td><td>E1</td><td>F1</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
<td>2...</td>
|
||||
</tr>
|
||||
</table>
|
||||
After table
|
||||
</body>
|
||||
</html>
|
||||
60
tests/data/html/table_06.html
vendored
Normal file
60
tests/data/html/table_06.html
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
table, th, td {border: 1px solid black; border-collapse: collapse;}
|
||||
td {padding:30px;}
|
||||
table {margin: 30px;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Header</h1>
|
||||
<p>This is the first paragraph.</p>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td>B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr>
|
||||
<td>A1</td><td>B1</td><td>C1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>D1</td>
|
||||
<td>
|
||||
<table>
|
||||
<tr>
|
||||
<td>I</td><td>II</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>III</td><td>IV</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>V</td>
|
||||
<td>
|
||||
<table>
|
||||
<tr>
|
||||
<td>E1</td><td>E2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>E3</td><td>E4</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>VII</td><td>VIII</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
<td>F1</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
<td>2...</td>
|
||||
</tr>
|
||||
</table>
|
||||
After table
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user