feat: Rich tables support for HTML backend (#2324)

* Rich tables support for HTML backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Decoupling JATS backend from HTML backend, ways of creating tables changed significantly

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* updated and added tests

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Refactored parse_table_data in html_backend into a few smaller functions

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Changed the scope of a few functions in html_backend.py, making them static where possible

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fix for HTML tables that have tbody and/or thead; these tables are now properly supported

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2025-09-29 18:12:16 +02:00
committed by GitHub
parent 325877aee9
commit c803abed9a
46 changed files with 9233 additions and 5815 deletions

View File

@@ -17,8 +17,11 @@ from docling_core.types.doc import (
DocumentOrigin,
GroupItem,
GroupLabel,
RefItem,
RichTableCell,
TableCell,
TableData,
TableItem,
TextItem,
)
from docling_core.types.doc.document import ContentLayer, Formatting, Script
@@ -276,10 +279,175 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# reset context
self.ctx = _Context()
self._walk(content, doc)
return doc
def _walk(self, element: Tag, doc: DoclingDocument) -> None:
@staticmethod
def group_cell_elements(
    group_name: str,
    doc: DoclingDocument,
    provs_in_cell: list[RefItem],
    docling_table: TableItem,
) -> RefItem:
    """Re-parent the items of one table cell under a new group.

    A group labelled ``GroupLabel.UNSPECIFIED`` is added to the document as a
    child of the table. Every referenced item is appended to that group,
    detached from the children of its current parent and pointed at the group.

    Args:
        group_name: Name given to the newly created group.
        doc: The Docling document that owns the referenced items.
        provs_in_cell: References to the items that make up the cell content.
        docling_table: The table item the new group is attached to.

    Returns:
        A reference to the newly created group.
    """
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=group_name,
        parent=docling_table,
    )
    for item_ref in provs_in_cell:
        cell_group.children.append(item_ref)
        item = item_ref.resolve(doc)
        previous_parent = item.parent.resolve(doc)
        own_ref = item.get_ref()
        # Detach from the old parent so the item is listed in one place only.
        if own_ref in previous_parent.children:
            previous_parent.children.remove(own_ref)
        item.parent = cell_group.get_ref()
    return cell_group.get_ref()
@staticmethod
def process_rich_table_cells(
    provs_in_cell: list[RefItem],
    group_name: str,
    doc: DoclingDocument,
    docling_table: TableItem,
) -> tuple[bool, RefItem]:
    """Decide whether a cell is "rich" and prepare the reference for it.

    A cell whose only item is a ``TextItem`` is treated as a plain cell: that
    item is deleted from the document. Any other content (several items, or a
    single non-text item) is wrapped in a group via ``group_cell_elements``.

    Args:
        provs_in_cell: References to the items created for the cell; must not
            be empty (an empty list raises ``IndexError``).
        group_name: Name for the group created around rich content.
        doc: The Docling document that owns the referenced items.
        docling_table: The table item the cell belongs to.

    Returns:
        A ``(is_rich, ref)`` pair. For a rich cell ``ref`` points at the new
        group; for a plain cell it is the reference of the deleted text item.
    """
    # Indexing up front keeps the failure mode for an empty list unchanged.
    first_ref = provs_in_cell[0]

    if len(provs_in_cell) == 1:
        sole_item = first_ref.resolve(doc)
        if isinstance(sole_item, TextItem):
            # Cell has only one element and it's just a text
            doc.delete_items(node_items=[sole_item])
            return False, first_ref

    # Multiple elements, or a single non-text element: group them.
    group_ref = HTMLDocumentBackend.group_cell_elements(
        group_name, doc, provs_in_cell, docling_table
    )
    return True, group_ref
def parse_table_data(
    self,
    element: Tag,
    doc: DoclingDocument,
    docling_table: TableItem,
    num_rows: int,
    num_cols: int,
) -> Optional[TableData]:
    """Fill a table item with the cells of an HTML table element.

    Direct ``thead``/``tbody`` children are unwrapped first, then only direct
    ``tr`` rows and their direct ``td``/``th`` cells are visited, so rows of
    nested tables are not mixed in. Each cell sub-tree is walked with
    ``_walk``; depending on what that produces, a ``RichTableCell`` or a
    plain ``TableCell`` is added to ``docling_table`` through
    ``doc.add_table_cell``.

    Args:
        element: The HTML table tag; it is modified in place.
        doc: The Docling document being built.
        docling_table: The table item that receives the cells.
        num_rows: Number of rows used to size the occupancy grid.
        num_cols: Number of columns used to size the occupancy grid.

    Returns:
        A ``TableData`` carrying only ``num_rows`` and ``num_cols``; the cells
        are added to ``docling_table``, not to the returned object.
    """
    for section in cast(
        list[Tag], element.find_all(["thead", "tbody"], recursive=False)
    ):
        section.unwrap()

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a slot is None until a cell (or a span) claims it.
    grid: list = [[None] * num_cols for _ in range(num_rows)]
    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1

    # We don't want this recursive to support nested tables
    for row in element("tr", recursive=False):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        # We don't want this recursive to support nested tables
        cells = row(["td", "th"], recursive=False)

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue
            _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if html_cell.name == "td":
                col_header = False
                row_header = False
            elif row_span == 1:
                row_header = False

        if row_header:
            # Row made only of spanning <th> cells: stay on the same grid row.
            start_row_span += 1
        else:
            row_idx += 1
            start_row_span = 0

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # Parse table cell sub-tree for Rich Cells content:
            provs_in_cell: list[RefItem] = self._walk(html_cell, doc)

            rich_table_cell = False
            ref_for_rich_cell = None
            if provs_in_cell:
                # Named with the column index as it stands before occupied
                # slots are skipped below.
                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
                rich_table_cell, ref_for_rich_cell = (
                    HTMLDocumentBackend.process_rich_table_cells(
                        provs_in_cell, group_name, doc, docling_table
                    )
                )

            # Extracting text
            text = self.get_text(html_cell).strip()
            col_span, row_span = self._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1

            # Skip slots already claimed by spans of earlier cells.
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1

            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            top_row = start_row_span + row_idx
            bottom_row = top_row + row_span
            right_col = col_idx + col_span
            is_row_header = (not col_header) and html_cell.name == "th"

            if rich_table_cell:
                rich_cell = RichTableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=top_row,
                    end_row_offset_idx=bottom_row,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=right_col,
                    column_header=col_header,
                    row_header=is_row_header,
                    ref=ref_for_rich_cell,  # points to an artificial group around children
                )
                doc.add_table_cell(table_item=docling_table, cell=rich_cell)
            else:
                simple_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=top_row,
                    end_row_offset_idx=bottom_row,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=right_col,
                    column_header=col_header,
                    row_header=is_row_header,
                )
                doc.add_table_cell(table_item=docling_table, cell=simple_cell)

    return data
def _walk(self, element: Tag, doc: DoclingDocument) -> list[RefItem]:
"""Parse an XML tag by recursively walking its content.
While walking, the method buffers inline text across tags like <b> or <span>,
@@ -289,17 +457,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
element: The XML tag to parse.
doc: The Docling document to be updated with the parsed content.
"""
added_refs: list[RefItem] = []
buffer: AnnotatedTextList = AnnotatedTextList()
def flush_buffer():
if not buffer:
return
return added_refs
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
parts = annotated_text_list.split_by_newline()
buffer.clear()
if not "".join([el.text for el in annotated_text_list]):
return
return added_refs
for annotated_text_list in parts:
with self._use_inline_group(annotated_text_list, doc):
@@ -309,15 +478,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
annotated_text.text.strip()
)
if annotated_text.code:
doc.add_code(
docling_code2 = doc.add_code(
parent=self.parents[self.level],
text=seg_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_refs.append(docling_code2.get_ref())
else:
doc.add_text(
docling_text2 = doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg_clean,
@@ -325,25 +495,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_refs.append(docling_text2.get_ref())
for node in element.contents:
if isinstance(node, Tag):
name = node.name.lower()
if name == "img":
flush_buffer()
self._emit_image(node, doc)
im_ref3 = self._emit_image(node, doc)
added_refs.append(im_ref3)
elif name in _FORMAT_TAG_MAP:
with self._use_format([name]):
self._walk(node, doc)
wk = self._walk(node, doc)
added_refs.extend(wk)
elif name == "a":
with self._use_hyperlink(node):
self._walk(node, doc)
wk2 = self._walk(node, doc)
added_refs.extend(wk2)
elif name in _BLOCK_TAGS:
flush_buffer()
self._handle_block(node, doc)
blk = self._handle_block(node, doc)
added_refs.extend(blk)
elif node.find(_BLOCK_TAGS):
flush_buffer()
self._walk(node, doc)
wk3 = self._walk(node, doc)
added_refs.extend(wk3)
else:
buffer.extend(
self._extract_text_and_hyperlink_recursively(
@@ -363,6 +539,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
flush_buffer()
return added_refs
@staticmethod
def _collect_parent_format_tags(item: PageElement) -> list[str]:
@@ -581,7 +758,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.level -= 1
self.content_layer = current_layer
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
added_ref = []
tag_name = tag.name.lower()
# set default content layer to BODY as soon as we encounter a heading
self.content_layer = ContentLayer.BODY
@@ -596,12 +774,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for key in self.parents.keys():
self.parents[key] = None
self.level = 0
self.parents[self.level + 1] = doc.add_title(
docling_title = self.parents[self.level + 1] = doc.add_title(
text_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_ref = [docling_title.get_ref()]
# the other levels need to be lowered by 1 if a title was set
else:
level -= 1
@@ -623,7 +802,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f"Remove the tail of level {key}")
self.parents[key] = None
self.level = level
self.parents[self.level + 1] = doc.add_heading(
docling_heading = self.parents[self.level + 1] = doc.add_heading(
parent=self.parents[self.level],
text=text_clean,
orig=annotated_text.text,
@@ -632,12 +811,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_ref = [docling_heading.get_ref()]
self.level += 1
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
im_ref = self._emit_image(img_tag, doc)
added_ref.append(im_ref)
return added_ref
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
tag_name = tag.name.lower()
start: Optional[int] = None
name: str = ""
@@ -765,20 +947,50 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[self.level + 1] = None
self.level -= 1
return list_group.get_ref()
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
@staticmethod
def get_html_table_row_col(tag: Tag) -> tuple[int, int]:
    """Count the rows and columns of an HTML table, taking spans into account.

    Direct ``thead``/``tbody`` children of ``tag`` are unwrapped in place so
    that their rows become direct children of the table. Only direct ``tr``
    rows and their direct ``td``/``th`` cells are inspected, which keeps
    nested tables out of the count.

    A row is counted only if it holds a ``td`` cell or a cell whose row span
    is 1; a row with no such cell is not counted as a row of its own.

    Args:
        tag: The HTML table tag; it is modified in place.

    Returns:
        A ``(num_rows, num_cols)`` pair, where ``num_cols`` is the largest sum
        of column spans found in any row.
    """
    # NOTE(review): <tfoot> is not unwrapped here, so its rows look to be
    # skipped by the non-recursive search below - confirm this is intended.
    for t in cast(list[Tag], tag.find_all(["thead", "tbody"], recursive=False)):
        t.unwrap()

    # Find the number of rows and columns (taking into account spans)
    num_rows: int = 0
    num_cols: int = 0
    for row in tag("tr", recursive=False):
        if not isinstance(row, Tag):
            continue
        col_count = 0
        is_row_header = True
        for cell in row(["td", "th"], recursive=False):
            # Guard the cell itself; the previous code re-checked `row` here,
            # which could never filter anything out.
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1
    return num_rows, num_cols
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
added_refs = []
tag_name = tag.name.lower()
if tag_name == "figure":
img_tag = tag.find("img")
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
im_ref = self._emit_image(img_tag, doc)
added_refs.append(im_ref)
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
self._handle_heading(tag, doc)
heading_refs = self._handle_heading(tag, doc)
added_refs.extend(heading_refs)
elif tag_name in {"ul", "ol"}:
self._handle_list(tag, doc)
list_ref = self._handle_list(tag, doc)
added_refs.append(list_ref)
elif tag_name in {"p", "address", "summary"}:
text_list = self._extract_text_and_hyperlink_recursively(
@@ -791,15 +1003,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if seg := annotated_text.text.strip():
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
if annotated_text.code:
doc.add_code(
docling_code = doc.add_code(
parent=self.parents[self.level],
text=seg_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_refs.append(docling_code.get_ref())
else:
doc.add_text(
docling_text = doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=seg_clean,
@@ -807,22 +1020,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_refs.append(docling_text.get_ref())
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(img_tag, doc)
elif tag_name == "table":
data = HTMLDocumentBackend.parse_table_data(tag)
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
self._emit_image(tag, doc)
if data is not None:
doc.add_table(
data=data,
num_rows, num_cols = self.get_html_table_row_col(tag)
data_e = TableData(num_rows=num_rows, num_cols=num_cols)
docling_table = doc.add_table(
data=data_e,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
added_refs.append(docling_table.get_ref())
self.parse_table_data(tag, doc, docling_table, num_rows, num_cols)
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
im_ref2 = self._emit_image(tag, doc)
added_refs.append(im_ref2)
elif tag_name in {"pre"}:
# handle monospace code snippets (pre).
@@ -835,13 +1053,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text_clean = HTMLDocumentBackend._clean_unicode(
annotated_text.text.strip()
)
doc.add_code(
docling_code2 = doc.add_code(
parent=self.parents[self.level],
text=text_clean,
content_layer=self.content_layer,
formatting=annotated_text.formatting,
hyperlink=annotated_text.hyperlink,
)
added_refs.append(docling_code2.get_ref())
elif tag_name == "footer":
with self._use_footer(tag, doc):
@@ -850,8 +1069,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif tag_name == "details":
with self._use_details(tag, doc):
self._walk(tag, doc)
return added_refs
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
figure = img_tag.find_parent("figure")
caption: AnnotatedTextList = AnnotatedTextList()
@@ -894,11 +1114,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
hyperlink=caption_anno_text.hyperlink,
)
doc.add_picture(
docling_pic = doc.add_picture(
caption=caption_item,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
return docling_pic.get_ref()
@staticmethod
def get_text(item: PageElement) -> str:
@@ -996,106 +1217,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
return int_spans
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
    """Build the table data for an HTML table element.

    Tables that contain another table are skipped. The table size is counted
    first (taking spans into account), then every ``td``/``th`` cell is turned
    into a ``TableCell`` holding its stripped text.

    Args:
        element: The HTML table tag. Inline formulas inside its cells are
            replaced in place by their ``$$...$$`` text.

    Returns:
        The populated ``TableData``, or ``None`` if the table contains a
        nested table.
    """
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"]):
            # Guard the cell itself; the previous code re-checked `row` here.
            if not isinstance(cell, Tag):
                continue
            cell_tag = cast(Tag, cell)
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
            col_count += col_span
            if cell_tag.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a slot is None until a cell (or a span) claims it.
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            # Row made only of spanning <th> cells: stay on the same grid row.
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # Skip slots already claimed by spans of earlier cells.
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data

View File

@@ -2,9 +2,9 @@ import logging
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union
from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
@@ -12,6 +12,8 @@ from docling_core.types.doc import (
GroupItem,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from lxml import etree
@@ -535,6 +537,110 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
return
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
    """Build the table data for a table element of a JATS document.

    Tables that contain another table are skipped. The table size is counted
    first (taking spans into account), then every ``td``/``th`` cell is turned
    into a ``TableCell`` holding its stripped text. Cell spans and cell text
    are obtained through the ``HTMLDocumentBackend`` static helpers.

    Args:
        element: The table tag. Inline formulas inside its cells are replaced
            in place by their ``$$...$$`` text.

    Returns:
        The populated ``TableData``, or ``None`` if the table contains a
        nested table.
    """
    # TODO, see how to implement proper support for rich tables from HTML backend
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"]):
            # Guard the cell itself; the previous code re-checked `row` here.
            if not isinstance(cell, Tag):
                continue
            cell_tag = cast(Tag, cell)
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
            col_count += col_span
            if cell_tag.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a slot is None until a cell (or a span) claims it.
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            # Row made only of spanning <th> cells: stay on the same grid row.
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # Skip slots already claimed by spans of earlier cells.
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
def _add_table(
self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
) -> None:
@@ -543,8 +649,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
if not isinstance(table_tag, Tag):
return
data = HTMLDocumentBackend.parse_table_data(table_tag)
data = JatsDocumentBackend.parse_table_data(table_tag)
# TODO: format label vs caption once styling is supported
label = table_xml_component["label"]
caption = table_xml_component["caption"]
@@ -554,7 +659,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
if table_text
else None
)
if data is not None:
doc.add_table(data=data, parent=parent, caption=table_caption)

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_01",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_02",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_03",
"origin": {
"mimetype": "text/html",
@@ -346,7 +346,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -358,7 +359,8 @@
"text": "Header 2",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -370,7 +372,8 @@
"text": "Header 3",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -382,7 +385,8 @@
"text": "Row 1, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -394,7 +398,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -406,7 +411,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -418,7 +424,8 @@
"text": "Row 2, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -430,7 +437,8 @@
"text": "Row 2, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -442,7 +450,8 @@
"text": "Row 2, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -454,7 +463,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -466,7 +476,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -478,7 +489,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
@@ -495,7 +507,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -507,7 +520,8 @@
"text": "Header 2",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -519,7 +533,8 @@
"text": "Header 3",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -533,7 +548,8 @@
"text": "Row 1, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -545,7 +561,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -557,7 +574,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -571,7 +589,8 @@
"text": "Row 2, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -583,7 +602,8 @@
"text": "Row 2, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -595,7 +615,8 @@
"text": "Row 2, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -609,7 +630,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -621,7 +643,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -633,7 +656,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_04",
"origin": {
"mimetype": "text/html",
@@ -70,7 +70,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -82,7 +83,8 @@
"text": "Header 2 & 3 (colspan)",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 2,
@@ -94,7 +96,8 @@
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -106,7 +109,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -118,7 +122,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -130,7 +135,8 @@
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -142,7 +148,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -154,7 +161,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -166,7 +174,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
@@ -183,7 +192,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -195,7 +205,8 @@
"text": "Header 2 & 3 (colspan)",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -207,7 +218,8 @@
"text": "Header 2 & 3 (colspan)",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -221,7 +233,8 @@
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -233,7 +246,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -245,7 +259,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -259,7 +274,8 @@
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -271,7 +287,8 @@
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -283,7 +300,8 @@
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -297,7 +315,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -309,7 +328,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -321,7 +341,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_05",
"origin": {
"mimetype": "text/html",
@@ -70,7 +70,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -82,7 +83,8 @@
"text": "Header 2 & 3 (colspan)",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 2,
@@ -94,7 +96,8 @@
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -106,7 +109,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -118,7 +122,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -130,7 +135,8 @@
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -142,7 +148,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -154,7 +161,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -166,7 +174,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
@@ -183,7 +192,8 @@
"text": "Header 1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -195,7 +205,8 @@
"text": "Header 2 & 3 (colspan)",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -207,7 +218,8 @@
"text": "Header 2 & 3 (colspan)",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -221,7 +233,8 @@
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -233,7 +246,8 @@
"text": "Row 1, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -245,7 +259,8 @@
"text": "Row 1, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -259,7 +274,8 @@
"text": "Row 1 & 2, Col 1 (rowspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -271,7 +287,8 @@
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -283,7 +300,8 @@
"text": "Row 2, Col 2 & 3 (colspan)",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -297,7 +315,8 @@
"text": "Row 3, Col 1",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -309,7 +328,8 @@
"text": "Row 3, Col 2",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -321,7 +341,8 @@
"text": "Row 3, Col 3",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_06",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_07",
"origin": {
"mimetype": "text/html",

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "example_09",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "formatting",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "html_code_snippets",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "hyperlink_01",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "hyperlink_02",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "hyperlink_03",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "hyperlink_04",
"origin": {
"mimetype": "text/html",

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "hyperlink_05",
"origin": {
"mimetype": "text/html",

View File

@@ -0,0 +1,5 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Header
item-2 at level 2: text: This is the first paragraph.
item-3 at level 2: table with [2x2]
item-4 at level 2: text: After table

View File

@@ -0,0 +1,213 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "table_01",
"origin": {
"mimetype": "text/html",
"binary_hash": 8899613932804813807,
"filename": "table_01.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Header",
"text": "Header"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph.",
"text": "This is the first paragraph."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After table",
"text": "After table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,9 @@
# Header
This is the first paragraph.
| A | B |
|------|------|
| 1... | 2... |
After table

View File

@@ -0,0 +1,9 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Header
item-2 at level 2: text: This is the first paragraph.
item-3 at level 2: table with [2x2]
item-4 at level 3: unspecified: group rich_cell_group_1_0_1
item-5 at level 4: text: First Paragraph
item-6 at level 4: text: Second Paragraph
item-7 at level 4: text: Third Paragraph
item-8 at level 2: text: After table

View File

@@ -0,0 +1,277 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "table_02",
"origin": {
"mimetype": "text/html",
"binary_hash": 13259165361873975426,
"filename": "table_02.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_0_1",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Header",
"text": "Header"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph.",
"text": "This is the first paragraph."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "First Paragraph",
"text": "First Paragraph"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Second Paragraph",
"text": "Second Paragraph"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Third Paragraph",
"text": "Third Paragraph"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After table",
"text": "After table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "First Paragraph\nSecond Paragraph\nThird Paragraph",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/0"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "First Paragraph\nSecond Paragraph\nThird Paragraph",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,9 @@
# Header
This is the first paragraph.
| A | B |
|----------------------------------------------------|------|
| First Paragraph Second Paragraph Third Paragraph | 2... |
After table

View File

@@ -0,0 +1,10 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Header
item-2 at level 2: text: This is the first paragraph.
item-3 at level 2: table with [2x2]
item-4 at level 3: unspecified: group rich_cell_group_1_0_1
item-5 at level 4: list: group list
item-6 at level 5: list_item: First item
item-7 at level 5: list_item: Second item
item-8 at level 5: list_item: Third item
item-9 at level 2: text: After table

View File

@@ -0,0 +1,297 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "table_03",
"origin": {
"mimetype": "text/html",
"binary_hash": 3098300110939273317,
"filename": "table_03.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_0_1",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Header",
"text": "Header"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph.",
"text": "This is the first paragraph."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "First item",
"text": "First item",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Second item",
"text": "Second item",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Third item",
"text": "Third item",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After table",
"text": "After table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "First item Second item Third item",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/1"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "First item Second item Third item",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,9 @@
# Header
This is the first paragraph.
| A | B |
|-----------------------------------------|------|
| - First item - Second item - Third item | 2... |
After table

View File

@@ -0,0 +1,11 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Header
item-2 at level 2: text: This is the first paragraph.
item-3 at level 2: table with [2x2]
item-4 at level 3: unspecified: group rich_cell_group_1_0_1
item-5 at level 4: text: Some text before list
item-6 at level 4: list: group list
item-7 at level 5: list_item: First item
item-8 at level 5: list_item: Second item
item-9 at level 5: list_item: Third item
item-10 at level 2: text: After table

View File

@@ -0,0 +1,312 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "table_04",
"origin": {
"mimetype": "text/html",
"binary_hash": 2569676747034209441,
"filename": "table_04.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_0_1",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Header",
"text": "Header"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph.",
"text": "This is the first paragraph."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Some text before list",
"text": "Some text before list"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "First item",
"text": "First item",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Second item",
"text": "Second item",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Third item",
"text": "Third item",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After table",
"text": "After table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Some text before list\n \nFirst item Second item Third item",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/1"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Some text before list\n \nFirst item Second item Third item",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,9 @@
# Header
This is the first paragraph.
| A | B |
|----------------------------------------------------------------|------|
| Some text before list - First item - Second item - Third item | 2... |
After table

View File

@@ -0,0 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Header
item-2 at level 2: text: This is the first paragraph.
item-3 at level 2: table with [2x2]
item-4 at level 3: unspecified: group rich_cell_group_2_0_1
item-5 at level 4: table with [2x3]
item-6 at level 2: text: After table

View File

@@ -0,0 +1,417 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "table_05",
"origin": {
"mimetype": "text/html",
"binary_hash": 12827430000043968589,
"filename": "table_05.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/tables/1"
}
],
"content_layer": "body",
"name": "rich_cell_group_2_0_1",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Header",
"text": "Header"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph.",
"text": "This is the first paragraph."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After table",
"text": "After table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1B1C1\n\n\nD1E1F1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/0"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1B1C1\n\n\nD1E1F1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
},
{
"self_ref": "#/tables/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "C1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "D1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "F1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 3,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "C1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "D1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "F1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,9 @@
# Header
This is the first paragraph.
| A | B |
|----------------------------------------------------------------------|------|
| | A1 | B1 | C1 | |------|------|------| | D1 | E1 | F1 | | 2... |
After table

View File

@@ -0,0 +1,11 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Header
item-2 at level 2: text: This is the first paragraph.
item-3 at level 2: table with [2x2]
item-4 at level 3: unspecified: group rich_cell_group_4_0_1
item-5 at level 4: table with [2x3]
item-6 at level 5: unspecified: group rich_cell_group_4_0_1
item-7 at level 6: table with [4x2]
item-8 at level 7: unspecified: group rich_cell_group_4_0_2
item-9 at level 8: table with [2x2]
item-10 at level 2: text: After table

View File

@@ -0,0 +1,827 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "table_06",
"origin": {
"mimetype": "text/html",
"binary_hash": 3950001743588145047,
"filename": "table_06.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/tables/2"
},
"children": [
{
"$ref": "#/tables/3"
}
],
"content_layer": "body",
"name": "rich_cell_group_4_0_2",
"label": "unspecified"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/tables/1"
},
"children": [
{
"$ref": "#/tables/2"
}
],
"content_layer": "body",
"name": "rich_cell_group_4_0_1",
"label": "unspecified"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/tables/1"
}
],
"content_layer": "body",
"name": "rich_cell_group_4_0_1",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Header",
"text": "Header"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is the first paragraph.",
"text": "This is the first paragraph."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "After table",
"text": "After table"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1B1C1\n\n\nD1\n\n\n\nIII\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII\n\n\n\nF1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/2"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1B1C1\n\n\nD1\n\n\n\nIII\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII\n\n\n\nF1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2...",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
},
{
"self_ref": "#/tables/1",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "C1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "D1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "III\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/1"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "F1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 3,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "C1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "D1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "III\n\n\nIIIIV\n\n\nV\n\n\n\nE1E2\n\n\nE3E4\n\n\n\n\n\nVIIVIII",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "F1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
},
{
"self_ref": "#/tables/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "I",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "II",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "III",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "IV",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "V",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E1E2\n\n\nE3E4",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/0"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "VII",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "VIII",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "I",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "II",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "III",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "IV",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "V",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E1E2\n\n\nE3E4",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "VII",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "VIII",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
},
{
"self_ref": "#/tables/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "E1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E2",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "E3",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E4",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "E1",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E2",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "E3",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E4",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,9 @@
# Header
This is the first paragraph.
| A | B |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------|
| | A1 | B1 | C1 | |------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------| | D1 | | I | II | |-----|-------------------------------------------------| | III | IV | | V | | E1 | E2 | |------|------| | E3 | E4 | | | VII | VIII | | F1 | | 2... |
After table

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "unit_test_01",
"origin": {
"mimetype": "text/html",

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -232,14 +232,10 @@ This article is about the bird. For duck as a food, see [Duck as food](/wiki/Duc
"Duckling" redirects here. For other uses, see [Duckling (disambiguation)](/wiki/Duckling_(disambiguation)) .
<!-- image -->
<!-- image -->
| Duck | Duck |
|--------------------------------|--------------------------------|
|----------------------------------------------------------|---------------------------|
| | |
| Bufflehead (Bucephala albeola) | Bufflehead (Bucephala albeola) |
| [Bufflehead](/wiki/Bufflehead) *Bucephala albeola* ( ) | |
| Scientific classification | Scientific classification |
| Domain: | Eukaryota |
| Kingdom: | Animalia |
@@ -251,6 +247,10 @@ This article is about the bird. For duck as a food, see [Duck as food](/wiki/Duc
| Subfamilies | Subfamilies |
| See text | See text |
<!-- image -->
<!-- image -->
**Duck** is the common name for numerous species of [waterfowl](/wiki/Waterfowl) in the [family](/wiki/Family_(biology)) [Anatidae](/wiki/Anatidae) . Ducks are generally smaller and shorter-necked than [swans](/wiki/Swan) and [geese](/wiki/Goose) , which are members of the same family. Divided among several subfamilies, they are a [form taxon](/wiki/Form_taxon) ; they do not represent a [monophyletic group](/wiki/Monophyletic_group) (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly [aquatic birds](/wiki/Aquatic_bird) , and may be found in both fresh water and sea water.
Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as [loons](/wiki/Loon) or divers, [grebes](/wiki/Grebe) , [gallinules](/wiki/Gallinule) and [coots](/wiki/Coot) .
@@ -509,12 +509,12 @@ The 1992 Disney film [*The Mighty Ducks*](/wiki/The_Mighty_Ducks_(film)) , starr
- [Ducks on postage stamps](http://www.stampsbook.org/subject/Duck.html) [Archived](https://web.archive.org/web/20130513022903/http://www.stampsbook.org/subject/Duck.html) 2013-05-13 at the [Wayback Machine](/wiki/Wayback_Machine)
- [*Ducks at a Distance, by Rob Hines*](https://gutenberg.org/ebooks/18884) at [Project Gutenberg](/wiki/Project_Gutenberg) - A modern illustrated guide to identification of US waterfowl
<!-- image -->
| Authority control databases | Authority control databases |
|-------------------------------|---------------------------------------------------|
| National | United States France BnF data Japan Latvia Israel |
| Other | IdRef |
|-------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| National | - [United States](https://id.loc.gov/authorities/sh85039879) - [France](https://catalogue.bnf.fr/ark:/12148/cb119761481) - [BnF data](https://data.bnf.fr/ark:/12148/cb119761481) - [Japan](https://id.ndl.go.jp/auth/ndlna/00564819) - [Latvia](https://kopkatalogs.lv/F?func=direct&local_base=lnc10&doc_number=000090751&P_CON_LNG=ENG) - [Israel](http://olduli.nli.org.il/F/?func=find-b&local_base=NLX10&find_code=UID&request=987007565486205171) |
| Other | - [IdRef](https://www.idref.fr/027796124) |
<!-- image -->
Retrieved from " [https://en.wikipedia.org/w/index.php?title=Duck&amp;oldid=1246843351](https://en.wikipedia.org/w/index.php?title=Duck&oldid=1246843351) "

24
tests/data/html/table_01.html vendored Normal file
View File

@@ -0,0 +1,24 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Header</h1>
<p>This is the first paragraph.</p>
<table>
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr>
<td>1...</td>
<td>2...</td>
</tr>
</table>
After table
</body>
</html>

24
tests/data/html/table_02.html vendored Normal file
View File

@@ -0,0 +1,24 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Header</h1>
<p>This is the first paragraph.</p>
<table>
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr>
<td>First Paragraph<br>Second Paragraph<br>Third Paragraph</td>
<td>2...</td>
</tr>
</table>
After table
</body>
</html>

28
tests/data/html/table_03.html vendored Normal file
View File

@@ -0,0 +1,28 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Header</h1>
<p>This is the first paragraph.</p>
<table>
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr>
<td>
<ul>
<li>First item</li><li>Second item</li><li>Third item</li>
</ul>
</td>
<td>2...</td>
</tr>
</table>
After table
</body>
</html>

29
tests/data/html/table_04.html vendored Normal file
View File

@@ -0,0 +1,29 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Header</h1>
<p>This is the first paragraph.</p>
<table>
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr>
<td>
Some text before list
<ul>
<li>First item</li><li>Second item</li><li>Third item</li>
</ul>
</td>
<td>2...</td>
</tr>
</table>
After table
</body>
</html>

33
tests/data/html/table_05.html vendored Normal file
View File

@@ -0,0 +1,33 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Header</h1>
<p>This is the first paragraph.</p>
<table>
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr>
<td>
<table>
<tr>
<td>A1</td><td>B1</td><td>C1</td>
</tr>
<tr>
<td>D1</td><td>E1</td><td>F1</td>
</tr>
</table>
</td>
<td>2...</td>
</tr>
</table>
After table
</body>
</html>

60
tests/data/html/table_06.html vendored Normal file
View File

@@ -0,0 +1,60 @@
<html>
<head>
<style>
table, th, td {border: 1px solid black; border-collapse: collapse;}
td {padding:30px;}
table {margin: 30px;}
</style>
</head>
<body>
<h1>Header</h1>
<p>This is the first paragraph.</p>
<table>
<tr>
<td>A</td>
<td>B</td>
</tr>
<tr>
<td>
<table>
<tr>
<td>A1</td><td>B1</td><td>C1</td>
</tr>
<tr>
<td>D1</td>
<td>
<table>
<tr>
<td>I</td><td>II</td>
</tr>
<tr>
<td>III</td><td>IV</td>
</tr>
<tr>
<td>V</td>
<td>
<table>
<tr>
<td>E1</td><td>E2</td>
</tr>
<tr>
<td>E3</td><td>E4</td>
</tr>
</table>
</td>
</tr>
<tr>
<td>VII</td><td>VIII</td>
</tr>
</table>
</td>
<td>F1</td>
</tr>
</table>
</td>
<td>2...</td>
</tr>
</table>
After table
</body>
</html>