Merge branch 'main' of github.com:DS4SD/docling into cau/vis-and-profiling-options

This commit is contained in:
Christoph Auer 2024-10-28 13:21:45 +01:00
commit a00f01cf07
8 changed files with 309 additions and 281 deletions

View File

@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def get_direct_text(self, item): def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists).""" """Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False) text = item.find(string=True, recursive=False)
if isinstance(text, str): if isinstance(text, str):
return text.strip() return text.strip()
@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(item, str): if isinstance(item, str):
return [item] return [item]
result.append(self.get_direct_text(item)) if item.name not in ["ul", "ol"]:
try:
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
try: return "".join(result) + " "
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
return " ".join(result)
def handle_header(self, element, idx, doc): def handle_header(self, element, idx, doc):
"""Handles header tags (h1, h2, etc.).""" """Handles header tags (h1, h2, etc.)."""
@ -255,7 +253,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if nested_lists: if nested_lists:
name = element.name name = element.name
text = self.get_direct_text(element) # Text in list item can be hidden within hierarchy, hence
# we need to extract it recursively
text = self.extract_text_recursively(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
marker = "" marker = ""
enumerated = False enumerated = False
@ -263,14 +266,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = str(index_in_list) marker = str(index_in_list)
enumerated = True enumerated = True
# create a list-item if len(text) > 0:
self.parents[self.level + 1] = doc.add_list_item( # create a list-item
text=text, self.parents[self.level + 1] = doc.add_list_item(
enumerated=enumerated, text=text,
marker=marker, enumerated=enumerated,
parent=self.parents[self.level], marker=marker,
) parent=self.parents[self.level],
self.level += 1 )
self.level += 1
self.walk(element, doc) self.walk(element, doc)

View File

@ -135,11 +135,29 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc_label = DocItemLabel.TITLE doc_label = DocItemLabel.TITLE
else: else:
doc_label = DocItemLabel.SECTION_HEADER doc_label = DocItemLabel.SECTION_HEADER
snippet_text = element.children[0].children.strip()
parent_element = doc.add_text( # Header could have arbitrary inclusion of bold, italic or emphasis,
label=doc_label, parent=parent_element, text=snippet_text # hence we need to traverse the tree to get full text of a header
) strings = []
# Define a recursive function to traverse the tree
def traverse(node):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.List): elif isinstance(element, marko.block.List):
self.close_table(doc) self.close_table(doc)
@ -286,6 +304,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parsed_ast = marko_parser.parse(self.markdown) parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST # Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None) self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
else: else:
raise RuntimeError( raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init." f"Cannot convert md with {self.document_hash} because the backend failed to init."

View File

@ -14,7 +14,7 @@ from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
USE_V2 = True USE_V2 = True
USE_LEGACY = False USE_LEGACY = True
def export_documents( def export_documents(
@ -78,7 +78,7 @@ def export_documents(
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open( with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8" "w", encoding="utf-8"
) as fp: ) as fp:
fp.write(conv_res.legacy_document.export_to_doctags()) fp.write(conv_res.legacy_document.export_to_document_tokens())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS: elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info( _log.info(

37
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@ -196,8 +196,8 @@ files = [
lazy-object-proxy = ">=1.4.0" lazy-object-proxy = ">=1.4.0"
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
wrapt = [ wrapt = [
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
{version = ">=1.11,<2", markers = "python_version < \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
] ]
[[package]] [[package]]
@ -840,8 +840,8 @@ files = [
docling-core = ">=2.0,<3.0" docling-core = ">=2.0,<3.0"
docutils = "!=0.21" docutils = "!=0.21"
numpy = [ numpy = [
{version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""}, {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""},
{version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""},
] ]
pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""} pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""}
python-dotenv = ">=1.0.0,<2.0.0" python-dotenv = ">=1.0.0,<2.0.0"
@ -894,13 +894,13 @@ files = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.1.0" version = "2.2.1"
description = "A python library to define and validate data types in Docling." description = "A python library to define and validate data types in Docling."
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_core-2.1.0-py3-none-any.whl", hash = "sha256:4ccf9c44f8d7cf663657283baea4c4a36e1c4d1fba7df6b70ebc2c16b58f11a4"}, {file = "docling_core-2.2.1-py3-none-any.whl", hash = "sha256:65ed05331f387410950e10d7d2347eae770ab7dc4b5a632715aaa7c66c158cb5"},
{file = "docling_core-2.1.0.tar.gz", hash = "sha256:76ba3cb0a912db712aa89618746d279f1276b943c259dcf9d0b335a30cf7c99e"}, {file = "docling_core-2.2.1.tar.gz", hash = "sha256:4893369fe2aac9dff26c85a4ff87990f2e1645d9e16473ac7309e3459a3c4219"},
] ]
[package.dependencies] [package.dependencies]
@ -928,8 +928,8 @@ jsonlines = ">=3.1.0,<4.0.0"
lxml = ">=4.9.1,<5.0.0" lxml = ">=4.9.1,<5.0.0"
mean_average_precision = ">=2021.4.26.0,<2022.0.0.0" mean_average_precision = ">=2021.4.26.0,<2022.0.0.0"
numpy = [ numpy = [
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""}, {version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""},
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
] ]
opencv-python-headless = ">=4.6.0.66,<5.0.0.0" opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
Pillow = ">=10.0.0,<11.0.0" Pillow = ">=10.0.0,<11.0.0"
@ -2074,8 +2074,8 @@ jsonpatch = ">=1.33,<2.0"
langsmith = ">=0.1.112,<0.2.0" langsmith = ">=0.1.112,<0.2.0"
packaging = ">=23.2,<25" packaging = ">=23.2,<25"
pydantic = [ pydantic = [
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
] ]
PyYAML = ">=5.3" PyYAML = ">=5.3"
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
@ -2143,8 +2143,8 @@ files = [
httpx = ">=0.23.0,<1" httpx = ">=0.23.0,<1"
orjson = ">=3.9.14,<4.0.0" orjson = ">=3.9.14,<4.0.0"
pydantic = [ pydantic = [
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
] ]
requests = ">=2,<3" requests = ">=2,<3"
requests-toolbelt = ">=1.0.0,<2.0.0" requests-toolbelt = ">=1.0.0,<2.0.0"
@ -3514,10 +3514,10 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
] ]
[[package]] [[package]]
@ -3666,9 +3666,9 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
] ]
python-dateutil = ">=2.8.2" python-dateutil = ">=2.8.2"
pytz = ">=2020.1" pytz = ">=2020.1"
@ -4265,8 +4265,8 @@ files = [
annotated-types = ">=0.6.0" annotated-types = ">=0.6.0"
pydantic-core = "2.23.4" pydantic-core = "2.23.4"
typing-extensions = [ typing-extensions = [
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
{version = ">=4.6.1", markers = "python_version < \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""},
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
] ]
[package.extras] [package.extras]
@ -4434,8 +4434,8 @@ files = [
astroid = ">=2.15.8,<=2.17.0-dev0" astroid = ">=2.15.8,<=2.17.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [ dill = [
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
{version = ">=0.2", markers = "python_version < \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
] ]
isort = ">=4.2.5,<6" isort = ">=4.2.5,<6"
mccabe = ">=0.6,<0.8" mccabe = ">=0.6,<0.8"
@ -5596,6 +5596,11 @@ files = [
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@ -7175,4 +7180,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "ae89a74730fdc66a8af64cca75e8713a63592c30894d5cd5ea950f24839c10d9" content-hash = "48127a4b7e05f31a1c9f2c6f9b0ac8da61ac67309a8dd020b41e7ec82ecff38e"

View File

@ -37,7 +37,7 @@ torchvision = [
###################### ######################
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = "^2.1.0" docling-core = "^2.2.1"
docling-ibm-models = "^2.0.1" docling-ibm-models = "^2.0.1"
deepsearch-glm = "^0.26.1" deepsearch-glm = "^0.26.1"
filetype = "^1.2.0" filetype = "^1.2.0"

View File

@ -139,13 +139,13 @@ tention encoding is then multiplied to the encoded image to produce a feature fo
The output features for each table cell are then fed into the feed-forward network (FFN). The FFN consists of a Multi-Layer Perceptron (3 layers with ReLU activation function) that predicts the normalized coordinates for the bounding box of each table cell. Finally, the predicted bounding boxes are classified based on whether they are empty or not using a linear layer. The output features for each table cell are then fed into the feed-forward network (FFN). The FFN consists of a Multi-Layer Perceptron (3 layers with ReLU activation function) that predicts the normalized coordinates for the bounding box of each table cell. Finally, the predicted bounding boxes are classified based on whether they are empty or not using a linear layer.
Loss Functions. We formulate a multi-task loss Eq. 2 to train our network. The Cross-Entropy loss (denoted as l$_{s}$ ) is used to train the Structure Decoder which predicts the structure tokens. As for the Cell BBox Decoder it is trained with a combination of losses denoted as l$_{box}$ . l$_{box}$ consists of the generally used l$_{1}$ loss for object detection and the IoU loss ( l$_{iou}$ ) to be scale invariant as explained in [25]. In comparison to DETR, we do not use the Hungarian algorithm [15] to match the predicted bounding boxes with the ground-truth boxes, as we have already achieved a one-toone match through two steps: 1) Our token input sequence is naturally ordered, therefore the hidden states of the table data cells are also in order when they are provided as input to the Cell BBox Decoder , and 2) Our bounding boxes generation mechanism (see Sec. 3) ensures a one-to-one mapping between the cell content and its bounding box for all post-processed datasets. Loss Functions. We formulate a multi-task loss Eq. 2 to train our network. The Cross-Entropy loss (denoted as l$\_{s}$ ) is used to train the Structure Decoder which predicts the structure tokens. As for the Cell BBox Decoder it is trained with a combination of losses denoted as l$\_{box}$ . l$\_{box}$ consists of the generally used l$\_{1}$ loss for object detection and the IoU loss ( l$\_{iou}$ ) to be scale invariant as explained in [25]. In comparison to DETR, we do not use the Hungarian algorithm [15] to match the predicted bounding boxes with the ground-truth boxes, as we have already achieved a one-toone match through two steps: 1) Our token input sequence is naturally ordered, therefore the hidden states of the table data cells are also in order when they are provided as input to the Cell BBox Decoder , and 2) Our bounding boxes generation mechanism (see Sec. 3) ensures a one-to-one mapping between the cell content and its bounding box for all post-processed datasets.
The loss used to train the TableFormer can be defined as following: The loss used to train the TableFormer can be defined as following:
l$_{box}$ = λ$_{iou}$l$_{iou}$ + λ$_{l}$$_{1}$ l = λl$_{s}$ + (1 - λ ) l$_{box}$ (1) l$\_{box}$ = λ$\_{iou}$l$\_{iou}$ + λ$\_{l}$$\_{1}$ l = λl$\_{s}$ + (1 - λ ) l$\_{box}$ (1)
where λ ∈ [0, 1], and λ$_{iou}$, λ$_{l}$$_{1}$ ∈$_{R}$ are hyper-parameters. where λ ∈ [0, 1], and λ$\_{iou}$, λ$\_{l}$$\_{1}$ ∈$\_{R}$ are hyper-parameters.
## 5. Experimental Results ## 5. Experimental Results
@ -175,9 +175,9 @@ We also share our baseline results on the challenging SynthTabNet dataset. Throu
The Tree-Edit-Distance-Based Similarity (TEDS) metric was introduced in [37]. It represents the prediction, and ground-truth as a tree structure of HTML tags. This similarity is calculated as: The Tree-Edit-Distance-Based Similarity (TEDS) metric was introduced in [37]. It represents the prediction, and ground-truth as a tree structure of HTML tags. This similarity is calculated as:
TEDS ( T$_{a}$, T$_{b}$ ) = 1 - EditDist ( T$_{a}$, T$_{b}$ ) max ( | T$_{a}$ | , | T$_{b}$ | ) (3) TEDS ( T$\_{a}$, T$\_{b}$ ) = 1 - EditDist ( T$\_{a}$, T$\_{b}$ ) max ( | T$\_{a}$ | , | T$\_{b}$ | ) (3)
where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T . where T$\_{a}$ and T$\_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .
## 5.4. Quantitative Analysis ## 5.4. Quantitative Analysis
@ -376,9 +376,9 @@ Here is a step-by-step description of the prediction postprocessing:
- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. - 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.
- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: - 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:
alignment = arg min c { D$_{c}$ } D$_{c}$ = max { x$_{c}$ } - min { x$_{c}$ } (4) alignment = arg min c { D$\_{c}$ } D$\_{c}$ = max { x$\_{c}$ } - min { x$\_{c}$ } (4)
where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point. where c is one of { left, centroid, right } and x$\_{c}$ is the xcoordinate for the corresponding point.
- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- - 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-

View File

@ -153,7 +153,7 @@ Table 2. TSR and cell detection results compared between OTSL and HTML on the Pu
To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes. To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes.
Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444\_006\_00.png" PubTabNet. μ
<!-- image --> <!-- image -->
@ -161,7 +161,7 @@ Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (
Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet. Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406\_003\_01.png" PubTabNet.
<!-- image --> <!-- image -->

File diff suppressed because it is too large Load Diff