fix(layout,table): orientation-aware layout and table detection

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro 2025-07-04 10:12:36 +02:00
parent a47fd8372d
commit 8ffa01bc9f
26 changed files with 571 additions and 96 deletions

View File

@ -1,8 +1,8 @@
import copy import copy
import logging import logging
import warnings import warnings
from copy import deepcopy
from collections.abc import Iterable from collections.abc import Iterable
from copy import deepcopy
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -19,7 +19,7 @@ from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.orientation import detect_orientation from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
from docling.utils.visualization import draw_clusters from docling.utils.visualization import draw_clusters
@ -105,7 +105,6 @@ class LayoutModel(BasePageModel):
self, self,
conv_res, conv_res,
page, page,
page_orientation: int,
clusters, clusters,
mode_prefix: str, mode_prefix: str,
show: bool = False, show: bool = False,
@ -119,10 +118,6 @@ class LayoutModel(BasePageModel):
page_image = deepcopy(page.image) page_image = deepcopy(page.image)
scale_x = page_image.width / page.size.width scale_x = page_image.width / page.size.width
scale_y = page_image.height / page.size.height scale_y = page_image.height / page.size.height
if page_orientation:
page_image = page_image.rotate(-page_orientation, expand=True)
if abs(page_orientation) in [90, 270]:
scale_x, scale_y = scale_y, scale_x
# Filter clusters for left and right images # Filter clusters for left and right images
exclude_labels = { exclude_labels = {
DocItemLabel.FORM, DocItemLabel.FORM,
@ -138,9 +133,6 @@ class LayoutModel(BasePageModel):
# Draw clusters on both images # Draw clusters on both images
draw_clusters(left_image, left_clusters, scale_x, scale_y) draw_clusters(left_image, left_clusters, scale_x, scale_y)
draw_clusters(right_image, right_clusters, scale_x, scale_y) draw_clusters(right_image, right_clusters, scale_x, scale_y)
if page_orientation:
left_image = left_image.rotate(page_orientation, expand=True)
right_image = right_image.rotate(page_orientation, expand=True)
# Combine the images side by side # Combine the images side by side
combined_width = left_image.width * 2 combined_width = left_image.width * 2
combined_height = left_image.height combined_height = left_image.height
@ -183,11 +175,16 @@ class LayoutModel(BasePageModel):
.replace(" ", "_") .replace(" ", "_")
.replace("-", "_") .replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types ) # Temporary, until docling-ibm-model uses docling-core types
bbox = BoundingBox.model_validate(pred_item)
if page_orientation:
bbox = rotate_bounding_box(
bbox, page_orientation, page_image.size
).to_bounding_box()
cluster = Cluster( cluster = Cluster(
id=ix, id=ix,
label=label, label=label,
confidence=pred_item["confidence"], confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item), bbox=bbox,
cells=[], cells=[],
) )
clusters.append(cluster) clusters.append(cluster)
@ -196,7 +193,6 @@ class LayoutModel(BasePageModel):
self.draw_clusters_and_cells_side_by_side( self.draw_clusters_and_cells_side_by_side(
conv_res, conv_res,
page, page,
page_orientation,
clusters, clusters,
mode_prefix="raw", mode_prefix="raw",
) )
@ -234,7 +230,6 @@ class LayoutModel(BasePageModel):
self.draw_clusters_and_cells_side_by_side( self.draw_clusters_and_cells_side_by_side(
conv_res, conv_res,
page, page,
page_orientation,
processed_clusters, processed_clusters,
mode_prefix="postprocessed", mode_prefix="postprocessed",
) )

View File

@ -1,8 +1,7 @@
import copy import copy
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Iterable, Optional, Tuple, cast
import numpy import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
TextCellUnit, TextCellUnit,
) )
from PIL import ImageDraw from PIL import ImageDraw
from PIL.Image import Image
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models" _model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer" _model_path = "model_artifacts/tableformer"
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
page.predictions.tablestructure = ( page.predictions.tablestructure = (
TableStructurePrediction() TableStructurePrediction()
) # dummy ) # dummy
cells_orientation = detect_orientation(page.cells)
in_tables = [ # Keep only table bboxes
( in_tables_clusters = [
cluster, cluster
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
for cluster in page.predictions.layout.clusters for cluster in page.predictions.layout.clusters
if cluster.label if cluster.label in self._table_labels
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
] ]
if not len(in_tables):
if not len(in_tables_clusters):
yield page yield page
continue continue
# Rotate and scale table image
page_im = cast(Image, page.get_image())
scaled_page_im: Image = cast(
Image, page.get_image(scale=self.scale)
)
if cells_orientation:
scaled_page_im = scaled_page_im.rotate(
-cells_orientation, expand=True
)
page_input = { page_input = {
"width": page.size.width * self.scale, "width": scaled_page_im.size[0],
"height": page.size.height * self.scale, "height": scaled_page_im.size[1],
"image": numpy.asarray(page.get_image(scale=self.scale)), "image": numpy.asarray(scaled_page_im),
} }
# Rotate and scale table cells
in_tables = [
(
c,
[
round(x) * self.scale
for x in _rotate_bbox(
c.bbox,
orientation=-cells_orientation,
im_size=page_im.size,
)
.to_top_left_origin(page_im.size[1])
.as_tuple()
],
)
for c in in_tables_clusters
]
table_clusters, table_bboxes = zip(*in_tables) table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes): if len(table_bboxes):
@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
scale=self.scale scale=self.scale
) )
) )
new_bbox = _rotate_bbox(
new_cell.to_bounding_box(),
orientation=-cells_orientation,
im_size=scaled_page_im.size,
).model_dump()
tokens.append( tokens.append(
{ {
"id": new_cell.index, "id": new_cell.index,
"text": new_cell.text, "text": new_cell.text,
"bbox": new_cell.rect.to_bounding_box().model_dump(), "bbox": new_bbox,
} }
) )
page_input["tokens"] = tokens page_input["tokens"] = tokens
@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
) )
yield page yield page
def _rotate_bbox(
    bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
) -> BoundingBox:
    """Return ``bbox`` rotated by ``orientation``, or ``bbox`` itself when no rotation applies.

    Thin wrapper around ``rotate_bounding_box`` that short-circuits the
    no-rotation case and converts the rotated result back to a
    ``BoundingBox`` via ``to_bounding_box()``.

    Args:
        bbox: The box to rotate.
        orientation: Rotation angle forwarded to ``rotate_bounding_box``.
            A falsy value (``0``) means the input is returned untouched.
        im_size: Image size forwarded to ``rotate_bounding_box``; the call
            sites in this module pass a PIL image's ``size`` attribute.

    Returns:
        The rotated box, or the very same ``bbox`` object when
        ``orientation`` is falsy.
    """
    # Nothing to do for an upright page: hand back the input as-is.
    if not orientation:
        return bbox
    rotated = rotate_bounding_box(bbox, orientation, im_size)
    return rotated.to_bounding_box()

View File

@ -27,7 +27,6 @@ from docling.utils.ocr_utils import (
parse_tesseract_orientation, parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle, tesseract_box_to_bounding_rectangle,
) )
from docling.utils.orientation import Box
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle from docling_core.types.doc.page import BoundingRectangle
@ -43,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
orientation: int, orientation: int,
im_size: Tuple[int, int], im_size: Tuple[int, int],
) -> BoundingRectangle: ) -> BoundingRectangle:
# box is in the top, left, height, width format, top left coordinates # bbox is in the top, left, height, width format, top left coordinates
# Tesseract ran on the document rotated by minus the orientation, so the detected
# box has to be rotated by the orientation angle to map it back onto the page
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size) rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
rect = BoundingRectangle( rect = BoundingRectangle(
r_x0=rect.r_x0 / scale, r_x0=rect.r_x0 / scale,
@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
r_y2=rect.r_y2 / scale, r_y2=rect.r_y2 / scale,
r_x3=rect.r_x3 / scale, r_x3=rect.r_x3 / scale,
r_y3=rect.r_y3 / scale, r_y3=rect.r_y3 / scale,
coord_origin=CoordOrigin.TOPLEFT, coord_origin=rect.coord_origin,
) )
if original_offset is not None: if original_offset is not None:
if original_offset.coord_origin is not CoordOrigin.TOPLEFT: if original_offset.coord_origin is not CoordOrigin.TOPLEFT:

View File

@ -1,13 +1,15 @@
from collections import Counter from collections import Counter
from operator import itemgetter from operator import itemgetter
from typing import Tuple
from docling_core.types.doc.page import TextCell from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
_ORIENTATIONS = [0, 90, 180, 270] CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def _clipped_orientation(angle: float) -> int: def _clipped_orientation(angle: float) -> int:
return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1] return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
def detect_orientation(cells: list[TextCell]) -> int: def detect_orientation(cells: list[TextCell]) -> int:
@ -15,12 +17,6 @@ def detect_orientation(cells: list[TextCell]) -> int:
return 0 return 0
orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells) orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
return max(orientation_counter.items(), key=itemgetter(1))[0] return max(orientation_counter.items(), key=itemgetter(1))[0]
from typing import Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def rotate_bounding_box( def rotate_bounding_box(

View File

@ -1,3 +1,9 @@
<document> <document>
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph> <table>
<location><page_1><loc_12><loc_39><loc_67><loc_87></location>
<row_0><col_0><body></col_0><col_1><col_header>Column 0</col_1><col_2><col_header>Column 1</col_2><col_3><col_header>Column 2</col_3></row_0>
<row_1><col_0><row_header>this is row 0</col_0><col_1><body>some cells</col_1><col_2><body>have content</col_2><col_3><body>and</col_3></row_1>
<row_2><col_0><row_header>and row 1</col_0><col_1><body></col_1><col_2><body>other</col_2><col_3><body>have</col_3></row_2>
<row_3><col_0><row_header>and last row 2</col_0><col_1><body>nothing</col_1><col_2><body></col_2><col_3><body>inside</col_3></row_3>
</table>
</document> </document>

View File

@ -1 +1,5 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package | | Column 0 | Column 1 | Column 2 |
|----------------|------------|--------------|------------|
| this is row 0 | some cells | have content | and |
| and row 1 | | other | have |
| and last row 2 | nothing | | inside |

View File

@ -1,3 +0,0 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document>

View File

@ -1 +0,0 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1 +0,0 @@
package

View File

@ -1 +0,0 @@
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]

View File

@ -1,5 +1,9 @@
<document> <document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph> <table>
<paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph> <location><page_1><loc_33><loc_13><loc_88><loc_61></location>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph> <row_0><col_0><col_header>inside</col_0><col_1><body></col_1><col_2><col_header>nothing</col_2><col_3><col_header>and last row 2</col_3></row_0>
<row_1><col_0><body>have</col_0><col_1><body>other</col_1><col_2><body></col_2><col_3><body>and row 1</col_3></row_1>
<row_2><col_0><body>and</col_0><col_1><body>have content</col_1><col_2><body>some cells</col_2><col_3><body>this is row 0</col_3></row_2>
<row_3><col_0><body>Column 2</col_0><col_1><body>Column 1</col_1><col_2><body>Column 0</col_2><col_3><body></col_3></row_3>
</table>
</document> </document>

View File

@ -1,5 +1,5 @@
package | inside | | nothing | and last row 2 |
|----------|--------------|------------|------------------|
JSON and Markdown in an easy self contained | have | other | | and row 1 |
| and | have content | some cells | this is row 0 |
Docling bundles PDF document conversion to | Column 2 | Column 1 | Column 0 | |

View File

@ -1,3 +1,9 @@
<document> <document>
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph> <table>
<location><page_1><loc_39><loc_33><loc_87><loc_88></location>
<row_0><col_0><body>and last row 2</col_0><col_1><body>and row 1</col_1><col_2><body>this is row 0</col_2><col_3><body></col_3></row_0>
<row_1><col_0><body>nothing</col_0><col_1><body></col_1><col_2><body>some cells</col_2><col_3><body>Column 0</col_3></row_1>
<row_2><col_0><body></col_0><col_1><body>other</col_1><col_2><body>have content</col_2><col_3><body>Column 1</col_3></row_2>
<row_3><col_0><body>inside</col_0><col_1><body>have</col_1><col_2><body>and</col_2><col_3><body>Column 2</col_3></row_3>
</table>
</document> </document>

View File

@ -1 +1,5 @@
package | and last row 2 | and row 1 | this is row 0 | |
|------------------|-------------|-----------------|----------|
| nothing | | some cells | Column 0 |
| | other | have content | Column 1 |
| inside | have | and | Column 2 |

View File

@ -1,4 +1,9 @@
<document> <document>
<paragraph><location><page_1><loc_9><loc_12><loc_11><loc_85></location>Docling bundles PDF document conversion to</paragraph> <table>
<paragraph><location><page_1><loc_12><loc_12><loc_15><loc_85></location><location><page_1><loc_12><loc_12><loc_15><loc_85></location>JSON and Markdown in an easy self contained package</paragraph> <location><page_1><loc_13><loc_12><loc_61><loc_67></location>
<row_0><col_0><body>Column 2</col_0><col_1><body>and</col_1><col_2><body>have</col_2><col_3><body>inside</col_3></row_0>
<row_1><col_0><body>Column 1</col_0><col_1><body>have content</col_1><col_2><body>other</col_2><col_3><body></col_3></row_1>
<row_2><col_0><body>Column 0</col_0><col_1><body>some cells</col_1><col_2><body></col_2><col_3><body>nothing</col_3></row_2>
<row_3><col_0><body></col_0><col_1><body>this is row 0</col_1><col_2><body>and row 1</col_2><col_3><body>and last row 2</col_3></row_3>
</table>
</document> </document>

View File

@ -27,53 +27,468 @@
"file-info": { "file-info": {
"filename": "ocr_test_rotated_90.pdf", "filename": "ocr_test_rotated_90.pdf",
"filename-prov": null, "filename-prov": null,
"document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "document-hash": "2fb20caf4f54c878a0b454b496010d92adc6ae1b7f10fbd9ba1ba26260f818a8",
"#-pages": 1, "#-pages": 1,
"collection-name": null, "collection-name": null,
"description": null, "description": null,
"page-hashes": [ "page-hashes": [
{ {
"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "hash": "56c847ad7c5ab9f0346a325510af001ab66a9bb45f65ffc7bbfc60c929def7d2",
"model": "default", "model": "default",
"page": 1 "page": 1
} }
] ]
}, },
"main-text": [ "main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{ {
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
131.21306574279092, 75.13359832763672,
74.12495603322407, 102.99908447265625,
152.19606490864376, 361.18695068359375,
154.19400205373182 562.1403198242188
], ],
"page": 1, "page": 1,
"span": [ "span": [
0, 0,
7 0
], ],
"__ref_s3_data": null "__ref_s3_data": null
} }
], ],
"text": "package", "text": "",
"type": "paragraph", "type": "table",
"payload": null, "payload": null,
"name": "Text", "#-cols": 4,
"font": null "#-rows": 4,
"data": [
[
{
"bbox": [
105.0718660651769,
304.7354643560275,
119.73306194406335,
369.59883715876185
],
"spans": [
[
0,
0
]
],
"text": "Column 2",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
172.26899264661517,
324.3168597625203,
188.15195177751215,
352.46511670018316
],
"spans": [
[
0,
1
]
],
"text": "and",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
240.68788382926402,
321.869185135892,
256.570842960161,
356.13662847492196
],
"spans": [
[
0,
2
]
],
"text": "have",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
312.772072637728,
319.42151173034614,
326.21150018118874,
359.8081389276117
],
"spans": [
[
0,
3
]
],
"text": "inside",
"type": "body",
"col": 3,
"col-header": false,
"col-span": [
3,
4
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
],
[
{
"bbox": [
105.0718660651769,
419.77616156495424,
119.73306194406335,
483.4156981046677
],
"spans": [
[
1,
0
]
],
"text": "Column 1",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
172.26898999097682,
408.7616301134671,
185.70842261785268,
495.6540658231026
],
"spans": [
[
1,
1
]
],
"text": "have content",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
240.68788377535307,
433.23837164942523,
255.34907711253194,
468.729651251476
],
"spans": [
[
1,
2
]
],
"text": "other",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": null,
"spans": [
[
1,
3
]
],
"text": "",
"type": "body"
}
],
[
{
"bbox": [
105.07186605295925,
532.3691850430223,
119.73306193184567,
597.2325578457567
],
"spans": [
[
2,
0
]
],
"text": "Column 0",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
172.26899069197702,
529.9215107729757,
186.93018720629036,
600.9040699770771
],
"spans": [
[
2,
1
]
],
"text": "some cells",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": null,
"spans": [
[
2,
2
]
],
"text": "",
"type": "body"
},
{
"bbox": [
311.49999737299976,
536.775000315586,
332.5000022770002,
592.9083316144141
],
"spans": [
[
2,
3
]
],
"text": "nothing",
"type": "body",
"col": 3,
"col-header": false,
"col-span": [
3,
4
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
}
],
[
{
"bbox": null,
"spans": [
[
3,
0
]
],
"text": "",
"type": "body"
},
{
"bbox": [
172.2689900422697,
638.8430233885732,
186.93018846286373,
719.6162777831045
],
"spans": [
[
3,
1
]
],
"text": "this is row 0",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 3,
"row-header": false,
"row-span": [
3,
4
]
},
{
"bbox": [
240.68788248006402,
647.4098827174411,
255.34907835895044,
712.2732555201754
],
"spans": [
[
3,
2
]
],
"text": "and row 1",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 3,
"row-header": false,
"row-span": [
3,
4
]
},
{
"bbox": [
313.9938353514431,
633.9476737903873,
327.43326861374595,
725.735464724632
],
"spans": [
[
3,
3
]
],
"text": "and last row 2",
"type": "body",
"col": 3,
"col-header": false,
"col-span": [
3,
4
],
"row": 3,
"row-header": false,
"row-span": [
3,
4
]
}
]
],
"model": null,
"bounding-box": null
} }
], ],
"figures": [],
"tables": [],
"bitmaps": null, "bitmaps": null,
"equations": [], "equations": [],
"footnotes": [], "footnotes": [],
"page-dimensions": [ "page-dimensions": [
{ {
"height": 595.201171875, "height": 842.0,
"page": 1, "page": 1,
"width": 841.9216918945312 "width": 595.0
} }
], ],
"page-footers": [], "page-footers": [],

View File

@ -1,3 +1,5 @@
Docling bundles PDF document conversion to | Column 2 | and | have | inside |
|------------|---------------|-----------|----------------|
JSON and Markdown in an easy self contained package | Column 1 | have content | other | |
| Column 0 | some cells | | nothing |
| | this is row 0 | and row 1 | and last row 2 |

View File

@ -1 +1,5 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package | | Column 0 | Column 1 | Column 2 |
|----------------|------------|--------------|------------|
| this is row 0 | some cells | have content | and |
| and row 1 | | other | have |
| and last row 2 | nothing | | inside |

View File

@ -1,5 +1,5 @@
package | inside | | nothing | and last row 2 |
|----------|--------------|------------|------------------|
JSON and Markdown in an easy self contained | have | other | | and row 1 |
| and | have content | some cells | this is row 0 |
Docling bundles PDF document conversion to | Column 2 | Column 1 | Column 0 | |

View File

@ -1 +1,5 @@
package | and last row 2 | and row 1 | this is row 0 | |
|------------------|-------------|-----------------|----------|
| nothing | | some cells | Column 0 |
| | other | have content | Column 1 |
| inside | have | and | Column 2 |

View File

@ -1,3 +1,5 @@
Docling bundles PDF document conversion to | Column 2 | and | have | inside |
|------------|---------------|-----------|----------------|
JSON and Markdown in an easy self contained package | Column 1 | have content | other | |
| Column 0 | some cells | | nothing |
| | this is row 0 | and row 1 | and last row 2 |

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.