fix(layout,table): orientation-aware layout and table detection

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro 2025-07-04 10:12:36 +02:00
parent a47fd8372d
commit 8ffa01bc9f
26 changed files with 571 additions and 96 deletions

View File

@ -1,8 +1,8 @@
import copy
import logging
import warnings
from copy import deepcopy
from collections.abc import Iterable
from copy import deepcopy
from pathlib import Path
from typing import Optional
@ -19,7 +19,7 @@ from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.orientation import detect_orientation
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder
from docling.utils.visualization import draw_clusters
@ -105,7 +105,6 @@ class LayoutModel(BasePageModel):
self,
conv_res,
page,
page_orientation: int,
clusters,
mode_prefix: str,
show: bool = False,
@ -119,10 +118,6 @@ class LayoutModel(BasePageModel):
page_image = deepcopy(page.image)
scale_x = page_image.width / page.size.width
scale_y = page_image.height / page.size.height
if page_orientation:
page_image = page_image.rotate(-page_orientation, expand=True)
if abs(page_orientation) in [90, 270]:
scale_x, scale_y = scale_y, scale_x
# Filter clusters for left and right images
exclude_labels = {
DocItemLabel.FORM,
@ -138,9 +133,6 @@ class LayoutModel(BasePageModel):
# Draw clusters on both images
draw_clusters(left_image, left_clusters, scale_x, scale_y)
draw_clusters(right_image, right_clusters, scale_x, scale_y)
if page_orientation:
left_image = left_image.rotate(page_orientation, expand=True)
right_image = right_image.rotate(page_orientation, expand=True)
# Combine the images side by side
combined_width = left_image.width * 2
combined_height = left_image.height
@ -183,11 +175,16 @@ class LayoutModel(BasePageModel):
.replace(" ", "_")
.replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
bbox = BoundingBox.model_validate(pred_item)
if page_orientation:
bbox = rotate_bounding_box(
bbox, page_orientation, page_image.size
).to_bounding_box()
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
bbox=bbox,
cells=[],
)
clusters.append(cluster)
@ -196,7 +193,6 @@ class LayoutModel(BasePageModel):
self.draw_clusters_and_cells_side_by_side(
conv_res,
page,
page_orientation,
clusters,
mode_prefix="raw",
)
@ -234,7 +230,6 @@ class LayoutModel(BasePageModel):
self.draw_clusters_and_cells_side_by_side(
conv_res,
page,
page_orientation,
processed_clusters,
mode_prefix="postprocessed",
)

View File

@ -1,8 +1,7 @@
import copy
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from typing import Iterable, Optional, Tuple, cast
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
TextCellUnit,
)
from PIL import ImageDraw
from PIL.Image import Image
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder
@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
def __init__(
self,
enabled: bool,
@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
page.predictions.tablestructure = (
TableStructurePrediction()
) # dummy
in_tables = [
(
cluster,
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
cells_orientation = detect_orientation(page.cells)
# Keep only table bboxes
in_tables_clusters = [
cluster
for cluster in page.predictions.layout.clusters
if cluster.label
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
if cluster.label in self._table_labels
]
if not len(in_tables):
if not len(in_tables_clusters):
yield page
continue
# Rotate and scale table image
page_im = cast(Image, page.get_image())
scaled_page_im: Image = cast(
Image, page.get_image(scale=self.scale)
)
if cells_orientation:
scaled_page_im = scaled_page_im.rotate(
-cells_orientation, expand=True
)
page_input = {
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
"image": numpy.asarray(page.get_image(scale=self.scale)),
"width": scaled_page_im.size[0],
"height": scaled_page_im.size[1],
"image": numpy.asarray(scaled_page_im),
}
# Rotate and scale table cells
in_tables = [
(
c,
[
round(x) * self.scale
for x in _rotate_bbox(
c.bbox,
orientation=-cells_orientation,
im_size=page_im.size,
)
.to_top_left_origin(page_im.size[1])
.as_tuple()
],
)
for c in in_tables_clusters
]
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
scale=self.scale
)
)
new_bbox = _rotate_bbox(
new_cell.to_bounding_box(),
orientation=-cells_orientation,
im_size=scaled_page_im.size,
).model_dump()
tokens.append(
{
"id": new_cell.index,
"text": new_cell.text,
"bbox": new_cell.rect.to_bounding_box().model_dump(),
"bbox": new_bbox,
}
)
page_input["tokens"] = tokens
@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
)
yield page
def _rotate_bbox(
bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
) -> BoundingBox:
if orientation:
return rotate_bounding_box(bbox, orientation, im_size).to_bounding_box()
return bbox

View File

@ -27,7 +27,6 @@ from docling.utils.ocr_utils import (
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.orientation import Box
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
@ -43,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
orientation: int,
im_size: Tuple[int, int],
) -> BoundingRectangle:
# box is in the top, left, height, width format, top left coordinates
# bbox is in the top, left, height, width format, top left coordinates
# Tesseract was run on the document rotated by minus the orientation, so we have
# to rotate the detected box back by the orientation angle
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
rect = BoundingRectangle(
r_x0=rect.r_x0 / scale,
@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
r_y2=rect.r_y2 / scale,
r_x3=rect.r_x3 / scale,
r_y3=rect.r_y3 / scale,
coord_origin=CoordOrigin.TOPLEFT,
coord_origin=rect.coord_origin,
)
if original_offset is not None:
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:

View File

@ -1,13 +1,15 @@
from collections import Counter
from operator import itemgetter
from typing import Tuple
from docling_core.types.doc.page import TextCell
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
_ORIENTATIONS = [0, 90, 180, 270]
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def _clipped_orientation(angle: float) -> int:
return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1]
return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
def detect_orientation(cells: list[TextCell]) -> int:
@ -15,12 +17,6 @@ def detect_orientation(cells: list[TextCell]) -> int:
return 0
orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
return max(orientation_counter.items(), key=itemgetter(1))[0]
from typing import Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def rotate_bounding_box(

View File

@ -1,3 +1,9 @@
<document>
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
<table>
<location><page_1><loc_12><loc_39><loc_67><loc_87></location>
<row_0><col_0><body></col_0><col_1><col_header>Column 0</col_1><col_2><col_header>Column 1</col_2><col_3><col_header>Column 2</col_3></row_0>
<row_1><col_0><row_header>this is row 0</col_0><col_1><body>some cells</col_1><col_2><body>have content</col_2><col_3><body>and</col_3></row_1>
<row_2><col_0><row_header>and row 1</col_0><col_1><body></col_1><col_2><body>other</col_2><col_3><body>have</col_3></row_2>
<row_3><col_0><row_header>and last row 2</col_0><col_1><body>nothing</col_1><col_2><body></col_2><col_3><body>inside</col_3></row_3>
</table>
</document>

View File

@ -1 +1,5 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
| | Column 0 | Column 1 | Column 2 |
|----------------|------------|--------------|------------|
| this is row 0 | some cells | have content | and |
| and row 1 | | other | have |
| and last row 2 | nothing | | inside |

View File

@ -1,3 +0,0 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document>

View File

@ -1 +0,0 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1 +0,0 @@
package

View File

@ -1 +0,0 @@
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]

View File

@ -1,5 +1,9 @@
<document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph>
<table>
<location><page_1><loc_33><loc_13><loc_88><loc_61></location>
<row_0><col_0><col_header>inside</col_0><col_1><body></col_1><col_2><col_header>nothing</col_2><col_3><col_header>and last row 2</col_3></row_0>
<row_1><col_0><body>have</col_0><col_1><body>other</col_1><col_2><body></col_2><col_3><body>and row 1</col_3></row_1>
<row_2><col_0><body>and</col_0><col_1><body>have content</col_1><col_2><body>some cells</col_2><col_3><body>this is row 0</col_3></row_2>
<row_3><col_0><body>Column 2</col_0><col_1><body>Column 1</col_1><col_2><body>Column 0</col_2><col_3><body></col_3></row_3>
</table>
</document>

View File

@ -1,5 +1,5 @@
package
JSON and Markdown in an easy self contained
Docling bundles PDF document conversion to
| inside | | nothing | and last row 2 |
|----------|--------------|------------|------------------|
| have | other | | and row 1 |
| and | have content | some cells | this is row 0 |
| Column 2 | Column 1 | Column 0 | |

View File

@ -1,3 +1,9 @@
<document>
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
<table>
<location><page_1><loc_39><loc_33><loc_87><loc_88></location>
<row_0><col_0><body>and last row 2</col_0><col_1><body>and row 1</col_1><col_2><body>this is row 0</col_2><col_3><body></col_3></row_0>
<row_1><col_0><body>nothing</col_0><col_1><body></col_1><col_2><body>some cells</col_2><col_3><body>Column 0</col_3></row_1>
<row_2><col_0><body></col_0><col_1><body>other</col_1><col_2><body>have content</col_2><col_3><body>Column 1</col_3></row_2>
<row_3><col_0><body>inside</col_0><col_1><body>have</col_1><col_2><body>and</col_2><col_3><body>Column 2</col_3></row_3>
</table>
</document>

View File

@ -1 +1,5 @@
package
| and last row 2 | and row 1 | this is row 0 | |
|------------------|-------------|-----------------|----------|
| nothing | | some cells | Column 0 |
| | other | have content | Column 1 |
| inside | have | and | Column 2 |

View File

@ -1,4 +1,9 @@
<document>
<paragraph><location><page_1><loc_9><loc_12><loc_11><loc_85></location>Docling bundles PDF document conversion to</paragraph>
<paragraph><location><page_1><loc_12><loc_12><loc_15><loc_85></location><location><page_1><loc_12><loc_12><loc_15><loc_85></location>JSON and Markdown in an easy self contained package</paragraph>
<table>
<location><page_1><loc_13><loc_12><loc_61><loc_67></location>
<row_0><col_0><body>Column 2</col_0><col_1><body>and</col_1><col_2><body>have</col_2><col_3><body>inside</col_3></row_0>
<row_1><col_0><body>Column 1</col_0><col_1><body>have content</col_1><col_2><body>other</col_2><col_3><body></col_3></row_1>
<row_2><col_0><body>Column 0</col_0><col_1><body>some cells</col_1><col_2><body></col_2><col_3><body>nothing</col_3></row_2>
<row_3><col_0><body></col_0><col_1><body>this is row 0</col_1><col_2><body>and row 1</col_2><col_3><body>and last row 2</col_3></row_3>
</table>
</document>

View File

@ -27,53 +27,468 @@
"file-info": {
"filename": "ocr_test_rotated_90.pdf",
"filename-prov": null,
"document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6",
"document-hash": "2fb20caf4f54c878a0b454b496010d92adc6ae1b7f10fbd9ba1ba26260f818a8",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3",
"hash": "56c847ad7c5ab9f0346a325510af001ab66a9bb45f65ffc7bbfc60c929def7d2",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{
"prov": [
{
"bbox": [
131.21306574279092,
74.12495603322407,
152.19606490864376,
154.19400205373182
75.13359832763672,
102.99908447265625,
361.18695068359375,
562.1403198242188
],
"page": 1,
"span": [
0,
7
0
],
"__ref_s3_data": null
}
],
"text": "package",
"type": "paragraph",
"text": "",
"type": "table",
"payload": null,
"name": "Text",
"font": null
"#-cols": 4,
"#-rows": 4,
"data": [
[
{
"bbox": [
105.0718660651769,
304.7354643560275,
119.73306194406335,
369.59883715876185
],
"spans": [
[
0,
0
]
],
"text": "Column 2",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
172.26899264661517,
324.3168597625203,
188.15195177751215,
352.46511670018316
],
"spans": [
[
0,
1
]
],
"text": "and",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
240.68788382926402,
321.869185135892,
256.570842960161,
356.13662847492196
],
"spans": [
[
0,
2
]
],
"text": "have",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
312.772072637728,
319.42151173034614,
326.21150018118874,
359.8081389276117
],
"spans": [
[
0,
3
]
],
"text": "inside",
"type": "body",
"col": 3,
"col-header": false,
"col-span": [
3,
4
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
],
[
{
"bbox": [
105.0718660651769,
419.77616156495424,
119.73306194406335,
483.4156981046677
],
"spans": [
[
1,
0
]
],
"text": "Column 1",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
172.26898999097682,
408.7616301134671,
185.70842261785268,
495.6540658231026
],
"spans": [
[
1,
1
]
],
"text": "have content",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
240.68788377535307,
433.23837164942523,
255.34907711253194,
468.729651251476
],
"spans": [
[
1,
2
]
],
"text": "other",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": null,
"spans": [
[
1,
3
]
],
"text": "",
"type": "body"
}
],
[
{
"bbox": [
105.07186605295925,
532.3691850430223,
119.73306193184567,
597.2325578457567
],
"spans": [
[
2,
0
]
],
"text": "Column 0",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
172.26899069197702,
529.9215107729757,
186.93018720629036,
600.9040699770771
],
"spans": [
[
2,
1
]
],
"text": "some cells",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": null,
"spans": [
[
2,
2
]
],
"text": "",
"type": "body"
},
{
"bbox": [
311.49999737299976,
536.775000315586,
332.5000022770002,
592.9083316144141
],
"spans": [
[
2,
3
]
],
"text": "nothing",
"type": "body",
"col": 3,
"col-header": false,
"col-span": [
3,
4
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
}
],
[
{
"bbox": null,
"spans": [
[
3,
0
]
],
"text": "",
"type": "body"
},
{
"bbox": [
172.2689900422697,
638.8430233885732,
186.93018846286373,
719.6162777831045
],
"spans": [
[
3,
1
]
],
"text": "this is row 0",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 3,
"row-header": false,
"row-span": [
3,
4
]
},
{
"bbox": [
240.68788248006402,
647.4098827174411,
255.34907835895044,
712.2732555201754
],
"spans": [
[
3,
2
]
],
"text": "and row 1",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 3,
"row-header": false,
"row-span": [
3,
4
]
},
{
"bbox": [
313.9938353514431,
633.9476737903873,
327.43326861374595,
725.735464724632
],
"spans": [
[
3,
3
]
],
"text": "and last row 2",
"type": "body",
"col": 3,
"col-header": false,
"col-span": [
3,
4
],
"row": 3,
"row-header": false,
"row-span": [
3,
4
]
}
]
],
"model": null,
"bounding-box": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 595.201171875,
"height": 842.0,
"page": 1,
"width": 841.9216918945312
"width": 595.0
}
],
"page-footers": [],

View File

@ -1,3 +1,5 @@
Docling bundles PDF document conversion to
JSON and Markdown in an easy self contained package
| Column 2 | and | have | inside |
|------------|---------------|-----------|----------------|
| Column 1 | have content | other | |
| Column 0 | some cells | | nothing |
| | this is row 0 | and row 1 | and last row 2 |

View File

@ -1 +1,5 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
| | Column 0 | Column 1 | Column 2 |
|----------------|------------|--------------|------------|
| this is row 0 | some cells | have content | and |
| and row 1 | | other | have |
| and last row 2 | nothing | | inside |

View File

@ -1,5 +1,5 @@
package
JSON and Markdown in an easy self contained
Docling bundles PDF document conversion to
| inside | | nothing | and last row 2 |
|----------|--------------|------------|------------------|
| have | other | | and row 1 |
| and | have content | some cells | this is row 0 |
| Column 2 | Column 1 | Column 0 | |

View File

@ -1 +1,5 @@
package
| and last row 2 | and row 1 | this is row 0 | |
|------------------|-------------|-----------------|----------|
| nothing | | some cells | Column 0 |
| | other | have content | Column 1 |
| inside | have | and | Column 2 |

View File

@ -1,3 +1,5 @@
Docling bundles PDF document conversion to
JSON and Markdown in an easy self contained package
| Column 2 | and | have | inside |
|------------|---------------|-----------|----------------|
| Column 1 | have content | other | |
| Column 0 | some cells | | nothing |
| | this is row 0 | and row 1 | and last row 2 |

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.