diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index 6005c7d2..f9127307 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -1,8 +1,8 @@
import copy
import logging
import warnings
-from copy import deepcopy
from collections.abc import Iterable
+from copy import deepcopy
from pathlib import Path
from typing import Optional
@@ -19,7 +19,7 @@ from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
-from docling.utils.orientation import detect_orientation
+from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder
from docling.utils.visualization import draw_clusters
@@ -105,7 +105,6 @@ class LayoutModel(BasePageModel):
self,
conv_res,
page,
- page_orientation: int,
clusters,
mode_prefix: str,
show: bool = False,
@@ -119,10 +118,6 @@ class LayoutModel(BasePageModel):
page_image = deepcopy(page.image)
scale_x = page_image.width / page.size.width
scale_y = page_image.height / page.size.height
- if page_orientation:
- page_image = page_image.rotate(-page_orientation, expand=True)
- if abs(page_orientation) in [90, 270]:
- scale_x, scale_y = scale_y, scale_x
# Filter clusters for left and right images
exclude_labels = {
DocItemLabel.FORM,
@@ -138,9 +133,6 @@ class LayoutModel(BasePageModel):
# Draw clusters on both images
draw_clusters(left_image, left_clusters, scale_x, scale_y)
draw_clusters(right_image, right_clusters, scale_x, scale_y)
- if page_orientation:
- left_image = left_image.rotate(page_orientation, expand=True)
- right_image = right_image.rotate(page_orientation, expand=True)
# Combine the images side by side
combined_width = left_image.width * 2
combined_height = left_image.height
@@ -183,11 +175,16 @@ class LayoutModel(BasePageModel):
.replace(" ", "_")
.replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
+ bbox = BoundingBox.model_validate(pred_item)
+ if page_orientation:
+ bbox = rotate_bounding_box(
+ bbox, page_orientation, page_image.size
+ ).to_bounding_box()
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
- bbox=BoundingBox.model_validate(pred_item),
+ bbox=bbox,
cells=[],
)
clusters.append(cluster)
@@ -196,7 +193,6 @@ class LayoutModel(BasePageModel):
self.draw_clusters_and_cells_side_by_side(
conv_res,
page,
- page_orientation,
clusters,
mode_prefix="raw",
)
@@ -234,7 +230,6 @@ class LayoutModel(BasePageModel):
self.draw_clusters_and_cells_side_by_side(
conv_res,
page,
- page_orientation,
processed_clusters,
mode_prefix="postprocessed",
)
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index f5f2cb14..05153ff9 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -1,8 +1,7 @@
import copy
import warnings
-from collections.abc import Iterable
from pathlib import Path
-from typing import Optional
+from typing import Iterable, Optional, Tuple, cast
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
TextCellUnit,
)
from PIL import ImageDraw
+from PIL.Image import Image
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
+from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder
@@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"
+ _table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
+
def __init__(
self,
enabled: bool,
@@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
page.predictions.tablestructure = (
TableStructurePrediction()
) # dummy
-
- in_tables = [
- (
- cluster,
- [
- round(cluster.bbox.l) * self.scale,
- round(cluster.bbox.t) * self.scale,
- round(cluster.bbox.r) * self.scale,
- round(cluster.bbox.b) * self.scale,
- ],
- )
+ cells_orientation = detect_orientation(page.cells)
+# Keep only table clusters
+ in_tables_clusters = [
+ cluster
for cluster in page.predictions.layout.clusters
- if cluster.label
- in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
+ if cluster.label in self._table_labels
]
- if not len(in_tables):
+
+ if not len(in_tables_clusters):
yield page
continue
-
+# Scale and rotate the page image
+ page_im = cast(Image, page.get_image())
+ scaled_page_im: Image = cast(
+ Image, page.get_image(scale=self.scale)
+ )
+ if cells_orientation:
+ scaled_page_im = scaled_page_im.rotate(
+ -cells_orientation, expand=True
+ )
page_input = {
- "width": page.size.width * self.scale,
- "height": page.size.height * self.scale,
- "image": numpy.asarray(page.get_image(scale=self.scale)),
+ "width": scaled_page_im.size[0],
+ "height": scaled_page_im.size[1],
+ "image": numpy.asarray(scaled_page_im),
}
-
+# Rotate and scale the table cluster bounding boxes
+ in_tables = [
+ (
+ c,
+ [
+ round(x) * self.scale
+ for x in _rotate_bbox(
+ c.bbox,
+ orientation=-cells_orientation,
+ im_size=page_im.size,
+ )
+ .to_top_left_origin(page_im.size[1])
+ .as_tuple()
+ ],
+ )
+ for c in in_tables_clusters
+ ]
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
@@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
scale=self.scale
)
)
+ new_bbox = _rotate_bbox(
+ new_cell.to_bounding_box(),
+ orientation=-cells_orientation,
+ im_size=scaled_page_im.size,
+ ).model_dump()
tokens.append(
{
"id": new_cell.index,
"text": new_cell.text,
- "bbox": new_cell.rect.to_bounding_box().model_dump(),
+ "bbox": new_bbox,
}
)
page_input["tokens"] = tokens
@@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
)
yield page
+
+
+def _rotate_bbox(
+ bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
+) -> BoundingBox:
+ if orientation:
+ return rotate_bounding_box(bbox, orientation, im_size).to_bounding_box()
+ return bbox
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index ab644f3b..0f9ce201 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -27,7 +27,6 @@ from docling.utils.ocr_utils import (
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
-from docling.utils.orientation import Box
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/utils/ocr_utils.py b/docling/utils/ocr_utils.py
index a3e3092e..bf7b510d 100644
--- a/docling/utils/ocr_utils.py
+++ b/docling/utils/ocr_utils.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
@@ -43,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
orientation: int,
im_size: Tuple[int, int],
) -> BoundingRectangle:
- # box is in the top, left, height, width format, top left coordinates
+# bbox is in the top, left, height, width format, with top-left coordinates.
+# Tesseract ran on the page rotated by minus the orientation angle, so the box
+# must be rotated by the orientation angle to undo that rotation.
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
rect = BoundingRectangle(
r_x0=rect.r_x0 / scale,
@@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
r_y2=rect.r_y2 / scale,
r_x3=rect.r_x3 / scale,
r_y3=rect.r_y3 / scale,
- coord_origin=CoordOrigin.TOPLEFT,
+ coord_origin=rect.coord_origin,
)
if original_offset is not None:
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
diff --git a/docling/utils/orientation.py b/docling/utils/orientation.py
index f9c30096..eb118d13 100644
--- a/docling/utils/orientation.py
+++ b/docling/utils/orientation.py
@@ -1,13 +1,15 @@
from collections import Counter
from operator import itemgetter
+from typing import Tuple
-from docling_core.types.doc.page import TextCell
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
-_ORIENTATIONS = [0, 90, 180, 270]
+CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def _clipped_orientation(angle: float) -> int:
- return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1]
+ return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
def detect_orientation(cells: list[TextCell]) -> int:
@@ -15,12 +17,6 @@ def detect_orientation(cells: list[TextCell]) -> int:
return 0
orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
return max(orientation_counter.items(), key=itemgetter(1))[0]
-from typing import Tuple
-
-from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle
-
-CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def rotate_bounding_box(
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt
index b00cc668..927ba0f2 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt
@@ -1,3 +1,9 @@
-Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
+
+
+Column 0Column 1Column 2
+this is row 0some cellshave contentand
+and row 1otherhave
+and last row 2nothinginside
+
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test.md
index 42896546..c466de2b 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.md
@@ -1 +1,5 @@
-Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
\ No newline at end of file
+| | Column 0 | Column 1 | Column 2 |
+|----------------|------------|--------------|------------|
+| this is row 0 | some cells | have content | and |
+| and row 1 | | other | have |
+| and last row 2 | nothing | | inside |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt
deleted file mode 100644
index 0b7a3a14..00000000
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-
-package
-
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json
deleted file mode 100644
index 128a8527..00000000
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json
+++ /dev/null
@@ -1 +0,0 @@
-{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md
deleted file mode 100644
index 597acc76..00000000
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md
+++ /dev/null
@@ -1 +0,0 @@
-package
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json
deleted file mode 100644
index fdc46eda..00000000
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
index 3322c749..0424fbee 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt
@@ -1,5 +1,9 @@
-package
-JSON and Markdown in an easy self contained
-Docling bundles PDF document conversion to
+
+
+insidenothingand last row 2
+haveotherand row 1
+andhave contentsome cellsthis is row 0
+Column 2Column 1Column 0
+
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md
index 120ab1cc..8521b3f9 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md
@@ -1,5 +1,5 @@
-package
-
-JSON and Markdown in an easy self contained
-
-Docling bundles PDF document conversion to
\ No newline at end of file
+| inside | | nothing | and last row 2 |
+|----------|--------------|------------|------------------|
+| have | other | | and row 1 |
+| and | have content | some cells | this is row 0 |
+| Column 2 | Column 1 | Column 0 | |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
index 8350737b..7ba27bf2 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt
@@ -1,3 +1,9 @@
-package
+
+
+and last row 2and row 1this is row 0
+nothingsome cellsColumn 0
+otherhave contentColumn 1
+insidehaveandColumn 2
+
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md
index 597acc76..f423a6c2 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md
@@ -1 +1,5 @@
-package
\ No newline at end of file
+| and last row 2 | and row 1 | this is row 0 | |
+|------------------|-------------|-----------------|----------|
+| nothing | | some cells | Column 0 |
+| | other | have content | Column 1 |
+| inside | have | and | Column 2 |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
index c1068b56..5a2c9878 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt
@@ -1,4 +1,9 @@
-Docling bundles PDF document conversion to
-JSON and Markdown in an easy self contained package
+
+
+Column 2andhaveinside
+Column 1have contentother
+Column 0some cellsnothing
+this is row 0and row 1and last row 2
+
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
index 5a622c92..648e8fe1 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json
@@ -27,53 +27,468 @@
"file-info": {
"filename": "ocr_test_rotated_90.pdf",
"filename-prov": null,
- "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6",
+ "document-hash": "2fb20caf4f54c878a0b454b496010d92adc6ae1b7f10fbd9ba1ba26260f818a8",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
- "hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3",
+ "hash": "56c847ad7c5ab9f0346a325510af001ab66a9bb45f65ffc7bbfc60c929def7d2",
"model": "default",
"page": 1
}
]
},
"main-text": [
+ {
+ "name": "Table",
+ "type": "table",
+ "$ref": "#/tables/0"
+ }
+ ],
+ "figures": [],
+ "tables": [
{
"prov": [
{
"bbox": [
- 131.21306574279092,
- 74.12495603322407,
- 152.19606490864376,
- 154.19400205373182
+ 75.13359832763672,
+ 102.99908447265625,
+ 361.18695068359375,
+ 562.1403198242188
],
"page": 1,
"span": [
0,
- 7
+ 0
],
"__ref_s3_data": null
}
],
- "text": "package",
- "type": "paragraph",
+ "text": "",
+ "type": "table",
"payload": null,
- "name": "Text",
- "font": null
+ "#-cols": 4,
+ "#-rows": 4,
+ "data": [
+ [
+ {
+ "bbox": [
+ 105.0718660651769,
+ 304.7354643560275,
+ 119.73306194406335,
+ 369.59883715876185
+ ],
+ "spans": [
+ [
+ 0,
+ 0
+ ]
+ ],
+ "text": "Column 2",
+ "type": "body",
+ "col": 0,
+ "col-header": false,
+ "col-span": [
+ 0,
+ 1
+ ],
+ "row": 0,
+ "row-header": false,
+ "row-span": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "bbox": [
+ 172.26899264661517,
+ 324.3168597625203,
+ 188.15195177751215,
+ 352.46511670018316
+ ],
+ "spans": [
+ [
+ 0,
+ 1
+ ]
+ ],
+ "text": "and",
+ "type": "body",
+ "col": 1,
+ "col-header": false,
+ "col-span": [
+ 1,
+ 2
+ ],
+ "row": 0,
+ "row-header": false,
+ "row-span": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "bbox": [
+ 240.68788382926402,
+ 321.869185135892,
+ 256.570842960161,
+ 356.13662847492196
+ ],
+ "spans": [
+ [
+ 0,
+ 2
+ ]
+ ],
+ "text": "have",
+ "type": "body",
+ "col": 2,
+ "col-header": false,
+ "col-span": [
+ 2,
+ 3
+ ],
+ "row": 0,
+ "row-header": false,
+ "row-span": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "bbox": [
+ 312.772072637728,
+ 319.42151173034614,
+ 326.21150018118874,
+ 359.8081389276117
+ ],
+ "spans": [
+ [
+ 0,
+ 3
+ ]
+ ],
+ "text": "inside",
+ "type": "body",
+ "col": 3,
+ "col-header": false,
+ "col-span": [
+ 3,
+ 4
+ ],
+ "row": 0,
+ "row-header": false,
+ "row-span": [
+ 0,
+ 1
+ ]
+ }
+ ],
+ [
+ {
+ "bbox": [
+ 105.0718660651769,
+ 419.77616156495424,
+ 119.73306194406335,
+ 483.4156981046677
+ ],
+ "spans": [
+ [
+ 1,
+ 0
+ ]
+ ],
+ "text": "Column 1",
+ "type": "body",
+ "col": 0,
+ "col-header": false,
+ "col-span": [
+ 0,
+ 1
+ ],
+ "row": 1,
+ "row-header": false,
+ "row-span": [
+ 1,
+ 2
+ ]
+ },
+ {
+ "bbox": [
+ 172.26898999097682,
+ 408.7616301134671,
+ 185.70842261785268,
+ 495.6540658231026
+ ],
+ "spans": [
+ [
+ 1,
+ 1
+ ]
+ ],
+ "text": "have content",
+ "type": "body",
+ "col": 1,
+ "col-header": false,
+ "col-span": [
+ 1,
+ 2
+ ],
+ "row": 1,
+ "row-header": false,
+ "row-span": [
+ 1,
+ 2
+ ]
+ },
+ {
+ "bbox": [
+ 240.68788377535307,
+ 433.23837164942523,
+ 255.34907711253194,
+ 468.729651251476
+ ],
+ "spans": [
+ [
+ 1,
+ 2
+ ]
+ ],
+ "text": "other",
+ "type": "body",
+ "col": 2,
+ "col-header": false,
+ "col-span": [
+ 2,
+ 3
+ ],
+ "row": 1,
+ "row-header": false,
+ "row-span": [
+ 1,
+ 2
+ ]
+ },
+ {
+ "bbox": null,
+ "spans": [
+ [
+ 1,
+ 3
+ ]
+ ],
+ "text": "",
+ "type": "body"
+ }
+ ],
+ [
+ {
+ "bbox": [
+ 105.07186605295925,
+ 532.3691850430223,
+ 119.73306193184567,
+ 597.2325578457567
+ ],
+ "spans": [
+ [
+ 2,
+ 0
+ ]
+ ],
+ "text": "Column 0",
+ "type": "body",
+ "col": 0,
+ "col-header": false,
+ "col-span": [
+ 0,
+ 1
+ ],
+ "row": 2,
+ "row-header": false,
+ "row-span": [
+ 2,
+ 3
+ ]
+ },
+ {
+ "bbox": [
+ 172.26899069197702,
+ 529.9215107729757,
+ 186.93018720629036,
+ 600.9040699770771
+ ],
+ "spans": [
+ [
+ 2,
+ 1
+ ]
+ ],
+ "text": "some cells",
+ "type": "body",
+ "col": 1,
+ "col-header": false,
+ "col-span": [
+ 1,
+ 2
+ ],
+ "row": 2,
+ "row-header": false,
+ "row-span": [
+ 2,
+ 3
+ ]
+ },
+ {
+ "bbox": null,
+ "spans": [
+ [
+ 2,
+ 2
+ ]
+ ],
+ "text": "",
+ "type": "body"
+ },
+ {
+ "bbox": [
+ 311.49999737299976,
+ 536.775000315586,
+ 332.5000022770002,
+ 592.9083316144141
+ ],
+ "spans": [
+ [
+ 2,
+ 3
+ ]
+ ],
+ "text": "nothing",
+ "type": "body",
+ "col": 3,
+ "col-header": false,
+ "col-span": [
+ 3,
+ 4
+ ],
+ "row": 2,
+ "row-header": false,
+ "row-span": [
+ 2,
+ 3
+ ]
+ }
+ ],
+ [
+ {
+ "bbox": null,
+ "spans": [
+ [
+ 3,
+ 0
+ ]
+ ],
+ "text": "",
+ "type": "body"
+ },
+ {
+ "bbox": [
+ 172.2689900422697,
+ 638.8430233885732,
+ 186.93018846286373,
+ 719.6162777831045
+ ],
+ "spans": [
+ [
+ 3,
+ 1
+ ]
+ ],
+ "text": "this is row 0",
+ "type": "body",
+ "col": 1,
+ "col-header": false,
+ "col-span": [
+ 1,
+ 2
+ ],
+ "row": 3,
+ "row-header": false,
+ "row-span": [
+ 3,
+ 4
+ ]
+ },
+ {
+ "bbox": [
+ 240.68788248006402,
+ 647.4098827174411,
+ 255.34907835895044,
+ 712.2732555201754
+ ],
+ "spans": [
+ [
+ 3,
+ 2
+ ]
+ ],
+ "text": "and row 1",
+ "type": "body",
+ "col": 2,
+ "col-header": false,
+ "col-span": [
+ 2,
+ 3
+ ],
+ "row": 3,
+ "row-header": false,
+ "row-span": [
+ 3,
+ 4
+ ]
+ },
+ {
+ "bbox": [
+ 313.9938353514431,
+ 633.9476737903873,
+ 327.43326861374595,
+ 725.735464724632
+ ],
+ "spans": [
+ [
+ 3,
+ 3
+ ]
+ ],
+ "text": "and last row 2",
+ "type": "body",
+ "col": 3,
+ "col-header": false,
+ "col-span": [
+ 3,
+ 4
+ ],
+ "row": 3,
+ "row-header": false,
+ "row-span": [
+ 3,
+ 4
+ ]
+ }
+ ]
+ ],
+ "model": null,
+ "bounding-box": null
}
],
- "figures": [],
- "tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
- "height": 595.201171875,
+ "height": 842.0,
"page": 1,
- "width": 841.9216918945312
+ "width": 595.0
}
],
"page-footers": [],
diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md
index 8d77a437..a45b3c36 100644
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md
@@ -1,3 +1,5 @@
-Docling bundles PDF document conversion to
-
-JSON and Markdown in an easy self contained package
\ No newline at end of file
+| Column 2 | and | have | inside |
+|------------|---------------|-----------|----------------|
+| Column 1 | have content | other | |
+| Column 0 | some cells | | nothing |
+| | this is row 0 | and row 1 | and last row 2 |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test.md
index 42896546..c466de2b 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.md
@@ -1 +1,5 @@
-Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
\ No newline at end of file
+| | Column 0 | Column 1 | Column 2 |
+|----------------|------------|--------------|------------|
+| this is row 0 | some cells | have content | and |
+| and row 1 | | other | have |
+| and last row 2 | nothing | | inside |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md
index 120ab1cc..8521b3f9 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md
@@ -1,5 +1,5 @@
-package
-
-JSON and Markdown in an easy self contained
-
-Docling bundles PDF document conversion to
\ No newline at end of file
+| inside | | nothing | and last row 2 |
+|----------|--------------|------------|------------------|
+| have | other | | and row 1 |
+| and | have content | some cells | this is row 0 |
+| Column 2 | Column 1 | Column 0 | |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md
index 597acc76..f423a6c2 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md
@@ -1 +1,5 @@
-package
\ No newline at end of file
+| and last row 2 | and row 1 | this is row 0 | |
+|------------------|-------------|-----------------|----------|
+| nothing | | some cells | Column 0 |
+| | other | have content | Column 1 |
+| inside | have | and | Column 2 |
\ No newline at end of file
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md
index 8d77a437..a45b3c36 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md
@@ -1,3 +1,5 @@
-Docling bundles PDF document conversion to
-
-JSON and Markdown in an easy self contained package
\ No newline at end of file
+| Column 2 | and | have | inside |
+|------------|---------------|-----------|----------------|
+| Column 1 | have content | other | |
+| Column 0 | some cells | | nothing |
+| | this is row 0 | and row 1 | and last row 2 |
\ No newline at end of file
diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf
index b79f3c28..d7f83728 100644
Binary files a/tests/data_scanned/ocr_test.pdf and b/tests/data_scanned/ocr_test.pdf differ
diff --git a/tests/data_scanned/ocr_test_rotated_180.pdf b/tests/data_scanned/ocr_test_rotated_180.pdf
index 1c030b49..22529b46 100644
Binary files a/tests/data_scanned/ocr_test_rotated_180.pdf and b/tests/data_scanned/ocr_test_rotated_180.pdf differ
diff --git a/tests/data_scanned/ocr_test_rotated_270.pdf b/tests/data_scanned/ocr_test_rotated_270.pdf
index a6e90baf..ccf3c612 100644
Binary files a/tests/data_scanned/ocr_test_rotated_270.pdf and b/tests/data_scanned/ocr_test_rotated_270.pdf differ
diff --git a/tests/data_scanned/ocr_test_rotated_90.pdf b/tests/data_scanned/ocr_test_rotated_90.pdf
index bd08daae..3aa4904b 100644
Binary files a/tests/data_scanned/ocr_test_rotated_90.pdf and b/tests/data_scanned/ocr_test_rotated_90.pdf differ