diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 6005c7d2..f9127307 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -1,8 +1,8 @@ import copy import logging import warnings -from copy import deepcopy from collections.abc import Iterable +from copy import deepcopy from pathlib import Path from typing import Optional @@ -19,7 +19,7 @@ from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device from docling.utils.layout_postprocessor import LayoutPostprocessor -from docling.utils.orientation import detect_orientation +from docling.utils.orientation import detect_orientation, rotate_bounding_box from docling.utils.profiling import TimeRecorder from docling.utils.visualization import draw_clusters @@ -105,7 +105,6 @@ class LayoutModel(BasePageModel): self, conv_res, page, - page_orientation: int, clusters, mode_prefix: str, show: bool = False, @@ -119,10 +118,6 @@ class LayoutModel(BasePageModel): page_image = deepcopy(page.image) scale_x = page_image.width / page.size.width scale_y = page_image.height / page.size.height - if page_orientation: - page_image = page_image.rotate(-page_orientation, expand=True) - if abs(page_orientation) in [90, 270]: - scale_x, scale_y = scale_y, scale_x # Filter clusters for left and right images exclude_labels = { DocItemLabel.FORM, @@ -138,9 +133,6 @@ class LayoutModel(BasePageModel): # Draw clusters on both images draw_clusters(left_image, left_clusters, scale_x, scale_y) draw_clusters(right_image, right_clusters, scale_x, scale_y) - if page_orientation: - left_image = left_image.rotate(page_orientation, expand=True) - right_image = right_image.rotate(page_orientation, expand=True) # Combine the images side by side combined_width = left_image.width * 2 combined_height = left_image.height @@ -183,11 +175,16 @@ class LayoutModel(BasePageModel): 
.replace(" ", "_") .replace("-", "_") ) # Temporary, until docling-ibm-model uses docling-core types + bbox = BoundingBox.model_validate(pred_item) + if page_orientation: + bbox = rotate_bounding_box( + bbox, page_orientation, page_image.size + ).to_bounding_box() cluster = Cluster( id=ix, label=label, confidence=pred_item["confidence"], - bbox=BoundingBox.model_validate(pred_item), + bbox=bbox, cells=[], ) clusters.append(cluster) @@ -196,7 +193,6 @@ class LayoutModel(BasePageModel): self.draw_clusters_and_cells_side_by_side( conv_res, page, - page_orientation, clusters, mode_prefix="raw", ) @@ -234,7 +230,6 @@ class LayoutModel(BasePageModel): self.draw_clusters_and_cells_side_by_side( conv_res, page, - page_orientation, processed_clusters, mode_prefix="postprocessed", ) diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index f5f2cb14..05153ff9 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,8 +1,7 @@ import copy import warnings -from collections.abc import Iterable from pathlib import Path -from typing import Optional +from typing import Iterable, Optional, Tuple, cast import numpy from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell @@ -11,6 +10,7 @@ from docling_core.types.doc.page import ( TextCellUnit, ) from PIL import ImageDraw +from PIL.Image import Image from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import Page, Table, TableStructurePrediction @@ -23,6 +23,7 @@ from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device +from docling.utils.orientation import detect_orientation, rotate_bounding_box from docling.utils.profiling import TimeRecorder @@ -30,6 +31,8 @@ class 
TableStructureModel(BasePageModel): _model_repo_folder = "ds4sd--docling-models" _model_path = "model_artifacts/tableformer" + _table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX} + def __init__( self, enabled: bool, @@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel): page.predictions.tablestructure = ( TableStructurePrediction() ) # dummy - - in_tables = [ - ( - cluster, - [ - round(cluster.bbox.l) * self.scale, - round(cluster.bbox.t) * self.scale, - round(cluster.bbox.r) * self.scale, - round(cluster.bbox.b) * self.scale, - ], - ) + cells_orientation = detect_orientation(page.cells) + # Keep only table bboxes + in_tables_clusters = [ + cluster for cluster in page.predictions.layout.clusters - if cluster.label - in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX] + if cluster.label in self._table_labels ] - if not len(in_tables): + + if not len(in_tables_clusters): yield page continue - + # Rotate and scale table image + page_im = cast(Image, page.get_image()) + scaled_page_im: Image = cast( + Image, page.get_image(scale=self.scale) + ) + if cells_orientation: + scaled_page_im = scaled_page_im.rotate( + -cells_orientation, expand=True + ) page_input = { - "width": page.size.width * self.scale, - "height": page.size.height * self.scale, - "image": numpy.asarray(page.get_image(scale=self.scale)), + "width": scaled_page_im.size[0], + "height": scaled_page_im.size[1], + "image": numpy.asarray(scaled_page_im), } - + # Rotate and scale table cells + in_tables = [ + ( + c, + [ + round(x) * self.scale + for x in _rotate_bbox( + c.bbox, + orientation=-cells_orientation, + im_size=page_im.size, + ) + .to_top_left_origin(page_im.size[1]) + .as_tuple() + ], + ) + for c in in_tables_clusters + ] table_clusters, table_bboxes = zip(*in_tables) if len(table_bboxes): @@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel): scale=self.scale ) ) + new_bbox = _rotate_bbox( + new_cell.to_bounding_box(), + orientation=-cells_orientation, + 
im_size=scaled_page_im.size, + ).model_dump() tokens.append( { "id": new_cell.index, "text": new_cell.text, - "bbox": new_cell.rect.to_bounding_box().model_dump(), + "bbox": new_bbox, } ) page_input["tokens"] = tokens @@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel): ) yield page + + +def _rotate_bbox( + bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int] +) -> BoundingBox: + if orientation: + return rotate_bounding_box(bbox, orientation, im_size).to_bounding_box() + return bbox diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index ab644f3b..0f9ce201 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -27,7 +27,6 @@ from docling.utils.ocr_utils import ( parse_tesseract_orientation, tesseract_box_to_bounding_rectangle, ) -from docling.utils.orientation import Box from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/utils/ocr_utils.py b/docling/utils/ocr_utils.py index a3e3092e..bf7b510d 100644 --- a/docling/utils/ocr_utils.py +++ b/docling/utils/ocr_utils.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle @@ -43,7 +43,9 @@ def tesseract_box_to_bounding_rectangle( orientation: int, im_size: Tuple[int, int], ) -> BoundingRectangle: - # box is in the top, left, height, width format, top left coordinates + # bbox is in the top, left, height, width format, top left coordinates + # Tesseract ran on the document rotated by minus orientation, so we have + # to rotate the detected box by the orientation angle rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size) rect = BoundingRectangle( r_x0=rect.r_x0 / scale, @@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle( r_y2=rect.r_y2 / scale, r_x3=rect.r_x3 / scale, r_y3=rect.r_y3 / scale, - 
coord_origin=CoordOrigin.TOPLEFT, + coord_origin=rect.coord_origin, ) if original_offset is not None: if original_offset.coord_origin is not CoordOrigin.TOPLEFT: diff --git a/docling/utils/orientation.py b/docling/utils/orientation.py index f9c30096..eb118d13 100644 --- a/docling/utils/orientation.py +++ b/docling/utils/orientation.py @@ -1,13 +1,15 @@ from collections import Counter from operator import itemgetter +from typing import Tuple -from docling_core.types.doc.page import TextCell +from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.doc.page import BoundingRectangle, TextCell -_ORIENTATIONS = [0, 90, 180, 270] +CLIPPED_ORIENTATIONS = [0, 90, 180, 270] def _clipped_orientation(angle: float) -> int: - return min((abs(angle - o) % 360, o) for o in _ORIENTATIONS)[1] + return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1] def detect_orientation(cells: list[TextCell]) -> int: @@ -15,12 +17,6 @@ def detect_orientation(cells: list[TextCell]) -> int: return 0 orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells) return max(orientation_counter.items(), key=itemgetter(1))[0] -from typing import Tuple - -from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import BoundingRectangle - -CLIPPED_ORIENTATIONS = [0, 90, 180, 270] def rotate_bounding_box( diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt index b00cc668..927ba0f2 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.doctags.txt @@ -1,3 +1,9 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package + + +Column 0Column 1Column 2 +this is row 0some cellshave contentand +and row 1otherhave +and last row 2nothinginside +
\ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test.md index 42896546..c466de2b 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.md +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.md @@ -1 +1,5 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file +| | Column 0 | Column 1 | Column 2 | +|----------------|------------|--------------|------------| +| this is row 0 | some cells | have content | and | +| and row 1 | | other | have | +| and last row 2 | nothing | | inside | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt deleted file mode 100644 index 0b7a3a14..00000000 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt +++ /dev/null @@ -1,3 +0,0 @@ - -package - \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json deleted file mode 100644 index 128a8527..00000000 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json +++ /dev/null @@ -1 +0,0 @@ -{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": 
null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md deleted file mode 100644 index 597acc76..00000000 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md +++ /dev/null @@ -1 +0,0 @@ -package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json deleted file mode 100644 index fdc46eda..00000000 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.pages.json +++ /dev/null @@ -1 +0,0 @@ -[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": 
{"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, 
"page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}] \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt index 3322c749..0424fbee 100644 --- 
a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.doctags.txt @@ -1,5 +1,9 @@ -package -JSON and Markdown in an easy self contained -Docling bundles PDF document conversion to + + +insidenothingand last row 2 +haveotherand row 1 +andhave contentsome cellsthis is row 0 +Column 2Column 1Column 0 +
\ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md index 120ab1cc..8521b3f9 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.md @@ -1,5 +1,5 @@ -package - -JSON and Markdown in an easy self contained - -Docling bundles PDF document conversion to \ No newline at end of file +| inside | | nothing | and last row 2 | +|----------|--------------|------------|------------------| +| have | other | | and row 1 | +| and | have content | some cells | this is row 0 | +| Column 2 | Column 1 | Column 0 | | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt index 8350737b..7ba27bf2 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.doctags.txt @@ -1,3 +1,9 @@ -package + + +and last row 2and row 1this is row 0 +nothingsome cellsColumn 0 +otherhave contentColumn 1 +insidehaveandColumn 2 +
\ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md index 597acc76..f423a6c2 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.md @@ -1 +1,5 @@ -package \ No newline at end of file +| and last row 2 | and row 1 | this is row 0 | | +|------------------|-------------|-----------------|----------| +| nothing | | some cells | Column 0 | +| | other | have content | Column 1 | +| inside | have | and | Column 2 | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt index c1068b56..5a2c9878 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.doctags.txt @@ -1,4 +1,9 @@ -Docling bundles PDF document conversion to -JSON and Markdown in an easy self contained package + + +Column 2andhaveinside +Column 1have contentother +Column 0some cellsnothing +this is row 0and row 1and last row 2 +
\ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json index 5a622c92..648e8fe1 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.json @@ -27,53 +27,468 @@ "file-info": { "filename": "ocr_test_rotated_90.pdf", "filename-prov": null, - "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", + "document-hash": "2fb20caf4f54c878a0b454b496010d92adc6ae1b7f10fbd9ba1ba26260f818a8", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [ { - "hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", + "hash": "56c847ad7c5ab9f0346a325510af001ab66a9bb45f65ffc7bbfc60c929def7d2", "model": "default", "page": 1 } ] }, "main-text": [ + { + "name": "Table", + "type": "table", + "$ref": "#/tables/0" + } + ], + "figures": [], + "tables": [ { "prov": [ { "bbox": [ - 131.21306574279092, - 74.12495603322407, - 152.19606490864376, - 154.19400205373182 + 75.13359832763672, + 102.99908447265625, + 361.18695068359375, + 562.1403198242188 ], "page": 1, "span": [ 0, - 7 + 0 ], "__ref_s3_data": null } ], - "text": "package", - "type": "paragraph", + "text": "", + "type": "table", "payload": null, - "name": "Text", - "font": null + "#-cols": 4, + "#-rows": 4, + "data": [ + [ + { + "bbox": [ + 105.0718660651769, + 304.7354643560275, + 119.73306194406335, + 369.59883715876185 + ], + "spans": [ + [ + 0, + 0 + ] + ], + "text": "Column 2", + "type": "body", + "col": 0, + "col-header": false, + "col-span": [ + 0, + 1 + ], + "row": 0, + "row-header": false, + "row-span": [ + 0, + 1 + ] + }, + { + "bbox": [ + 172.26899264661517, + 324.3168597625203, + 188.15195177751215, + 352.46511670018316 + ], + "spans": [ + [ + 0, + 1 + ] + ], + "text": "and", + "type": "body", + "col": 1, + "col-header": false, + "col-span": [ + 1, 
+ 2 + ], + "row": 0, + "row-header": false, + "row-span": [ + 0, + 1 + ] + }, + { + "bbox": [ + 240.68788382926402, + 321.869185135892, + 256.570842960161, + 356.13662847492196 + ], + "spans": [ + [ + 0, + 2 + ] + ], + "text": "have", + "type": "body", + "col": 2, + "col-header": false, + "col-span": [ + 2, + 3 + ], + "row": 0, + "row-header": false, + "row-span": [ + 0, + 1 + ] + }, + { + "bbox": [ + 312.772072637728, + 319.42151173034614, + 326.21150018118874, + 359.8081389276117 + ], + "spans": [ + [ + 0, + 3 + ] + ], + "text": "inside", + "type": "body", + "col": 3, + "col-header": false, + "col-span": [ + 3, + 4 + ], + "row": 0, + "row-header": false, + "row-span": [ + 0, + 1 + ] + } + ], + [ + { + "bbox": [ + 105.0718660651769, + 419.77616156495424, + 119.73306194406335, + 483.4156981046677 + ], + "spans": [ + [ + 1, + 0 + ] + ], + "text": "Column 1", + "type": "body", + "col": 0, + "col-header": false, + "col-span": [ + 0, + 1 + ], + "row": 1, + "row-header": false, + "row-span": [ + 1, + 2 + ] + }, + { + "bbox": [ + 172.26898999097682, + 408.7616301134671, + 185.70842261785268, + 495.6540658231026 + ], + "spans": [ + [ + 1, + 1 + ] + ], + "text": "have content", + "type": "body", + "col": 1, + "col-header": false, + "col-span": [ + 1, + 2 + ], + "row": 1, + "row-header": false, + "row-span": [ + 1, + 2 + ] + }, + { + "bbox": [ + 240.68788377535307, + 433.23837164942523, + 255.34907711253194, + 468.729651251476 + ], + "spans": [ + [ + 1, + 2 + ] + ], + "text": "other", + "type": "body", + "col": 2, + "col-header": false, + "col-span": [ + 2, + 3 + ], + "row": 1, + "row-header": false, + "row-span": [ + 1, + 2 + ] + }, + { + "bbox": null, + "spans": [ + [ + 1, + 3 + ] + ], + "text": "", + "type": "body" + } + ], + [ + { + "bbox": [ + 105.07186605295925, + 532.3691850430223, + 119.73306193184567, + 597.2325578457567 + ], + "spans": [ + [ + 2, + 0 + ] + ], + "text": "Column 0", + "type": "body", + "col": 0, + "col-header": false, + "col-span": [ + 0, + 1 + ], + 
"row": 2, + "row-header": false, + "row-span": [ + 2, + 3 + ] + }, + { + "bbox": [ + 172.26899069197702, + 529.9215107729757, + 186.93018720629036, + 600.9040699770771 + ], + "spans": [ + [ + 2, + 1 + ] + ], + "text": "some cells", + "type": "body", + "col": 1, + "col-header": false, + "col-span": [ + 1, + 2 + ], + "row": 2, + "row-header": false, + "row-span": [ + 2, + 3 + ] + }, + { + "bbox": null, + "spans": [ + [ + 2, + 2 + ] + ], + "text": "", + "type": "body" + }, + { + "bbox": [ + 311.49999737299976, + 536.775000315586, + 332.5000022770002, + 592.9083316144141 + ], + "spans": [ + [ + 2, + 3 + ] + ], + "text": "nothing", + "type": "body", + "col": 3, + "col-header": false, + "col-span": [ + 3, + 4 + ], + "row": 2, + "row-header": false, + "row-span": [ + 2, + 3 + ] + } + ], + [ + { + "bbox": null, + "spans": [ + [ + 3, + 0 + ] + ], + "text": "", + "type": "body" + }, + { + "bbox": [ + 172.2689900422697, + 638.8430233885732, + 186.93018846286373, + 719.6162777831045 + ], + "spans": [ + [ + 3, + 1 + ] + ], + "text": "this is row 0", + "type": "body", + "col": 1, + "col-header": false, + "col-span": [ + 1, + 2 + ], + "row": 3, + "row-header": false, + "row-span": [ + 3, + 4 + ] + }, + { + "bbox": [ + 240.68788248006402, + 647.4098827174411, + 255.34907835895044, + 712.2732555201754 + ], + "spans": [ + [ + 3, + 2 + ] + ], + "text": "and row 1", + "type": "body", + "col": 2, + "col-header": false, + "col-span": [ + 2, + 3 + ], + "row": 3, + "row-header": false, + "row-span": [ + 3, + 4 + ] + }, + { + "bbox": [ + 313.9938353514431, + 633.9476737903873, + 327.43326861374595, + 725.735464724632 + ], + "spans": [ + [ + 3, + 3 + ] + ], + "text": "and last row 2", + "type": "body", + "col": 3, + "col-header": false, + "col-span": [ + 3, + 4 + ], + "row": 3, + "row-header": false, + "row-span": [ + 3, + 4 + ] + } + ] + ], + "model": null, + "bounding-box": null } ], - "figures": [], - "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [ { 
- "height": 595.201171875, + "height": 842.0, "page": 1, - "width": 841.9216918945312 + "width": 595.0 } ], "page-footers": [], diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md index 8d77a437..a45b3c36 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.md @@ -1,3 +1,5 @@ -Docling bundles PDF document conversion to - -JSON and Markdown in an easy self contained package \ No newline at end of file +| Column 2 | and | have | inside | +|------------|---------------|-----------|----------------| +| Column 1 | have content | other | | +| Column 0 | some cells | | nothing | +| | this is row 0 | and row 1 | and last row 2 | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test.md index 42896546..c466de2b 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.md +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.md @@ -1 +1,5 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file +| | Column 0 | Column 1 | Column 2 | +|----------------|------------|--------------|------------| +| this is row 0 | some cells | have content | and | +| and row 1 | | other | have | +| and last row 2 | nothing | | inside | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md index 120ab1cc..8521b3f9 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.md @@ -1,5 +1,5 @@ -package - -JSON and Markdown in an easy self contained - -Docling bundles PDF document conversion to \ No newline at end of file +| inside | | nothing 
| and last row 2 | +|----------|--------------|------------|------------------| +| have | other | | and row 1 | +| and | have content | some cells | this is row 0 | +| Column 2 | Column 1 | Column 0 | | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md index 597acc76..f423a6c2 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.md @@ -1 +1,5 @@ -package \ No newline at end of file +| and last row 2 | and row 1 | this is row 0 | | +|------------------|-------------|-----------------|----------| +| nothing | | some cells | Column 0 | +| | other | have content | Column 1 | +| inside | have | and | Column 2 | \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md index 8d77a437..a45b3c36 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.md @@ -1,3 +1,5 @@ -Docling bundles PDF document conversion to - -JSON and Markdown in an easy self contained package \ No newline at end of file +| Column 2 | and | have | inside | +|------------|---------------|-----------|----------------| +| Column 1 | have content | other | | +| Column 0 | some cells | | nothing | +| | this is row 0 | and row 1 | and last row 2 | \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf index b79f3c28..d7f83728 100644 Binary files a/tests/data_scanned/ocr_test.pdf and b/tests/data_scanned/ocr_test.pdf differ diff --git a/tests/data_scanned/ocr_test_rotated_180.pdf b/tests/data_scanned/ocr_test_rotated_180.pdf index 1c030b49..22529b46 100644 Binary files a/tests/data_scanned/ocr_test_rotated_180.pdf and 
b/tests/data_scanned/ocr_test_rotated_180.pdf differ diff --git a/tests/data_scanned/ocr_test_rotated_270.pdf b/tests/data_scanned/ocr_test_rotated_270.pdf index a6e90baf..ccf3c612 100644 Binary files a/tests/data_scanned/ocr_test_rotated_270.pdf and b/tests/data_scanned/ocr_test_rotated_270.pdf differ diff --git a/tests/data_scanned/ocr_test_rotated_90.pdf b/tests/data_scanned/ocr_test_rotated_90.pdf index bd08daae..3aa4904b 100644 Binary files a/tests/data_scanned/ocr_test_rotated_90.pdf and b/tests/data_scanned/ocr_test_rotated_90.pdf differ