This commit is contained in:
Clément Doumouro 2025-07-10 04:37:57 +00:00 committed by GitHub
commit fb900115ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
49 changed files with 40355 additions and 2146 deletions

View File

@ -2,6 +2,7 @@ import copy
import logging
import warnings
from collections.abc import Iterable
from copy import deepcopy
from pathlib import Path
from typing import Optional
@ -19,6 +20,7 @@ from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder
from docling.utils.visualization import draw_clusters
@ -157,7 +159,9 @@ class LayoutModel(BasePageModel):
assert page.size is not None
page_image = page.get_image(scale=1.0)
assert page_image is not None
page_orientation = detect_orientation(page.cells)
if page_orientation:
page_image = page_image.rotate(-page_orientation, expand=True)
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page_image)
@ -168,11 +172,16 @@ class LayoutModel(BasePageModel):
.replace(" ", "_")
.replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
bbox = BoundingBox.model_validate(pred_item)
if page_orientation:
bbox = rotate_bounding_box(
bbox, page_orientation, page_image.size
).to_bounding_box()
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
bbox=bbox,
cells=[],
)
clusters.append(cluster)

View File

@ -107,10 +107,10 @@ class OcrMacModel(BaseOcrModel):
x2 = x1 + w * im_width
y1 = y2 - h * im_height
left = x1 / self.scale
top = y1 / self.scale
right = x2 / self.scale
bottom = y2 / self.scale
left = x1 / self.scale + ocr_rect.l
top = y1 / self.scale + ocr_rect.t
right = x2 / self.scale + ocr_rect.l
bottom = y2 / self.scale + ocr_rect.t
cells.append(
TextCell(

View File

@ -1,8 +1,7 @@
import copy
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from typing import Iterable, Optional, Tuple, cast
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
TextCellUnit,
)
from PIL import ImageDraw
from PIL.Image import Image
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder
@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
def __init__(
self,
enabled: bool,
@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
page.predictions.tablestructure = (
TableStructurePrediction()
) # dummy
in_tables = [
(
cluster,
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
cells_orientation = detect_orientation(page.cells)
# Keep only table bboxes
in_tables_clusters = [
cluster
for cluster in page.predictions.layout.clusters
if cluster.label
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
if cluster.label in self._table_labels
]
if not len(in_tables):
if not len(in_tables_clusters):
yield page
continue
# Rotate and scale table image
page_im = cast(Image, page.get_image())
scaled_page_im: Image = cast(
Image, page.get_image(scale=self.scale)
)
if cells_orientation:
scaled_page_im = scaled_page_im.rotate(
-cells_orientation, expand=True
)
page_input = {
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
"image": numpy.asarray(page.get_image(scale=self.scale)),
"width": scaled_page_im.size[0],
"height": scaled_page_im.size[1],
"image": numpy.asarray(scaled_page_im),
}
# Rotate and scale table cells
in_tables = [
(
c,
[
round(x) * self.scale
for x in _rotate_bbox(
c.bbox,
orientation=-cells_orientation,
im_size=page_im.size,
)
.to_top_left_origin(page_im.size[1])
.as_tuple()
],
)
for c in in_tables_clusters
]
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
scale=self.scale
)
)
new_bbox = _rotate_bbox(
new_cell.to_bounding_box(),
orientation=cells_orientation,
im_size=scaled_page_im.size,
).model_dump()
tokens.append(
{
"id": new_cell.index,
"text": new_cell.text,
"bbox": new_cell.rect.to_bounding_box().model_dump(),
"bbox": new_bbox,
}
)
page_input["tokens"] = tokens
@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
)
yield page
def _rotate_bbox(
    bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
) -> BoundingBox:
    """Rotate ``bbox`` by ``orientation``, passing it through when there is none.

    Args:
        bbox: Bounding box to rotate.
        orientation: Rotation angle forwarded to ``rotate_bounding_box``; a
            falsy value (``0``) means "no rotation".
        im_size: Image size forwarded to ``rotate_bounding_box``.

    Returns:
        ``bbox`` itself when ``orientation`` is falsy, otherwise the rotated
        rectangle converted back with ``to_bounding_box()``.
    """
    # Guard clause: nothing to do for an upright page.
    if not orientation:
        return bbox
    rotated = rotate_bounding_box(bbox, orientation, im_size)
    return rotated.to_bounding_box()

View File

@ -3,7 +3,10 @@ from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
from docling.utils.orientation import (
CLIPPED_ORIENTATIONS,
rotate_bounding_box,
)
def map_tesseract_script(script: str) -> str:
@ -40,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
orientation: int,
im_size: Tuple[int, int],
) -> BoundingRectangle:
# box is in the top, left, height, width format, top left coordinates
# bbox is in the top, left, height, width format, top left coordinates
# Tesseract ran on the document rotated by minus `orientation`, so we have to
# rotate the detected box back by `orientation`
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
rect = BoundingRectangle(
r_x0=rect.r_x0 / scale,
@ -51,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
r_y2=rect.r_y2 / scale,
r_x3=rect.r_x3 / scale,
r_y3=rect.r_y3 / scale,
coord_origin=CoordOrigin.TOPLEFT,
coord_origin=rect.coord_origin,
)
if original_offset is not None:
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:

View File

@ -1,11 +1,24 @@
from collections import Counter
from operator import itemgetter
from typing import Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
from docling_core.types.doc.page import BoundingRectangle, TextCell
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]


def _clipped_orientation(angle: float) -> int:
    """Return the entry of ``CLIPPED_ORIENTATIONS`` closest to ``angle``.

    The distance is measured around the circle, so an angle just below 360
    degrees (for example 359.5) snaps to 0 rather than to 270, and a negative
    angle is treated like its positive equivalent modulo 360.

    Args:
        angle: Angle in degrees.

    Returns:
        The nearest clipped orientation. On an exact tie the smaller
        orientation wins, because candidates are compared as
        ``(distance, orientation)`` tuples.
    """

    def _circular_distance(orientation: int) -> float:
        delta = abs(angle - orientation) % 360
        # Going the other way round the circle may be the shorter path.
        return min(delta, 360 - delta)

    return min((_circular_distance(o), o) for o in CLIPPED_ORIENTATIONS)[1]
def detect_orientation(cells: list[TextCell]) -> int:
    """Return the most common clipped orientation among ``cells``.

    Each cell's ``rect.angle_360`` is snapped with ``_clipped_orientation``
    and the snapped values are tallied.

    Args:
        cells: Text cells to inspect.

    Returns:
        ``0`` when ``cells`` is empty, otherwise the snapped orientation with
        the highest count. If several orientations share the highest count,
        the one encountered first in ``cells`` is returned.
    """
    if not cells:
        return 0

    votes: Counter = Counter()
    for cell in cells:
        votes[_clipped_orientation(cell.rect.angle_360)] += 1

    # max() keeps the first maximal item, i.e. the earliest-seen orientation.
    winner, _ = max(votes.items(), key=itemgetter(1))
    return winner
def rotate_bounding_box(
bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:

View File

@ -213,10 +213,10 @@
"prov": [
{
"bbox": [
139.66741943359375,
139.66746520996094,
322.5054626464844,
475.00927734375,
454.45458984375
475.0093078613281,
454.4546203613281
],
"page": 1,
"span": [

View File

@ -2705,7 +2705,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373531937599182,
"cells": [
{
"index": 0,
@ -2745,7 +2745,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858677744865417,
"cells": [
{
"index": 1,
@ -2785,7 +2785,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -3155,7 +3155,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591910243034363,
"cells": [
{
"index": 15,
@ -3339,9 +3339,9 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"l": 139.66746520996094,
"t": 337.5453796386719,
"r": 475.0093078613281,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
},
@ -7846,7 +7846,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -8243,9 +8243,9 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"l": 139.66746520996094,
"t": 337.5453796386719,
"r": 475.0093078613281,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
},
@ -13641,7 +13641,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373531937599182,
"cells": [
{
"index": 0,
@ -13687,7 +13687,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858677744865417,
"cells": [
{
"index": 1,
@ -13733,7 +13733,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -14121,7 +14121,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591910243034363,
"cells": [
{
"index": 15,
@ -14311,9 +14311,9 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"l": 139.66746520996094,
"t": 337.5453796386719,
"r": 475.0093078613281,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
},
@ -19701,7 +19701,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -20116,7 +20116,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -20504,7 +20504,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591910243034363,
"cells": [
{
"index": 15,
@ -20694,9 +20694,9 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"l": 139.66746520996094,
"t": 337.5453796386719,
"r": 475.0093078613281,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
},
@ -26084,7 +26084,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -26499,7 +26499,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373531937599182,
"cells": [
{
"index": 0,
@ -26545,7 +26545,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858677744865417,
"cells": [
{
"index": 1,

View File

@ -1,2 +1,2 @@
<doctag><text><loc_59><loc_46><loc_424><loc_90>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@ -1,3 +1,8 @@
<document>
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
<table>
<location><page_1><loc_9><loc_45><loc_70><loc_86></location>
<row_0><col_0><col_header>Vertically merged</col_0><col_1><col_header>Other merged column</col_1><col_2><col_header>Yet another column</col_2></row_0>
<row_1><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_1>
<row_2><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_2>
</table>
</document>

View File

@ -27,53 +27,321 @@
"file-info": {
"filename": "ocr_test.pdf",
"filename-prov": null,
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
"document-hash": "0f391d12850f72bb91897f7f3bebfd4a0a8357e2a883ac1f664e32342c04e418",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
"hash": "32f328168da3f69890a725c1168799f9ff7337249e98b1f36c12965551477be5",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{
"prov": [
{
"bbox": [
69.6796630536824,
689.0124221922704,
504.8720051760782,
764.9216921155637
69.04969024658203,
277.41973876953125,
551.0990600585938,
524.3504486083984
],
"page": 1,
"span": [
0,
94
0
],
"__ref_s3_data": null
}
],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"type": "paragraph",
"text": "",
"type": "table",
"payload": null,
"name": "Text",
"font": null
"#-cols": 3,
"#-rows": 3,
"data": [
[
{
"bbox": [
97.33333333333333,
105.66666666666666,
190.0,
126.33333333333334
],
"spans": [
[
0,
0
]
],
"text": "Vertically merged",
"type": "col_header",
"col": 0,
"col-header": true,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
232.66666666666666,
105.66666666666666,
364.0,
126.33333333333334
],
"spans": [
[
0,
1
]
],
"text": "Other merged column",
"type": "col_header",
"col": 1,
"col-header": true,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
406.3333333333333,
105.66666666666666,
518.3333333333333,
121.66666666666666
],
"spans": [
[
0,
2
]
],
"text": "Yet another column",
"type": "col_header",
"col": 2,
"col-header": true,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
],
[
{
"bbox": [
121.66666666666667,
204.33333333333334,
168.66666666666666,
220.0
],
"spans": [
[
1,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
247.0,
188.33333333333331,
349.6666666666667,
204.33333333333334
],
"spans": [
[
1,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
408.3333333333333,
188.33333333333331,
514.0,
204.33333333333334
],
"spans": [
[
1,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
}
],
[
{
"bbox": [
121.66666666666667,
284.0,
168.66666666666666,
300.0
],
"spans": [
[
2,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
247.0,
268.0,
349.6666666666667,
284.0
],
"spans": [
[
2,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
408.3333333333333,
268.0,
514.0,
284.0
],
"spans": [
[
2,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
}
]
],
"model": null,
"bounding-box": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 841.9216918945312,
"height": 612.0,
"page": 1,
"width": 595.201171875
"width": 792.0
}
],
"page-footers": [],

View File

@ -1 +1,4 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
| Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
| value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +0,0 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document>

View File

@ -1 +0,0 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1 +0,0 @@
package

View File

@ -1 +0,0 @@
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]

View File

@ -1,4 +1,8 @@
<document>
<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
<table>
<location><page_1><loc_30><loc_14><loc_91><loc_55></location>
<row_0><col_0><col_header>Vertically merged</col_0><col_1><col_header>Other merged column</col_1><col_2><col_header>Yet another column</col_2></row_0>
<row_1><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_1>
<row_2><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_2>
</table>
</document>

View File

@ -27,13 +27,13 @@
"file-info": {
"filename": "ocr_test_rotated_180.pdf",
"filename-prov": null,
"document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982",
"document-hash": "361fa0fc8db9c3a973d316d08509ac78cc0e7f81dea94358319092640d439ca0",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49",
"hash": "ab89ee70d4aee0b8dc5ed72ad42e16e98a8ec9c2eea1e03d99b50c25bbc5a806",
"model": "default",
"page": 1
}
@ -41,62 +41,307 @@
},
"main-text": [
{
"prov": [
{
"bbox": [
441.2561096985719,
131.89488404865142,
522.0347860494834,
151.87873262042876
],
"page": 1,
"span": [
0,
7
],
"__ref_s3_data": null
}
],
"text": "package",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{
"prov": [
{
"bbox": [
89.23887497045128,
77.02339852098021,
523.208764293368,
124.75312428291147
240.90093994140625,
87.64955139160156,
722.950309753418,
334.58026123046875
],
"page": 1,
"span": [
0,
86
0
],
"__ref_s3_data": null
}
],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"type": "paragraph",
"text": "",
"type": "table",
"payload": null,
"name": "Text",
"font": null
"#-cols": 3,
"#-rows": 3,
"data": [
[
{
"bbox": [
97.33333333333337,
105.66666666666669,
190.0,
126.33333333333337
],
"spans": [
[
0,
0
]
],
"text": "Vertically merged",
"type": "col_header",
"col": 0,
"col-header": true,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
232.33333333333326,
105.66666666666669,
363.6666666666667,
126.33333333333337
],
"spans": [
[
0,
1
]
],
"text": "Other merged column",
"type": "col_header",
"col": 1,
"col-header": true,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
406.3333333333333,
105.66666666666669,
518.0,
121.66666666666663
],
"spans": [
[
0,
2
]
],
"text": "Yet another column",
"type": "col_header",
"col": 2,
"col-header": true,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
],
[
{
"bbox": [
121.66666666666663,
204.0,
168.66666666666663,
220.0
],
"spans": [
[
1,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
247.0,
188.0,
349.6666666666667,
204.0
],
"spans": [
[
1,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
408.3333333333333,
188.0,
514.0,
204.0
],
"spans": [
[
1,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
}
],
[
{
"bbox": [
121.66666666666663,
284.0,
168.66666666666663,
300.0
],
"spans": [
[
2,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
247.0,
268.0,
349.6666666666667,
284.0
],
"spans": [
[
2,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
408.3333333333333,
268.0,
514.0,
284.0
],
"spans": [
[
2,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
}
]
],
"model": null,
"bounding-box": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 841.9216918945312,
"height": 612.0,
"page": 1,
"width": 595.201171875
"width": 792.0
}
],
"page-footers": [],

View File

@ -1,3 +1,4 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
| Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
| value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,6 @@
<document>
<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
<table>
<location><page_1><loc_45><loc_30><loc_86><loc_91></location>
<row_0><col_0><body>Yet another value</col_0><col_1><body>Some other value</col_1><col_2><body>value</col_2></row_0>
</table>
</document>

View File

@ -27,53 +27,149 @@
"file-info": {
"filename": "ocr_test_rotated_270.pdf",
"filename-prov": null,
"document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14",
"document-hash": "753140dc9b8c39b67c6f6712e2a1de4c364c808ca09d13dd05b79c23192429dc",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304",
"hash": "c8fa256d58940f76c5e0ec6b65548a2e939f867c2c75d0ee27f5f70ff32a44be",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{
"prov": [
{
"bbox": [
690.2441821046808,
442.39487414368364,
709.8255852011977,
523.076601235155
277.4178771972656,
240.90216064453125,
524.3541717529297,
722.9614028930664
],
"page": 1,
"span": [
0,
7
0
],
"__ref_s3_data": null
}
],
"text": "package",
"type": "paragraph",
"text": "",
"type": "table",
"payload": null,
"name": "Text",
"font": null
"#-cols": 3,
"#-rows": 1,
"data": [
[
{
"bbox": [
98.0,
296.6666666666667,
203.66666666666669,
344.0
],
"spans": [
[
0,
0
]
],
"text": "Yet another value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
262.3333333333333,
296.6666666666667,
365.0,
344.0
],
"spans": [
[
0,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
443.33333333333337,
312.0,
490.33333333333337,
328.0
],
"spans": [
[
0,
2
]
],
"text": "value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
]
],
"model": null,
"bounding-box": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 595.201171875,
"height": 792.0,
"page": 1,
"width": 841.9216918945312
"width": 612.0
}
],
"page-footers": [],

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,5 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
<table>
<location><page_1><loc_14><loc_9><loc_55><loc_70></location>
</table>
</document>

View File

@ -27,53 +27,62 @@
"file-info": {
"filename": "ocr_test_rotated_90.pdf",
"filename-prov": null,
"document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6",
"document-hash": "418ae4425f514f002bd4223ea3003c17f319cbeafd67801732d58f2bedb3bd91",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3",
"hash": "36315c08dc861ecde4be6179d2f155da0519b93e0311c290f8db164f593d36d8",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{
"prov": [
{
"bbox": [
131.21306574279092,
74.12495603322407,
152.19606490864376,
154.19400205373182
87.64582824707031,
69.0385971069336,
334.5821228027344,
551.0978393554688
],
"page": 1,
"span": [
0,
7
0
],
"__ref_s3_data": null
}
],
"text": "package",
"type": "paragraph",
"text": "",
"type": "table",
"payload": null,
"name": "Text",
"font": null
"#-cols": 0,
"#-rows": 0,
"data": [],
"model": null,
"bounding-box": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 595.201171875,
"height": 792.0,
"page": 1,
"width": 841.9216918945312
"width": 612.0
}
],
"page-footers": [],

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
<doctag><otsl><loc_44><loc_72><loc_348><loc_273><ched>Vertically merged<ched>Other merged column<ched>Yet another column<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl></otsl>
</doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 14853448746796404529,
"binary_hash": 14846044078209721391,
"filename": "ocr_test.pdf"
},
"furniture": {
@ -18,7 +18,7 @@
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
"$ref": "#/tables/0"
}
],
"content_layer": "body",
@ -26,44 +26,402 @@
"label": "unspecified"
},
"groups": [],
"texts": [
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/texts/0",
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"label": "table",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 69.68,
"t": 764.92,
"r": 504.87,
"b": 689.01,
"l": 69.05,
"t": 524.35,
"r": 551.1,
"b": 277.42,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
94
0
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 204.33,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.67,
"t": 105.67,
"r": 364.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.33,
"r": 349.67,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.33,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.33,
"r": 514.0,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 3,
"num_cols": 3,
"grid": [
[
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.67,
"t": 105.67,
"r": 364.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.33,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 204.33,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.33,
"r": 349.67,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.33,
"r": 514.0,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 595.2,
"height": 841.92
"width": 792.0,
"height": 612.0
},
"page_no": 1
}

View File

@ -1 +1,4 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
| Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
| value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,2 @@
<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
<doctag><otsl><loc_152><loc_227><loc_456><loc_428><ched>Vertically merged<ched>Other merged column<ched>Yet another column<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl></otsl>
</doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test_rotated_180",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 2530576989861832966,
"binary_hash": 16151733167151414937,
"filename": "ocr_test_rotated_180.pdf"
},
"furniture": {
@ -18,10 +18,7 @@
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
"$ref": "#/tables/0"
}
],
"content_layer": "body",
@ -29,71 +26,402 @@
"label": "unspecified"
},
"groups": [],
"texts": [
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/texts/0",
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"label": "table",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 441.26,
"t": 151.88,
"r": 522.03,
"b": 131.89,
"l": 240.9,
"t": 334.58,
"r": 722.95,
"b": 87.65,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
0
]
}
],
"orig": "package",
"text": "package"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 204.0,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.33,
"t": 105.67,
"r": 363.67,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.0,
"r": 349.67,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.0,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.0,
"r": 514.0,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 3,
"num_cols": 3,
"grid": [
[
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.33,
"t": 105.67,
"r": 363.67,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.0,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 204.0,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.0,
"r": 349.67,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.0,
"r": 514.0,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 89.24,
"t": 124.75,
"r": 523.21,
"b": 77.02,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
"annotations": []
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 595.2,
"height": 841.92
"width": 792.0,
"height": 612.0
},
"page_no": 1
}

View File

@ -1,3 +1,4 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
| Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
| value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,2 @@
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_410><loc_61><loc_422><loc_128>package</text>
<doctag><otsl><loc_227><loc_44><loc_428><loc_348><fcel>Yet another value<fcel>Some other value<fcel>value<nl></otsl>
</doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test_rotated_270",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 10890858393843077593,
"binary_hash": 8365439800722100027,
"filename": "ocr_test_rotated_270.pdf"
},
"furniture": {
@ -18,10 +18,7 @@
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
"$ref": "#/tables/0"
}
],
"content_layer": "body",
@ -29,71 +26,170 @@
"label": "unspecified"
},
"groups": [],
"texts": [
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "page_header",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 717.17,
"t": 524.3,
"r": 764.9,
"b": 90.33,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"self_ref": "#/texts/1",
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"label": "table",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 690.24,
"t": 523.08,
"r": 709.83,
"b": 442.39,
"l": 277.42,
"t": 722.96,
"r": 524.35,
"b": 240.9,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
0
]
}
],
"orig": "package",
"text": "package"
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 443.33,
"t": 312.0,
"r": 490.33,
"b": 328.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 262.33,
"t": 296.67,
"r": 365.0,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 98.0,
"t": 296.67,
"r": 203.67,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 1,
"num_cols": 3,
"grid": [
[
{
"bbox": {
"l": 98.0,
"t": 296.67,
"r": 203.67,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 262.33,
"t": 296.67,
"r": 365.0,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 443.33,
"t": 312.0,
"r": 490.33,
"b": 328.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 841.92,
"height": 595.2
"width": 612.0,
"height": 792.0
},
"page_no": 1
}

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,2 @@
<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_370><loc_90><loc_438>package</text>
<doctag><otsl><loc_72><loc_152><loc_273><loc_456></otsl>
</doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test_rotated_90",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 6989291015361162334,
"binary_hash": 6752841177619701916,
"filename": "ocr_test_rotated_90.pdf"
},
"furniture": {
@ -18,10 +18,7 @@
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
"$ref": "#/tables/0"
}
],
"content_layer": "body",
@ -29,71 +26,52 @@
"label": "unspecified"
},
"groups": [],
"texts": [
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "page_header",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 77.1,
"t": 506.07,
"r": 126.08,
"b": 71.88,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"self_ref": "#/texts/1",
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"label": "table",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 131.21,
"t": 154.19,
"r": 152.2,
"b": 74.12,
"l": 87.65,
"t": 551.1,
"r": 334.58,
"b": 69.04,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
0
]
}
],
"orig": "package",
"text": "package"
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [],
"num_rows": 0,
"num_cols": 0,
"grid": []
},
"annotations": []
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 841.92,
"height": 595.2
"width": 612.0,
"height": 792.0
},
"page_no": 1
}

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -73,8 +73,8 @@ def test_e2e_conversions():
# only works on mac
if "darwin" == sys.platform:
engines.append((OcrMacOptions(), True))
engines.append((OcrMacOptions(force_full_page_ocr=True), True))
engines.append((OcrMacOptions(), False))
engines.append((OcrMacOptions(force_full_page_ocr=True), False))
for ocr_options, supports_rotation in engines:
print(