This commit is contained in:
Clément Doumouro 2025-07-10 04:37:57 +00:00 committed by GitHub
commit fb900115ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
49 changed files with 40355 additions and 2146 deletions

View File

@ -2,6 +2,7 @@ import copy
import logging import logging
import warnings import warnings
from collections.abc import Iterable from collections.abc import Iterable
from copy import deepcopy
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -19,6 +20,7 @@ from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
from docling.utils.visualization import draw_clusters from docling.utils.visualization import draw_clusters
@ -157,7 +159,9 @@ class LayoutModel(BasePageModel):
assert page.size is not None assert page.size is not None
page_image = page.get_image(scale=1.0) page_image = page.get_image(scale=1.0)
assert page_image is not None assert page_image is not None
page_orientation = detect_orientation(page.cells)
if page_orientation:
page_image = page_image.rotate(-page_orientation, expand=True)
clusters = [] clusters = []
for ix, pred_item in enumerate( for ix, pred_item in enumerate(
self.layout_predictor.predict(page_image) self.layout_predictor.predict(page_image)
@ -168,11 +172,16 @@ class LayoutModel(BasePageModel):
.replace(" ", "_") .replace(" ", "_")
.replace("-", "_") .replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types ) # Temporary, until docling-ibm-model uses docling-core types
bbox = BoundingBox.model_validate(pred_item)
if page_orientation:
bbox = rotate_bounding_box(
bbox, page_orientation, page_image.size
).to_bounding_box()
cluster = Cluster( cluster = Cluster(
id=ix, id=ix,
label=label, label=label,
confidence=pred_item["confidence"], confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item), bbox=bbox,
cells=[], cells=[],
) )
clusters.append(cluster) clusters.append(cluster)

View File

@ -107,10 +107,10 @@ class OcrMacModel(BaseOcrModel):
x2 = x1 + w * im_width x2 = x1 + w * im_width
y1 = y2 - h * im_height y1 = y2 - h * im_height
left = x1 / self.scale left = x1 / self.scale + ocr_rect.l
top = y1 / self.scale top = y1 / self.scale + ocr_rect.t
right = x2 / self.scale right = x2 / self.scale + ocr_rect.l
bottom = y2 / self.scale bottom = y2 / self.scale + ocr_rect.t
cells.append( cells.append(
TextCell( TextCell(

View File

@ -1,8 +1,7 @@
import copy import copy
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Iterable, Optional, Tuple, cast
import numpy import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
TextCellUnit, TextCellUnit,
) )
from PIL import ImageDraw from PIL import ImageDraw
from PIL.Image import Image
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import Page, Table, TableStructurePrediction from docling.datamodel.base_models import Page, Table, TableStructurePrediction
@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.orientation import detect_orientation, rotate_bounding_box
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
_model_repo_folder = "ds4sd--docling-models" _model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer" _model_path = "model_artifacts/tableformer"
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
page.predictions.tablestructure = ( page.predictions.tablestructure = (
TableStructurePrediction() TableStructurePrediction()
) # dummy ) # dummy
cells_orientation = detect_orientation(page.cells)
in_tables = [ # Keep only table bboxes
( in_tables_clusters = [
cluster, cluster
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
for cluster in page.predictions.layout.clusters for cluster in page.predictions.layout.clusters
if cluster.label if cluster.label in self._table_labels
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
] ]
if not len(in_tables):
if not len(in_tables_clusters):
yield page yield page
continue continue
# Rotate and scale table image
page_im = cast(Image, page.get_image())
scaled_page_im: Image = cast(
Image, page.get_image(scale=self.scale)
)
if cells_orientation:
scaled_page_im = scaled_page_im.rotate(
-cells_orientation, expand=True
)
page_input = { page_input = {
"width": page.size.width * self.scale, "width": scaled_page_im.size[0],
"height": page.size.height * self.scale, "height": scaled_page_im.size[1],
"image": numpy.asarray(page.get_image(scale=self.scale)), "image": numpy.asarray(scaled_page_im),
} }
# Rotate and scale table cells
in_tables = [
(
c,
[
round(x) * self.scale
for x in _rotate_bbox(
c.bbox,
orientation=-cells_orientation,
im_size=page_im.size,
)
.to_top_left_origin(page_im.size[1])
.as_tuple()
],
)
for c in in_tables_clusters
]
table_clusters, table_bboxes = zip(*in_tables) table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes): if len(table_bboxes):
@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
scale=self.scale scale=self.scale
) )
) )
new_bbox = _rotate_bbox(
new_cell.to_bounding_box(),
orientation=cells_orientation,
im_size=scaled_page_im.size,
).model_dump()
tokens.append( tokens.append(
{ {
"id": new_cell.index, "id": new_cell.index,
"text": new_cell.text, "text": new_cell.text,
"bbox": new_cell.rect.to_bounding_box().model_dump(), "bbox": new_bbox,
} }
) )
page_input["tokens"] = tokens page_input["tokens"] = tokens
@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
) )
yield page yield page
def _rotate_bbox(
    bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
) -> BoundingBox:
    """Rotate *bbox* by *orientation*, leaving it untouched when there is none.

    Args:
        bbox: The bounding box to rotate.
        orientation: Rotation angle forwarded to ``rotate_bounding_box``; a
            falsy value (``0``) means no rotation is needed.
        im_size: Image size forwarded to ``rotate_bounding_box``.

    Returns:
        The input ``bbox`` itself when ``orientation`` is falsy, otherwise the
        rotated rectangle converted back to a ``BoundingBox``.
    """
    # Guard clause: nothing to do for an upright page.
    if not orientation:
        return bbox
    rotated_rect = rotate_bounding_box(bbox, orientation, im_size)
    return rotated_rect.to_bounding_box()

View File

@ -3,7 +3,10 @@ from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle from docling_core.types.doc.page import BoundingRectangle
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box from docling.utils.orientation import (
CLIPPED_ORIENTATIONS,
rotate_bounding_box,
)
def map_tesseract_script(script: str) -> str: def map_tesseract_script(script: str) -> str:
@ -40,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
orientation: int, orientation: int,
im_size: Tuple[int, int], im_size: Tuple[int, int],
) -> BoundingRectangle: ) -> BoundingRectangle:
# box is in the top, left, height, width format, top left coordinates # bbox is in the top, left, height, width format, top left coordinates
# Tesseract was run on the document rotated by minus the orientation, so we have
# to rotate the detected box back by the orientation angle
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size) rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
rect = BoundingRectangle( rect = BoundingRectangle(
r_x0=rect.r_x0 / scale, r_x0=rect.r_x0 / scale,
@ -51,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
r_y2=rect.r_y2 / scale, r_y2=rect.r_y2 / scale,
r_x3=rect.r_x3 / scale, r_x3=rect.r_x3 / scale,
r_y3=rect.r_y3 / scale, r_y3=rect.r_y3 / scale,
coord_origin=CoordOrigin.TOPLEFT, coord_origin=rect.coord_origin,
) )
if original_offset is not None: if original_offset is not None:
if original_offset.coord_origin is not CoordOrigin.TOPLEFT: if original_offset.coord_origin is not CoordOrigin.TOPLEFT:

View File

@ -1,11 +1,24 @@
from collections import Counter
from operator import itemgetter
from typing import Tuple from typing import Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle from docling_core.types.doc.page import BoundingRectangle, TextCell
CLIPPED_ORIENTATIONS = [0, 90, 180, 270] CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
def _clipped_orientation(angle: float) -> int:
return min((abs(angle - o) % 360, o) for o in CLIPPED_ORIENTATIONS)[1]
def detect_orientation(cells: list[TextCell]) -> int:
    """Return the most common clipped orientation among *cells*.

    Each cell's ``rect.angle_360`` is clipped with ``_clipped_orientation``
    and the value that occurs most often is returned.

    Args:
        cells: Text cells whose rectangle angles are inspected.

    Returns:
        The most frequent clipped orientation, or ``0`` when ``cells`` is
        empty. On a tie, the orientation encountered first wins.
    """
    if not cells:
        return 0
    tally = Counter()
    for cell in cells:
        tally[_clipped_orientation(cell.rect.angle_360)] += 1
    # Iterating the counter follows insertion order, so ``max`` keeps the
    # first orientation that reaches the highest count.
    return max(tally, key=tally.__getitem__)
def rotate_bounding_box( def rotate_bounding_box(
bbox: BoundingBox, angle: int, im_size: Tuple[int, int] bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle: ) -> BoundingRectangle:

View File

@ -213,10 +213,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
139.66741943359375, 139.66746520996094,
322.5054626464844, 322.5054626464844,
475.00927734375, 475.0093078613281,
454.45458984375 454.4546203613281
], ],
"page": 1, "page": 1,
"span": [ "span": [

View File

@ -2705,7 +2705,7 @@
"b": 102.78223000000003, "b": 102.78223000000003,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9373534917831421, "confidence": 0.9373531937599182,
"cells": [ "cells": [
{ {
"index": 0, "index": 0,
@ -2745,7 +2745,7 @@
"b": 102.78223000000003, "b": 102.78223000000003,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.8858680725097656, "confidence": 0.8858677744865417,
"cells": [ "cells": [
{ {
"index": 1, "index": 1,
@ -2785,7 +2785,7 @@
"b": 152.90697999999998, "b": 152.90697999999998,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9806433916091919, "confidence": 0.9806435108184814,
"cells": [ "cells": [
{ {
"index": 2, "index": 2,
@ -3155,7 +3155,7 @@
"b": 327.98218, "b": 327.98218,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9591909050941467, "confidence": 0.9591910243034363,
"cells": [ "cells": [
{ {
"index": 15, "index": 15,
@ -3339,9 +3339,9 @@
"id": 0, "id": 0,
"label": "table", "label": "table",
"bbox": { "bbox": {
"l": 139.66741943359375, "l": 139.66746520996094,
"t": 337.54541015625, "t": 337.5453796386719,
"r": 475.00927734375, "r": 475.0093078613281,
"b": 469.4945373535156, "b": 469.4945373535156,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
@ -7846,7 +7846,7 @@
"b": 518.17419, "b": 518.17419,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9589294195175171, "confidence": 0.9589295387268066,
"cells": [ "cells": [
{ {
"index": 91, "index": 91,
@ -8243,9 +8243,9 @@
"id": 0, "id": 0,
"label": "table", "label": "table",
"bbox": { "bbox": {
"l": 139.66741943359375, "l": 139.66746520996094,
"t": 337.54541015625, "t": 337.5453796386719,
"r": 475.00927734375, "r": 475.0093078613281,
"b": 469.4945373535156, "b": 469.4945373535156,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
@ -13641,7 +13641,7 @@
"b": 102.78223000000003, "b": 102.78223000000003,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9373534917831421, "confidence": 0.9373531937599182,
"cells": [ "cells": [
{ {
"index": 0, "index": 0,
@ -13687,7 +13687,7 @@
"b": 102.78223000000003, "b": 102.78223000000003,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.8858680725097656, "confidence": 0.8858677744865417,
"cells": [ "cells": [
{ {
"index": 1, "index": 1,
@ -13733,7 +13733,7 @@
"b": 152.90697999999998, "b": 152.90697999999998,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9806433916091919, "confidence": 0.9806435108184814,
"cells": [ "cells": [
{ {
"index": 2, "index": 2,
@ -14121,7 +14121,7 @@
"b": 327.98218, "b": 327.98218,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9591909050941467, "confidence": 0.9591910243034363,
"cells": [ "cells": [
{ {
"index": 15, "index": 15,
@ -14311,9 +14311,9 @@
"id": 0, "id": 0,
"label": "table", "label": "table",
"bbox": { "bbox": {
"l": 139.66741943359375, "l": 139.66746520996094,
"t": 337.54541015625, "t": 337.5453796386719,
"r": 475.00927734375, "r": 475.0093078613281,
"b": 469.4945373535156, "b": 469.4945373535156,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
@ -19701,7 +19701,7 @@
"b": 518.17419, "b": 518.17419,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9589294195175171, "confidence": 0.9589295387268066,
"cells": [ "cells": [
{ {
"index": 91, "index": 91,
@ -20116,7 +20116,7 @@
"b": 152.90697999999998, "b": 152.90697999999998,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9806433916091919, "confidence": 0.9806435108184814,
"cells": [ "cells": [
{ {
"index": 2, "index": 2,
@ -20504,7 +20504,7 @@
"b": 327.98218, "b": 327.98218,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9591909050941467, "confidence": 0.9591910243034363,
"cells": [ "cells": [
{ {
"index": 15, "index": 15,
@ -20694,9 +20694,9 @@
"id": 0, "id": 0,
"label": "table", "label": "table",
"bbox": { "bbox": {
"l": 139.66741943359375, "l": 139.66746520996094,
"t": 337.54541015625, "t": 337.5453796386719,
"r": 475.00927734375, "r": 475.0093078613281,
"b": 469.4945373535156, "b": 469.4945373535156,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
@ -26084,7 +26084,7 @@
"b": 518.17419, "b": 518.17419,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9589294195175171, "confidence": 0.9589295387268066,
"cells": [ "cells": [
{ {
"index": 91, "index": 91,
@ -26499,7 +26499,7 @@
"b": 102.78223000000003, "b": 102.78223000000003,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.9373534917831421, "confidence": 0.9373531937599182,
"cells": [ "cells": [
{ {
"index": 0, "index": 0,
@ -26545,7 +26545,7 @@
"b": 102.78223000000003, "b": 102.78223000000003,
"coord_origin": "TOPLEFT" "coord_origin": "TOPLEFT"
}, },
"confidence": 0.8858680725097656, "confidence": 0.8858677744865417,
"cells": [ "cells": [
{ {
"index": 1, "index": 1,

View File

@ -1,2 +1,2 @@
<doctag><text><loc_59><loc_46><loc_424><loc_90>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text> <doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag> </doctag>

View File

@ -1,3 +1,8 @@
<document> <document>
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph> <table>
<location><page_1><loc_9><loc_45><loc_70><loc_86></location>
<row_0><col_0><col_header>Vertically merged</col_0><col_1><col_header>Other merged column</col_1><col_2><col_header>Yet another column</col_2></row_0>
<row_1><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_1>
<row_2><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_2>
</table>
</document> </document>

View File

@ -27,53 +27,321 @@
"file-info": { "file-info": {
"filename": "ocr_test.pdf", "filename": "ocr_test.pdf",
"filename-prov": null, "filename-prov": null,
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "document-hash": "0f391d12850f72bb91897f7f3bebfd4a0a8357e2a883ac1f664e32342c04e418",
"#-pages": 1, "#-pages": 1,
"collection-name": null, "collection-name": null,
"description": null, "description": null,
"page-hashes": [ "page-hashes": [
{ {
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "hash": "32f328168da3f69890a725c1168799f9ff7337249e98b1f36c12965551477be5",
"model": "default", "model": "default",
"page": 1 "page": 1
} }
] ]
}, },
"main-text": [ "main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{ {
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
69.6796630536824, 69.04969024658203,
689.0124221922704, 277.41973876953125,
504.8720051760782, 551.0990600585938,
764.9216921155637 524.3504486083984
], ],
"page": 1, "page": 1,
"span": [ "span": [
0, 0,
94 0
], ],
"__ref_s3_data": null "__ref_s3_data": null
} }
], ],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "",
"type": "paragraph", "type": "table",
"payload": null, "payload": null,
"name": "Text", "#-cols": 3,
"font": null "#-rows": 3,
"data": [
[
{
"bbox": [
97.33333333333333,
105.66666666666666,
190.0,
126.33333333333334
],
"spans": [
[
0,
0
]
],
"text": "Vertically merged",
"type": "col_header",
"col": 0,
"col-header": true,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
232.66666666666666,
105.66666666666666,
364.0,
126.33333333333334
],
"spans": [
[
0,
1
]
],
"text": "Other merged column",
"type": "col_header",
"col": 1,
"col-header": true,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
406.3333333333333,
105.66666666666666,
518.3333333333333,
121.66666666666666
],
"spans": [
[
0,
2
]
],
"text": "Yet another column",
"type": "col_header",
"col": 2,
"col-header": true,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
],
[
{
"bbox": [
121.66666666666667,
204.33333333333334,
168.66666666666666,
220.0
],
"spans": [
[
1,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
247.0,
188.33333333333331,
349.6666666666667,
204.33333333333334
],
"spans": [
[
1,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
408.3333333333333,
188.33333333333331,
514.0,
204.33333333333334
],
"spans": [
[
1,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
}
],
[
{
"bbox": [
121.66666666666667,
284.0,
168.66666666666666,
300.0
],
"spans": [
[
2,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
247.0,
268.0,
349.6666666666667,
284.0
],
"spans": [
[
2,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
408.3333333333333,
268.0,
514.0,
284.0
],
"spans": [
[
2,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
}
]
],
"model": null,
"bounding-box": null
} }
], ],
"figures": [],
"tables": [],
"bitmaps": null, "bitmaps": null,
"equations": [], "equations": [],
"footnotes": [], "footnotes": [],
"page-dimensions": [ "page-dimensions": [
{ {
"height": 841.9216918945312, "height": 612.0,
"page": 1, "page": 1,
"width": 595.201171875 "width": 792.0
} }
], ],
"page-footers": [], "page-footers": [],

View File

@ -1 +1,4 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package | Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
| value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +0,0 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document>

View File

@ -1 +0,0 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1 +0,0 @@
package

View File

@ -1 +0,0 @@
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]

View File

@ -1,4 +1,8 @@
<document> <document>
<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph> <table>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph> <location><page_1><loc_30><loc_14><loc_91><loc_55></location>
<row_0><col_0><col_header>Vertically merged</col_0><col_1><col_header>Other merged column</col_1><col_2><col_header>Yet another column</col_2></row_0>
<row_1><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_1>
<row_2><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_2>
</table>
</document> </document>

View File

@ -27,13 +27,13 @@
"file-info": { "file-info": {
"filename": "ocr_test_rotated_180.pdf", "filename": "ocr_test_rotated_180.pdf",
"filename-prov": null, "filename-prov": null,
"document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "document-hash": "361fa0fc8db9c3a973d316d08509ac78cc0e7f81dea94358319092640d439ca0",
"#-pages": 1, "#-pages": 1,
"collection-name": null, "collection-name": null,
"description": null, "description": null,
"page-hashes": [ "page-hashes": [
{ {
"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "hash": "ab89ee70d4aee0b8dc5ed72ad42e16e98a8ec9c2eea1e03d99b50c25bbc5a806",
"model": "default", "model": "default",
"page": 1 "page": 1
} }
@ -41,62 +41,307 @@
}, },
"main-text": [ "main-text": [
{ {
"prov": [ "name": "Table",
{ "type": "table",
"bbox": [ "$ref": "#/tables/0"
441.2561096985719,
131.89488404865142,
522.0347860494834,
151.87873262042876
],
"page": 1,
"span": [
0,
7
],
"__ref_s3_data": null
}
],
"text": "package",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
89.23887497045128,
77.02339852098021,
523.208764293368,
124.75312428291147
],
"page": 1,
"span": [
0,
86
],
"__ref_s3_data": null
}
],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
} }
], ],
"figures": [], "figures": [],
"tables": [], "tables": [
{
"prov": [
{
"bbox": [
240.90093994140625,
87.64955139160156,
722.950309753418,
334.58026123046875
],
"page": 1,
"span": [
0,
0
],
"__ref_s3_data": null
}
],
"text": "",
"type": "table",
"payload": null,
"#-cols": 3,
"#-rows": 3,
"data": [
[
{
"bbox": [
97.33333333333337,
105.66666666666669,
190.0,
126.33333333333337
],
"spans": [
[
0,
0
]
],
"text": "Vertically merged",
"type": "col_header",
"col": 0,
"col-header": true,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
232.33333333333326,
105.66666666666669,
363.6666666666667,
126.33333333333337
],
"spans": [
[
0,
1
]
],
"text": "Other merged column",
"type": "col_header",
"col": 1,
"col-header": true,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
406.3333333333333,
105.66666666666669,
518.0,
121.66666666666663
],
"spans": [
[
0,
2
]
],
"text": "Yet another column",
"type": "col_header",
"col": 2,
"col-header": true,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
],
[
{
"bbox": [
121.66666666666663,
204.0,
168.66666666666663,
220.0
],
"spans": [
[
1,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
247.0,
188.0,
349.6666666666667,
204.0
],
"spans": [
[
1,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
},
{
"bbox": [
408.3333333333333,
188.0,
514.0,
204.0
],
"spans": [
[
1,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 1,
"row-header": false,
"row-span": [
1,
2
]
}
],
[
{
"bbox": [
121.66666666666663,
284.0,
168.66666666666663,
300.0
],
"spans": [
[
2,
0
]
],
"text": "value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
247.0,
268.0,
349.6666666666667,
284.0
],
"spans": [
[
2,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
},
{
"bbox": [
408.3333333333333,
268.0,
514.0,
284.0
],
"spans": [
[
2,
2
]
],
"text": "Yet another value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 2,
"row-header": false,
"row-span": [
2,
3
]
}
]
],
"model": null,
"bounding-box": null
}
],
"bitmaps": null, "bitmaps": null,
"equations": [], "equations": [],
"footnotes": [], "footnotes": [],
"page-dimensions": [ "page-dimensions": [
{ {
"height": 841.9216918945312, "height": 612.0,
"page": 1, "page": 1,
"width": 595.201171875 "width": 792.0
} }
], ],
"page-footers": [], "page-footers": [],

View File

@ -1,3 +1,4 @@
package | Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained | value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,6 @@
<document> <document>
<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph> <table>
<location><page_1><loc_45><loc_30><loc_86><loc_91></location>
<row_0><col_0><body>Yet another value</col_0><col_1><body>Some other value</col_1><col_2><body>value</col_2></row_0>
</table>
</document> </document>

View File

@ -27,53 +27,149 @@
"file-info": { "file-info": {
"filename": "ocr_test_rotated_270.pdf", "filename": "ocr_test_rotated_270.pdf",
"filename-prov": null, "filename-prov": null,
"document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14", "document-hash": "753140dc9b8c39b67c6f6712e2a1de4c364c808ca09d13dd05b79c23192429dc",
"#-pages": 1, "#-pages": 1,
"collection-name": null, "collection-name": null,
"description": null, "description": null,
"page-hashes": [ "page-hashes": [
{ {
"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304", "hash": "c8fa256d58940f76c5e0ec6b65548a2e939f867c2c75d0ee27f5f70ff32a44be",
"model": "default", "model": "default",
"page": 1 "page": 1
} }
] ]
}, },
"main-text": [ "main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{ {
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
690.2441821046808, 277.4178771972656,
442.39487414368364, 240.90216064453125,
709.8255852011977, 524.3541717529297,
523.076601235155 722.9614028930664
], ],
"page": 1, "page": 1,
"span": [ "span": [
0, 0,
7 0
], ],
"__ref_s3_data": null "__ref_s3_data": null
} }
], ],
"text": "package", "text": "",
"type": "paragraph", "type": "table",
"payload": null, "payload": null,
"name": "Text", "#-cols": 3,
"font": null "#-rows": 1,
"data": [
[
{
"bbox": [
98.0,
296.6666666666667,
203.66666666666669,
344.0
],
"spans": [
[
0,
0
]
],
"text": "Yet another value",
"type": "body",
"col": 0,
"col-header": false,
"col-span": [
0,
1
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
262.3333333333333,
296.6666666666667,
365.0,
344.0
],
"spans": [
[
0,
1
]
],
"text": "Some other value",
"type": "body",
"col": 1,
"col-header": false,
"col-span": [
1,
2
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
},
{
"bbox": [
443.33333333333337,
312.0,
490.33333333333337,
328.0
],
"spans": [
[
0,
2
]
],
"text": "value",
"type": "body",
"col": 2,
"col-header": false,
"col-span": [
2,
3
],
"row": 0,
"row-header": false,
"row-span": [
0,
1
]
}
]
],
"model": null,
"bounding-box": null
} }
], ],
"figures": [],
"tables": [],
"bitmaps": null, "bitmaps": null,
"equations": [], "equations": [],
"footnotes": [], "footnotes": [],
"page-dimensions": [ "page-dimensions": [
{ {
"height": 595.201171875, "height": 792.0,
"page": 1, "page": 1,
"width": 841.9216918945312 "width": 612.0
} }
], ],
"page-footers": [], "page-footers": [],

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,5 @@
<document> <document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph> <table>
<location><page_1><loc_14><loc_9><loc_55><loc_70></location>
</table>
</document> </document>

View File

@ -27,53 +27,62 @@
"file-info": { "file-info": {
"filename": "ocr_test_rotated_90.pdf", "filename": "ocr_test_rotated_90.pdf",
"filename-prov": null, "filename-prov": null,
"document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "document-hash": "418ae4425f514f002bd4223ea3003c17f319cbeafd67801732d58f2bedb3bd91",
"#-pages": 1, "#-pages": 1,
"collection-name": null, "collection-name": null,
"description": null, "description": null,
"page-hashes": [ "page-hashes": [
{ {
"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "hash": "36315c08dc861ecde4be6179d2f155da0519b93e0311c290f8db164f593d36d8",
"model": "default", "model": "default",
"page": 1 "page": 1
} }
] ]
}, },
"main-text": [ "main-text": [
{
"name": "Table",
"type": "table",
"$ref": "#/tables/0"
}
],
"figures": [],
"tables": [
{ {
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
131.21306574279092, 87.64582824707031,
74.12495603322407, 69.0385971069336,
152.19606490864376, 334.5821228027344,
154.19400205373182 551.0978393554688
], ],
"page": 1, "page": 1,
"span": [ "span": [
0, 0,
7 0
], ],
"__ref_s3_data": null "__ref_s3_data": null
} }
], ],
"text": "package", "text": "",
"type": "paragraph", "type": "table",
"payload": null, "payload": null,
"name": "Text", "#-cols": 0,
"font": null "#-rows": 0,
"data": [],
"model": null,
"bounding-box": null
} }
], ],
"figures": [],
"tables": [],
"bitmaps": null, "bitmaps": null,
"equations": [], "equations": [],
"footnotes": [], "footnotes": [],
"page-dimensions": [ "page-dimensions": [
{ {
"height": 595.201171875, "height": 792.0,
"page": 1, "page": 1,
"width": 841.9216918945312 "width": 612.0
} }
], ],
"page-footers": [], "page-footers": [],

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text> <doctag><otsl><loc_44><loc_72><loc_348><loc_273><ched>Vertically merged<ched>Other merged column<ched>Yet another column<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl></otsl>
</doctag> </doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test", "name": "ocr_test",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
"binary_hash": 14853448746796404529, "binary_hash": 14846044078209721391,
"filename": "ocr_test.pdf" "filename": "ocr_test.pdf"
}, },
"furniture": { "furniture": {
@ -18,7 +18,7 @@
"self_ref": "#/body", "self_ref": "#/body",
"children": [ "children": [
{ {
"$ref": "#/texts/0" "$ref": "#/tables/0"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -26,44 +26,402 @@
"label": "unspecified" "label": "unspecified"
}, },
"groups": [], "groups": [],
"texts": [ "texts": [],
"pictures": [],
"tables": [
{ {
"self_ref": "#/texts/0", "self_ref": "#/tables/0",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "text", "label": "table",
"prov": [ "prov": [
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 69.68, "l": 69.05,
"t": 764.92, "t": 524.35,
"r": 504.87, "r": 551.1,
"b": 689.01, "b": 277.42,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
94 0
] ]
} }
], ],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "captions": [],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package" "references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 204.33,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.67,
"t": 105.67,
"r": 364.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.33,
"r": 349.67,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.33,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.33,
"r": 514.0,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 3,
"num_cols": 3,
"grid": [
[
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.67,
"t": 105.67,
"r": 364.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.33,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 204.33,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.33,
"r": 349.67,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.33,
"r": 514.0,
"b": 204.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
} }
], ],
"pictures": [],
"tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [], "form_items": [],
"pages": { "pages": {
"1": { "1": {
"size": { "size": {
"width": 595.2, "width": 792.0,
"height": 841.92 "height": 612.0
}, },
"page_no": 1 "page_no": 1
} }

View File

@ -1 +1,4 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package | Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
| value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,2 @@
<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text> <doctag><otsl><loc_152><loc_227><loc_456><loc_428><ched>Vertically merged<ched>Other merged column<ched>Yet another column<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl></otsl>
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag> </doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test_rotated_180", "name": "ocr_test_rotated_180",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
"binary_hash": 2530576989861832966, "binary_hash": 16151733167151414937,
"filename": "ocr_test_rotated_180.pdf" "filename": "ocr_test_rotated_180.pdf"
}, },
"furniture": { "furniture": {
@ -18,10 +18,7 @@
"self_ref": "#/body", "self_ref": "#/body",
"children": [ "children": [
{ {
"$ref": "#/texts/0" "$ref": "#/tables/0"
},
{
"$ref": "#/texts/1"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -29,71 +26,402 @@
"label": "unspecified" "label": "unspecified"
}, },
"groups": [], "groups": [],
"texts": [ "texts": [],
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 441.26,
"t": 151.88,
"r": 522.03,
"b": 131.89,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
7
]
}
],
"orig": "package",
"text": "package"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 89.24,
"t": 124.75,
"r": 523.21,
"b": 77.02,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
}
],
"pictures": [], "pictures": [],
"tables": [], "tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 240.9,
"t": 334.58,
"r": 722.95,
"b": 87.65,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 204.0,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.33,
"t": 105.67,
"r": 363.67,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.0,
"r": 349.67,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.0,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.0,
"r": 514.0,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 3,
"num_cols": 3,
"grid": [
[
{
"bbox": {
"l": 97.33,
"t": 105.67,
"r": 190.0,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Vertically merged",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 232.33,
"t": 105.67,
"r": 363.67,
"b": 126.33,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Other merged column",
"column_header": true,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 406.33,
"t": 105.67,
"r": 518.0,
"b": 121.67,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another column",
"column_header": true,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 204.0,
"r": 168.67,
"b": 220.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 188.0,
"r": 349.67,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 188.0,
"r": 514.0,
"b": 204.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"bbox": {
"l": 121.67,
"t": 284.0,
"r": 168.67,
"b": 300.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 247.0,
"t": 268.0,
"r": 349.67,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 408.33,
"t": 268.0,
"r": 514.0,
"b": 284.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [], "key_value_items": [],
"form_items": [], "form_items": [],
"pages": { "pages": {
"1": { "1": {
"size": { "size": {
"width": 595.2, "width": 792.0,
"height": 841.92 "height": 612.0
}, },
"page_no": 1 "page_no": 1
} }

View File

@ -1,3 +1,4 @@
package | Vertically merged | Other merged column | Yet another column |
|---------------------|-----------------------|----------------------|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained | value | Some other value | Yet another value |
| value | Some other value | Yet another value |

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,2 @@
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header> <doctag><otsl><loc_227><loc_44><loc_428><loc_348><fcel>Yet another value<fcel>Some other value<fcel>value<nl></otsl>
<text><loc_410><loc_61><loc_422><loc_128>package</text>
</doctag> </doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test_rotated_270", "name": "ocr_test_rotated_270",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
"binary_hash": 10890858393843077593, "binary_hash": 8365439800722100027,
"filename": "ocr_test_rotated_270.pdf" "filename": "ocr_test_rotated_270.pdf"
}, },
"furniture": { "furniture": {
@ -18,10 +18,7 @@
"self_ref": "#/body", "self_ref": "#/body",
"children": [ "children": [
{ {
"$ref": "#/texts/0" "$ref": "#/tables/0"
},
{
"$ref": "#/texts/1"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -29,71 +26,170 @@
"label": "unspecified" "label": "unspecified"
}, },
"groups": [], "groups": [],
"texts": [ "texts": [],
"pictures": [],
"tables": [
{ {
"self_ref": "#/texts/0", "self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "page_header",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 717.17,
"t": 524.3,
"r": 764.9,
"b": 90.33,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"self_ref": "#/texts/1",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "text", "label": "table",
"prov": [ "prov": [
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 690.24, "l": 277.42,
"t": 523.08, "t": 722.96,
"r": 709.83, "r": 524.35,
"b": 442.39, "b": 240.9,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
7 0
] ]
} }
], ],
"orig": "package", "captions": [],
"text": "package" "references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"bbox": {
"l": 443.33,
"t": 312.0,
"r": 490.33,
"b": 328.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 262.33,
"t": 296.67,
"r": 365.0,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 98.0,
"t": 296.67,
"r": 203.67,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 1,
"num_cols": 3,
"grid": [
[
{
"bbox": {
"l": 98.0,
"t": 296.67,
"r": 203.67,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Yet another value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 262.33,
"t": 296.67,
"r": 365.0,
"b": 344.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Some other value",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"bbox": {
"l": 443.33,
"t": 312.0,
"r": 490.33,
"b": 328.0,
"coord_origin": "TOPLEFT"
},
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "value",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
},
"annotations": []
} }
], ],
"pictures": [],
"tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [], "form_items": [],
"pages": { "pages": {
"1": { "1": {
"size": { "size": {
"width": 841.92, "width": 612.0,
"height": 595.2 "height": 792.0
}, },
"page_no": 1 "page_no": 1
} }

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,2 @@
<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header> <doctag><otsl><loc_72><loc_152><loc_273><loc_456></otsl>
<text><loc_78><loc_370><loc_90><loc_438>package</text>
</doctag> </doctag>

View File

@ -4,7 +4,7 @@
"name": "ocr_test_rotated_90", "name": "ocr_test_rotated_90",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
"binary_hash": 6989291015361162334, "binary_hash": 6752841177619701916,
"filename": "ocr_test_rotated_90.pdf" "filename": "ocr_test_rotated_90.pdf"
}, },
"furniture": { "furniture": {
@ -18,10 +18,7 @@
"self_ref": "#/body", "self_ref": "#/body",
"children": [ "children": [
{ {
"$ref": "#/texts/0" "$ref": "#/tables/0"
},
{
"$ref": "#/texts/1"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -29,71 +26,52 @@
"label": "unspecified" "label": "unspecified"
}, },
"groups": [], "groups": [],
"texts": [ "texts": [],
"pictures": [],
"tables": [
{ {
"self_ref": "#/texts/0", "self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "page_header",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 77.1,
"t": 506.07,
"r": 126.08,
"b": 71.88,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
86
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
},
{
"self_ref": "#/texts/1",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "text", "label": "table",
"prov": [ "prov": [
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 131.21, "l": 87.65,
"t": 154.19, "t": 551.1,
"r": 152.2, "r": 334.58,
"b": 74.12, "b": 69.04,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
0, 0,
7 0
] ]
} }
], ],
"orig": "package", "captions": [],
"text": "package" "references": [],
"footnotes": [],
"data": {
"table_cells": [],
"num_rows": 0,
"num_cols": 0,
"grid": []
},
"annotations": []
} }
], ],
"pictures": [],
"tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [], "form_items": [],
"pages": { "pages": {
"1": { "1": {
"size": { "size": {
"width": 841.92, "width": 612.0,
"height": 595.2 "height": 792.0
}, },
"page_no": 1 "page_no": 1
} }

View File

@ -1 +0,0 @@
package

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -73,8 +73,8 @@ def test_e2e_conversions():
# only works on mac # only works on mac
if "darwin" == sys.platform: if "darwin" == sys.platform:
engines.append((OcrMacOptions(), True)) engines.append((OcrMacOptions(), False))
engines.append((OcrMacOptions(force_full_page_ocr=True), True)) engines.append((OcrMacOptions(force_full_page_ocr=True), False))
for ocr_options, supports_rotation in engines: for ocr_options, supports_rotation in engines:
print( print(