mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
Merge 7b4a4457e8
into 2b8616d6d5
This commit is contained in:
commit
fb900115ee
@ -2,6 +2,7 @@ import copy
|
||||
import logging
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@ -19,6 +20,7 @@ from docling.models.base_model import BasePageModel
|
||||
from docling.models.utils.hf_model_download import download_hf_model
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
||||
from docling.utils.orientation import detect_orientation, rotate_bounding_box
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docling.utils.visualization import draw_clusters
|
||||
|
||||
@ -157,7 +159,9 @@ class LayoutModel(BasePageModel):
|
||||
assert page.size is not None
|
||||
page_image = page.get_image(scale=1.0)
|
||||
assert page_image is not None
|
||||
|
||||
page_orientation = detect_orientation(page.cells)
|
||||
if page_orientation:
|
||||
page_image = page_image.rotate(-page_orientation, expand=True)
|
||||
clusters = []
|
||||
for ix, pred_item in enumerate(
|
||||
self.layout_predictor.predict(page_image)
|
||||
@ -168,11 +172,16 @@ class LayoutModel(BasePageModel):
|
||||
.replace(" ", "_")
|
||||
.replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
bbox = BoundingBox.model_validate(pred_item)
|
||||
if page_orientation:
|
||||
bbox = rotate_bounding_box(
|
||||
bbox, page_orientation, page_image.size
|
||||
).to_bounding_box()
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
label=label,
|
||||
confidence=pred_item["confidence"],
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
bbox=bbox,
|
||||
cells=[],
|
||||
)
|
||||
clusters.append(cluster)
|
||||
|
@ -107,10 +107,10 @@ class OcrMacModel(BaseOcrModel):
|
||||
x2 = x1 + w * im_width
|
||||
y1 = y2 - h * im_height
|
||||
|
||||
left = x1 / self.scale
|
||||
top = y1 / self.scale
|
||||
right = x2 / self.scale
|
||||
bottom = y2 / self.scale
|
||||
left = x1 / self.scale + ocr_rect.l
|
||||
top = y1 / self.scale + ocr_rect.t
|
||||
right = x2 / self.scale + ocr_rect.l
|
||||
bottom = y2 / self.scale + ocr_rect.t
|
||||
|
||||
cells.append(
|
||||
TextCell(
|
||||
|
@ -1,8 +1,7 @@
|
||||
import copy
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Iterable, Optional, Tuple, cast
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
|
||||
TextCellUnit,
|
||||
)
|
||||
from PIL import ImageDraw
|
||||
from PIL.Image import Image
|
||||
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.utils.hf_model_download import download_hf_model
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.orientation import detect_orientation, rotate_bounding_box
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
|
||||
_model_repo_folder = "ds4sd--docling-models"
|
||||
_model_path = "model_artifacts/tableformer"
|
||||
|
||||
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
|
||||
page.predictions.tablestructure = (
|
||||
TableStructurePrediction()
|
||||
) # dummy
|
||||
|
||||
in_tables = [
|
||||
(
|
||||
cluster,
|
||||
[
|
||||
round(cluster.bbox.l) * self.scale,
|
||||
round(cluster.bbox.t) * self.scale,
|
||||
round(cluster.bbox.r) * self.scale,
|
||||
round(cluster.bbox.b) * self.scale,
|
||||
],
|
||||
)
|
||||
cells_orientation = detect_orientation(page.cells)
|
||||
# Keep only table bboxes
|
||||
in_tables_clusters = [
|
||||
cluster
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label
|
||||
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
|
||||
if cluster.label in self._table_labels
|
||||
]
|
||||
if not len(in_tables):
|
||||
|
||||
if not len(in_tables_clusters):
|
||||
yield page
|
||||
continue
|
||||
|
||||
# Rotate and scale table image
|
||||
page_im = cast(Image, page.get_image())
|
||||
scaled_page_im: Image = cast(
|
||||
Image, page.get_image(scale=self.scale)
|
||||
)
|
||||
if cells_orientation:
|
||||
scaled_page_im = scaled_page_im.rotate(
|
||||
-cells_orientation, expand=True
|
||||
)
|
||||
page_input = {
|
||||
"width": page.size.width * self.scale,
|
||||
"height": page.size.height * self.scale,
|
||||
"image": numpy.asarray(page.get_image(scale=self.scale)),
|
||||
"width": scaled_page_im.size[0],
|
||||
"height": scaled_page_im.size[1],
|
||||
"image": numpy.asarray(scaled_page_im),
|
||||
}
|
||||
|
||||
# Rotate and scale table cells
|
||||
in_tables = [
|
||||
(
|
||||
c,
|
||||
[
|
||||
round(x) * self.scale
|
||||
for x in _rotate_bbox(
|
||||
c.bbox,
|
||||
orientation=-cells_orientation,
|
||||
im_size=page_im.size,
|
||||
)
|
||||
.to_top_left_origin(page_im.size[1])
|
||||
.as_tuple()
|
||||
],
|
||||
)
|
||||
for c in in_tables_clusters
|
||||
]
|
||||
table_clusters, table_bboxes = zip(*in_tables)
|
||||
|
||||
if len(table_bboxes):
|
||||
@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
|
||||
scale=self.scale
|
||||
)
|
||||
)
|
||||
new_bbox = _rotate_bbox(
|
||||
new_cell.to_bounding_box(),
|
||||
orientation=cells_orientation,
|
||||
im_size=scaled_page_im.size,
|
||||
).model_dump()
|
||||
tokens.append(
|
||||
{
|
||||
"id": new_cell.index,
|
||||
"text": new_cell.text,
|
||||
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
||||
"bbox": new_bbox,
|
||||
}
|
||||
)
|
||||
page_input["tokens"] = tokens
|
||||
@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
|
||||
)
|
||||
|
||||
yield page
|
||||
|
||||
|
||||
def _rotate_bbox(
    bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
) -> BoundingBox:
    """Return ``bbox`` rotated by ``orientation``, or unchanged when it is 0.

    Args:
        bbox: The bounding box to rotate.
        orientation: Rotation angle forwarded to ``rotate_bounding_box``;
            any falsy value (0) means "no rotation".
        im_size: Image size forwarded unchanged to ``rotate_bounding_box``.

    Returns:
        The input ``bbox`` itself when no rotation is requested, otherwise the
        result of ``rotate_bounding_box`` converted back to a ``BoundingBox``.
    """
    # Guard clause: skip the rotation helper entirely for upright content.
    if not orientation:
        return bbox
    rotated_rect = rotate_bounding_box(bbox, orientation, im_size)
    return rotated_rect.to_bounding_box()
|
||||
|
@ -3,7 +3,10 @@ from typing import Optional, Tuple
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
|
||||
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
||||
from docling.utils.orientation import (
|
||||
CLIPPED_ORIENTATIONS,
|
||||
rotate_bounding_box,
|
||||
)
|
||||
|
||||
|
||||
def map_tesseract_script(script: str) -> str:
|
||||
@ -40,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
|
||||
orientation: int,
|
||||
im_size: Tuple[int, int],
|
||||
) -> BoundingRectangle:
|
||||
# box is in the top, left, height, width format, top left coordinates
|
||||
# bbox is in the top, left, height, width format, top left coordinates
|
||||
# Tesseract ran on the image rotated by minus the orientation, so the detected box
|
||||
# has to be rotated back by the orientation angle
|
||||
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
|
||||
rect = BoundingRectangle(
|
||||
r_x0=rect.r_x0 / scale,
|
||||
@ -51,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
|
||||
r_y2=rect.r_y2 / scale,
|
||||
r_x3=rect.r_x3 / scale,
|
||||
r_y3=rect.r_y3 / scale,
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
coord_origin=rect.coord_origin,
|
||||
)
|
||||
if original_offset is not None:
|
||||
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
|
||||
|
@ -1,11 +1,24 @@
|
||||
from collections import Counter
|
||||
from operator import itemgetter
|
||||
from typing import Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
||||
|
||||
|
||||
def _clipped_orientation(angle: float) -> int:
    """Snap ``angle`` (in degrees) to the nearest entry of ``CLIPPED_ORIENTATIONS``.

    The distance is measured around the circle, so an angle just below a full
    turn (e.g. 359) snaps to 0 rather than to 270. A plain
    ``abs(angle - o) % 360`` would rate 359 as 359 degrees away from 0.

    Args:
        angle: Angle in degrees. Values outside ``[0, 360)`` are accepted
            because the difference is reduced modulo 360.

    Returns:
        The closest clipped orientation. On an exact tie the smaller
        orientation wins, because candidates are compared as
        ``(distance, orientation)`` tuples.
    """

    def circular_distance(orientation: int) -> float:
        # Shortest way around the circle between the two directions.
        delta = abs(angle - orientation) % 360
        return min(delta, 360 - delta)

    return min((circular_distance(o), o) for o in CLIPPED_ORIENTATIONS)[1]
|
||||
|
||||
|
||||
def detect_orientation(cells: list[TextCell]) -> int:
    """Return the most frequent clipped orientation among ``cells``.

    Each cell's ``rect.angle_360`` is mapped through ``_clipped_orientation``
    and the orientation with the highest count is returned. When several
    orientations share the highest count, the one encountered first wins.

    Args:
        cells: Text cells whose ``rect.angle_360`` is inspected.

    Returns:
        The dominant orientation, or 0 when ``cells`` is empty.
    """
    if not cells:
        return 0

    # Tally one vote per cell; Counter keeps first-seen order for tie-breaking.
    votes: Counter = Counter()
    for cell in cells:
        votes[_clipped_orientation(cell.rect.angle_360)] += 1

    dominant, _count = max(votes.items(), key=itemgetter(1))
    return dominant
|
||||
|
||||
|
||||
def rotate_bounding_box(
|
||||
bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
|
||||
) -> BoundingRectangle:
|
||||
|
@ -213,10 +213,10 @@
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
139.66741943359375,
|
||||
139.66746520996094,
|
||||
322.5054626464844,
|
||||
475.00927734375,
|
||||
454.45458984375
|
||||
475.0093078613281,
|
||||
454.4546203613281
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
|
@ -2705,7 +2705,7 @@
|
||||
"b": 102.78223000000003,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9373534917831421,
|
||||
"confidence": 0.9373531937599182,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
@ -2745,7 +2745,7 @@
|
||||
"b": 102.78223000000003,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.8858680725097656,
|
||||
"confidence": 0.8858677744865417,
|
||||
"cells": [
|
||||
{
|
||||
"index": 1,
|
||||
@ -2785,7 +2785,7 @@
|
||||
"b": 152.90697999999998,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9806433916091919,
|
||||
"confidence": 0.9806435108184814,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
@ -3155,7 +3155,7 @@
|
||||
"b": 327.98218,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9591909050941467,
|
||||
"confidence": 0.9591910243034363,
|
||||
"cells": [
|
||||
{
|
||||
"index": 15,
|
||||
@ -3339,9 +3339,9 @@
|
||||
"id": 0,
|
||||
"label": "table",
|
||||
"bbox": {
|
||||
"l": 139.66741943359375,
|
||||
"t": 337.54541015625,
|
||||
"r": 475.00927734375,
|
||||
"l": 139.66746520996094,
|
||||
"t": 337.5453796386719,
|
||||
"r": 475.0093078613281,
|
||||
"b": 469.4945373535156,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
@ -7846,7 +7846,7 @@
|
||||
"b": 518.17419,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9589294195175171,
|
||||
"confidence": 0.9589295387268066,
|
||||
"cells": [
|
||||
{
|
||||
"index": 91,
|
||||
@ -8243,9 +8243,9 @@
|
||||
"id": 0,
|
||||
"label": "table",
|
||||
"bbox": {
|
||||
"l": 139.66741943359375,
|
||||
"t": 337.54541015625,
|
||||
"r": 475.00927734375,
|
||||
"l": 139.66746520996094,
|
||||
"t": 337.5453796386719,
|
||||
"r": 475.0093078613281,
|
||||
"b": 469.4945373535156,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
@ -13641,7 +13641,7 @@
|
||||
"b": 102.78223000000003,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9373534917831421,
|
||||
"confidence": 0.9373531937599182,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
@ -13687,7 +13687,7 @@
|
||||
"b": 102.78223000000003,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.8858680725097656,
|
||||
"confidence": 0.8858677744865417,
|
||||
"cells": [
|
||||
{
|
||||
"index": 1,
|
||||
@ -13733,7 +13733,7 @@
|
||||
"b": 152.90697999999998,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9806433916091919,
|
||||
"confidence": 0.9806435108184814,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
@ -14121,7 +14121,7 @@
|
||||
"b": 327.98218,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9591909050941467,
|
||||
"confidence": 0.9591910243034363,
|
||||
"cells": [
|
||||
{
|
||||
"index": 15,
|
||||
@ -14311,9 +14311,9 @@
|
||||
"id": 0,
|
||||
"label": "table",
|
||||
"bbox": {
|
||||
"l": 139.66741943359375,
|
||||
"t": 337.54541015625,
|
||||
"r": 475.00927734375,
|
||||
"l": 139.66746520996094,
|
||||
"t": 337.5453796386719,
|
||||
"r": 475.0093078613281,
|
||||
"b": 469.4945373535156,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
@ -19701,7 +19701,7 @@
|
||||
"b": 518.17419,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9589294195175171,
|
||||
"confidence": 0.9589295387268066,
|
||||
"cells": [
|
||||
{
|
||||
"index": 91,
|
||||
@ -20116,7 +20116,7 @@
|
||||
"b": 152.90697999999998,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9806433916091919,
|
||||
"confidence": 0.9806435108184814,
|
||||
"cells": [
|
||||
{
|
||||
"index": 2,
|
||||
@ -20504,7 +20504,7 @@
|
||||
"b": 327.98218,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9591909050941467,
|
||||
"confidence": 0.9591910243034363,
|
||||
"cells": [
|
||||
{
|
||||
"index": 15,
|
||||
@ -20694,9 +20694,9 @@
|
||||
"id": 0,
|
||||
"label": "table",
|
||||
"bbox": {
|
||||
"l": 139.66741943359375,
|
||||
"t": 337.54541015625,
|
||||
"r": 475.00927734375,
|
||||
"l": 139.66746520996094,
|
||||
"t": 337.5453796386719,
|
||||
"r": 475.0093078613281,
|
||||
"b": 469.4945373535156,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
@ -26084,7 +26084,7 @@
|
||||
"b": 518.17419,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9589294195175171,
|
||||
"confidence": 0.9589295387268066,
|
||||
"cells": [
|
||||
{
|
||||
"index": 91,
|
||||
@ -26499,7 +26499,7 @@
|
||||
"b": 102.78223000000003,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.9373534917831421,
|
||||
"confidence": 0.9373531937599182,
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
@ -26545,7 +26545,7 @@
|
||||
"b": 102.78223000000003,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"confidence": 0.8858680725097656,
|
||||
"confidence": 0.8858677744865417,
|
||||
"cells": [
|
||||
{
|
||||
"index": 1,
|
||||
|
@ -1,2 +1,2 @@
|
||||
<doctag><text><loc_59><loc_46><loc_424><loc_90>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
</doctag>
|
@ -1,3 +1,8 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_9><loc_45><loc_70><loc_86></location>
|
||||
<row_0><col_0><col_header>Vertically merged</col_0><col_1><col_header>Other merged column</col_1><col_2><col_header>Yet another column</col_2></row_0>
|
||||
<row_1><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_1>
|
||||
<row_2><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_2>
|
||||
</table>
|
||||
</document>
|
@ -27,53 +27,321 @@
|
||||
"file-info": {
|
||||
"filename": "ocr_test.pdf",
|
||||
"filename-prov": null,
|
||||
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
|
||||
"document-hash": "0f391d12850f72bb91897f7f3bebfd4a0a8357e2a883ac1f664e32342c04e418",
|
||||
"#-pages": 1,
|
||||
"collection-name": null,
|
||||
"description": null,
|
||||
"page-hashes": [
|
||||
{
|
||||
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
|
||||
"hash": "32f328168da3f69890a725c1168799f9ff7337249e98b1f36c12965551477be5",
|
||||
"model": "default",
|
||||
"page": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"main-text": [
|
||||
{
|
||||
"name": "Table",
|
||||
"type": "table",
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
69.6796630536824,
|
||||
689.0124221922704,
|
||||
504.8720051760782,
|
||||
764.9216921155637
|
||||
69.04969024658203,
|
||||
277.41973876953125,
|
||||
551.0990600585938,
|
||||
524.3504486083984
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
94
|
||||
0
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||
"type": "paragraph",
|
||||
"text": "",
|
||||
"type": "table",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
"#-cols": 3,
|
||||
"#-rows": 3,
|
||||
"data": [
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
97.33333333333333,
|
||||
105.66666666666666,
|
||||
190.0,
|
||||
126.33333333333334
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "Vertically merged",
|
||||
"type": "col_header",
|
||||
"col": 0,
|
||||
"col-header": true,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
232.66666666666666,
|
||||
105.66666666666666,
|
||||
364.0,
|
||||
126.33333333333334
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Other merged column",
|
||||
"type": "col_header",
|
||||
"col": 1,
|
||||
"col-header": true,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
406.3333333333333,
|
||||
105.66666666666666,
|
||||
518.3333333333333,
|
||||
121.66666666666666
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "Yet another column",
|
||||
"type": "col_header",
|
||||
"col": 2,
|
||||
"col-header": true,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
121.66666666666667,
|
||||
204.33333333333334,
|
||||
168.66666666666666,
|
||||
220.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "value",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
247.0,
|
||||
188.33333333333331,
|
||||
349.6666666666667,
|
||||
204.33333333333334
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Some other value",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
408.3333333333333,
|
||||
188.33333333333331,
|
||||
514.0,
|
||||
204.33333333333334
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "Yet another value",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
121.66666666666667,
|
||||
284.0,
|
||||
168.66666666666666,
|
||||
300.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "value",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
247.0,
|
||||
268.0,
|
||||
349.6666666666667,
|
||||
284.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Some other value",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
408.3333333333333,
|
||||
268.0,
|
||||
514.0,
|
||||
284.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "Yet another value",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"model": null,
|
||||
"bounding-box": null
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [],
|
||||
"bitmaps": null,
|
||||
"equations": [],
|
||||
"footnotes": [],
|
||||
"page-dimensions": [
|
||||
{
|
||||
"height": 841.9216918945312,
|
||||
"height": 612.0,
|
||||
"page": 1,
|
||||
"width": 595.201171875
|
||||
"width": 792.0
|
||||
}
|
||||
],
|
||||
"page-footers": [],
|
||||
|
@ -1 +1,4 @@
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
||||
| Vertically merged | Other merged column | Yet another column |
|
||||
|---------------------|-----------------------|----------------------|
|
||||
| value | Some other value | Yet another value |
|
||||
| value | Some other value | Yet another value |
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +0,0 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
|
||||
</document>
|
@ -1 +0,0 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -1 +0,0 @@
|
||||
package
|
@ -1 +0,0 @@
|
||||
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]
|
@ -1,4 +1,8 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_30><loc_14><loc_91><loc_55></location>
|
||||
<row_0><col_0><col_header>Vertically merged</col_0><col_1><col_header>Other merged column</col_1><col_2><col_header>Yet another column</col_2></row_0>
|
||||
<row_1><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_1>
|
||||
<row_2><col_0><body>value</col_0><col_1><body>Some other value</col_1><col_2><body>Yet another value</col_2></row_2>
|
||||
</table>
|
||||
</document>
|
@ -27,13 +27,13 @@
|
||||
"file-info": {
|
||||
"filename": "ocr_test_rotated_180.pdf",
|
||||
"filename-prov": null,
|
||||
"document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982",
|
||||
"document-hash": "361fa0fc8db9c3a973d316d08509ac78cc0e7f81dea94358319092640d439ca0",
|
||||
"#-pages": 1,
|
||||
"collection-name": null,
|
||||
"description": null,
|
||||
"page-hashes": [
|
||||
{
|
||||
"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49",
|
||||
"hash": "ab89ee70d4aee0b8dc5ed72ad42e16e98a8ec9c2eea1e03d99b50c25bbc5a806",
|
||||
"model": "default",
|
||||
"page": 1
|
||||
}
|
||||
@ -41,62 +41,307 @@
|
||||
},
|
||||
"main-text": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
441.2561096985719,
|
||||
131.89488404865142,
|
||||
522.0347860494834,
|
||||
151.87873262042876
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
7
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "package",
|
||||
"type": "paragraph",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
},
|
||||
"name": "Table",
|
||||
"type": "table",
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
89.23887497045128,
|
||||
77.02339852098021,
|
||||
523.208764293368,
|
||||
124.75312428291147
|
||||
240.90093994140625,
|
||||
87.64955139160156,
|
||||
722.950309753418,
|
||||
334.58026123046875
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
86
|
||||
0
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"type": "paragraph",
|
||||
"text": "",
|
||||
"type": "table",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
"#-cols": 3,
|
||||
"#-rows": 3,
|
||||
"data": [
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
97.33333333333337,
|
||||
105.66666666666669,
|
||||
190.0,
|
||||
126.33333333333337
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "Vertically merged",
|
||||
"type": "col_header",
|
||||
"col": 0,
|
||||
"col-header": true,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
232.33333333333326,
|
||||
105.66666666666669,
|
||||
363.6666666666667,
|
||||
126.33333333333337
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Other merged column",
|
||||
"type": "col_header",
|
||||
"col": 1,
|
||||
"col-header": true,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
406.3333333333333,
|
||||
105.66666666666669,
|
||||
518.0,
|
||||
121.66666666666663
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "Yet another column",
|
||||
"type": "col_header",
|
||||
"col": 2,
|
||||
"col-header": true,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
121.66666666666663,
|
||||
204.0,
|
||||
168.66666666666663,
|
||||
220.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "value",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
247.0,
|
||||
188.0,
|
||||
349.6666666666667,
|
||||
204.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Some other value",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
408.3333333333333,
|
||||
188.0,
|
||||
514.0,
|
||||
204.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "Yet another value",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
121.66666666666663,
|
||||
284.0,
|
||||
168.66666666666663,
|
||||
300.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "value",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
247.0,
|
||||
268.0,
|
||||
349.6666666666667,
|
||||
284.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Some other value",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
408.3333333333333,
|
||||
268.0,
|
||||
514.0,
|
||||
284.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "Yet another value",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"model": null,
|
||||
"bounding-box": null
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [],
|
||||
"bitmaps": null,
|
||||
"equations": [],
|
||||
"footnotes": [],
|
||||
"page-dimensions": [
|
||||
{
|
||||
"height": 841.9216918945312,
|
||||
"height": 612.0,
|
||||
"page": 1,
|
||||
"width": 595.201171875
|
||||
"width": 792.0
|
||||
}
|
||||
],
|
||||
"page-footers": [],
|
||||
|
@ -1,3 +1,4 @@
|
||||
package
|
||||
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
||||
| Vertically merged | Other merged column | Yet another column |
|
||||
|---------------------|-----------------------|----------------------|
|
||||
| value | Some other value | Yet another value |
|
||||
| value | Some other value | Yet another value |
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,6 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_45><loc_30><loc_86><loc_91></location>
|
||||
<row_0><col_0><body>Yet another value</col_0><col_1><body>Some other value</col_1><col_2><body>value</col_2></row_0>
|
||||
</table>
|
||||
</document>
|
@ -27,53 +27,149 @@
|
||||
"file-info": {
|
||||
"filename": "ocr_test_rotated_270.pdf",
|
||||
"filename-prov": null,
|
||||
"document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14",
|
||||
"document-hash": "753140dc9b8c39b67c6f6712e2a1de4c364c808ca09d13dd05b79c23192429dc",
|
||||
"#-pages": 1,
|
||||
"collection-name": null,
|
||||
"description": null,
|
||||
"page-hashes": [
|
||||
{
|
||||
"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304",
|
||||
"hash": "c8fa256d58940f76c5e0ec6b65548a2e939f867c2c75d0ee27f5f70ff32a44be",
|
||||
"model": "default",
|
||||
"page": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"main-text": [
|
||||
{
|
||||
"name": "Table",
|
||||
"type": "table",
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
690.2441821046808,
|
||||
442.39487414368364,
|
||||
709.8255852011977,
|
||||
523.076601235155
|
||||
277.4178771972656,
|
||||
240.90216064453125,
|
||||
524.3541717529297,
|
||||
722.9614028930664
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
7
|
||||
0
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "package",
|
||||
"type": "paragraph",
|
||||
"text": "",
|
||||
"type": "table",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
"#-cols": 3,
|
||||
"#-rows": 1,
|
||||
"data": [
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
98.0,
|
||||
296.6666666666667,
|
||||
203.66666666666669,
|
||||
344.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "Yet another value",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
262.3333333333333,
|
||||
296.6666666666667,
|
||||
365.0,
|
||||
344.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "Some other value",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
443.33333333333337,
|
||||
312.0,
|
||||
490.33333333333337,
|
||||
328.0
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "value",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"model": null,
|
||||
"bounding-box": null
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [],
|
||||
"bitmaps": null,
|
||||
"equations": [],
|
||||
"footnotes": [],
|
||||
"page-dimensions": [
|
||||
{
|
||||
"height": 595.201171875,
|
||||
"height": 792.0,
|
||||
"page": 1,
|
||||
"width": 841.9216918945312
|
||||
"width": 612.0
|
||||
}
|
||||
],
|
||||
"page-footers": [],
|
||||
|
@ -1 +0,0 @@
|
||||
package
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,5 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_14><loc_9><loc_55><loc_70></location>
|
||||
</table>
|
||||
</document>
|
@ -27,53 +27,62 @@
|
||||
"file-info": {
|
||||
"filename": "ocr_test_rotated_90.pdf",
|
||||
"filename-prov": null,
|
||||
"document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6",
|
||||
"document-hash": "418ae4425f514f002bd4223ea3003c17f319cbeafd67801732d58f2bedb3bd91",
|
||||
"#-pages": 1,
|
||||
"collection-name": null,
|
||||
"description": null,
|
||||
"page-hashes": [
|
||||
{
|
||||
"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3",
|
||||
"hash": "36315c08dc861ecde4be6179d2f155da0519b93e0311c290f8db164f593d36d8",
|
||||
"model": "default",
|
||||
"page": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"main-text": [
|
||||
{
|
||||
"name": "Table",
|
||||
"type": "table",
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
131.21306574279092,
|
||||
74.12495603322407,
|
||||
152.19606490864376,
|
||||
154.19400205373182
|
||||
87.64582824707031,
|
||||
69.0385971069336,
|
||||
334.5821228027344,
|
||||
551.0978393554688
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
7
|
||||
0
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "package",
|
||||
"type": "paragraph",
|
||||
"text": "",
|
||||
"type": "table",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
"#-cols": 0,
|
||||
"#-rows": 0,
|
||||
"data": [],
|
||||
"model": null,
|
||||
"bounding-box": null
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [],
|
||||
"bitmaps": null,
|
||||
"equations": [],
|
||||
"footnotes": [],
|
||||
"page-dimensions": [
|
||||
{
|
||||
"height": 595.201171875,
|
||||
"height": 792.0,
|
||||
"page": 1,
|
||||
"width": 841.9216918945312
|
||||
"width": 612.0
|
||||
}
|
||||
],
|
||||
"page-footers": [],
|
||||
|
@ -1 +0,0 @@
|
||||
package
|
File diff suppressed because it is too large
Load Diff
@ -1,2 +1,2 @@
|
||||
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
<doctag><otsl><loc_44><loc_72><loc_348><loc_273><ched>Vertically merged<ched>Other merged column<ched>Yet another column<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl></otsl>
|
||||
</doctag>
|
@ -4,7 +4,7 @@
|
||||
"name": "ocr_test",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 14853448746796404529,
|
||||
"binary_hash": 14846044078209721391,
|
||||
"filename": "ocr_test.pdf"
|
||||
},
|
||||
"furniture": {
|
||||
@ -18,7 +18,7 @@
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -26,44 +26,402 @@
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
"texts": [],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"label": "table",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 69.68,
|
||||
"t": 764.92,
|
||||
"r": 504.87,
|
||||
"b": 689.01,
|
||||
"l": 69.05,
|
||||
"t": 524.35,
|
||||
"r": 551.1,
|
||||
"b": 277.42,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
94
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"bbox": {
|
||||
"l": 97.33,
|
||||
"t": 105.67,
|
||||
"r": 190.0,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Vertically merged",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 204.33,
|
||||
"r": 168.67,
|
||||
"b": 220.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 284.0,
|
||||
"r": 168.67,
|
||||
"b": 300.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 232.67,
|
||||
"t": 105.67,
|
||||
"r": 364.0,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Other merged column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 188.33,
|
||||
"r": 349.67,
|
||||
"b": 204.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 268.0,
|
||||
"r": 349.67,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 406.33,
|
||||
"t": 105.67,
|
||||
"r": 518.33,
|
||||
"b": 121.67,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 188.33,
|
||||
"r": 514.0,
|
||||
"b": 204.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 268.0,
|
||||
"r": 514.0,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
"num_rows": 3,
|
||||
"num_cols": 3,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 97.33,
|
||||
"t": 105.67,
|
||||
"r": 190.0,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Vertically merged",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 232.67,
|
||||
"t": 105.67,
|
||||
"r": 364.0,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Other merged column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 406.33,
|
||||
"t": 105.67,
|
||||
"r": 518.33,
|
||||
"b": 121.67,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 204.33,
|
||||
"r": 168.67,
|
||||
"b": 220.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 188.33,
|
||||
"r": 349.67,
|
||||
"b": 204.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 188.33,
|
||||
"r": 514.0,
|
||||
"b": 204.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 284.0,
|
||||
"r": 168.67,
|
||||
"b": 300.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 268.0,
|
||||
"r": 349.67,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 268.0,
|
||||
"r": 514.0,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 595.2,
|
||||
"height": 841.92
|
||||
"width": 792.0,
|
||||
"height": 612.0
|
||||
},
|
||||
"page_no": 1
|
||||
}
|
||||
|
@ -1 +1,4 @@
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
||||
| Vertically merged | Other merged column | Yet another column |
|
||||
|---------------------|-----------------------|----------------------|
|
||||
| value | Some other value | Yet another value |
|
||||
| value | Some other value | Yet another value |
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,2 @@
|
||||
<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
|
||||
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
||||
<doctag><otsl><loc_152><loc_227><loc_456><loc_428><ched>Vertically merged<ched>Other merged column<ched>Yet another column<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl><fcel>value<fcel>Some other value<fcel>Yet another value<nl></otsl>
|
||||
</doctag>
|
@ -4,7 +4,7 @@
|
||||
"name": "ocr_test_rotated_180",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 2530576989861832966,
|
||||
"binary_hash": 16151733167151414937,
|
||||
"filename": "ocr_test_rotated_180.pdf"
|
||||
},
|
||||
"furniture": {
|
||||
@ -18,10 +18,7 @@
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -29,71 +26,402 @@
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
"texts": [],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"label": "table",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 441.26,
|
||||
"t": 151.88,
|
||||
"r": 522.03,
|
||||
"b": 131.89,
|
||||
"l": 240.9,
|
||||
"t": 334.58,
|
||||
"r": 722.95,
|
||||
"b": 87.65,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "package",
|
||||
"text": "package"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"bbox": {
|
||||
"l": 97.33,
|
||||
"t": 105.67,
|
||||
"r": 190.0,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Vertically merged",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 204.0,
|
||||
"r": 168.67,
|
||||
"b": 220.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 284.0,
|
||||
"r": 168.67,
|
||||
"b": 300.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 232.33,
|
||||
"t": 105.67,
|
||||
"r": 363.67,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Other merged column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 188.0,
|
||||
"r": 349.67,
|
||||
"b": 204.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 268.0,
|
||||
"r": 349.67,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 406.33,
|
||||
"t": 105.67,
|
||||
"r": 518.0,
|
||||
"b": 121.67,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 188.0,
|
||||
"r": 514.0,
|
||||
"b": 204.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 268.0,
|
||||
"r": 514.0,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
"num_rows": 3,
|
||||
"num_cols": 3,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 97.33,
|
||||
"t": 105.67,
|
||||
"r": 190.0,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Vertically merged",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 232.33,
|
||||
"t": 105.67,
|
||||
"r": 363.67,
|
||||
"b": 126.33,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Other merged column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 406.33,
|
||||
"t": 105.67,
|
||||
"r": 518.0,
|
||||
"b": 121.67,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another column",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 204.0,
|
||||
"r": 168.67,
|
||||
"b": 220.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 188.0,
|
||||
"r": 349.67,
|
||||
"b": 204.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 188.0,
|
||||
"r": 514.0,
|
||||
"b": 204.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 121.67,
|
||||
"t": 284.0,
|
||||
"r": 168.67,
|
||||
"b": 300.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 247.0,
|
||||
"t": 268.0,
|
||||
"r": 349.67,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 408.33,
|
||||
"t": 268.0,
|
||||
"r": 514.0,
|
||||
"b": 284.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 89.24,
|
||||
"t": 124.75,
|
||||
"r": 523.21,
|
||||
"b": 77.02,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
86
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 595.2,
|
||||
"height": 841.92
|
||||
"width": 792.0,
|
||||
"height": 612.0
|
||||
},
|
||||
"page_no": 1
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
package
|
||||
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
||||
| Vertically merged | Other merged column | Yet another column |
|
||||
|---------------------|-----------------------|----------------------|
|
||||
| value | Some other value | Yet another value |
|
||||
| value | Some other value | Yet another value |
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,2 @@
|
||||
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_410><loc_61><loc_422><loc_128>package</text>
|
||||
<doctag><otsl><loc_227><loc_44><loc_428><loc_348><fcel>Yet another value<fcel>Some other value<fcel>value<nl></otsl>
|
||||
</doctag>
|
@ -4,7 +4,7 @@
|
||||
"name": "ocr_test_rotated_270",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 10890858393843077593,
|
||||
"binary_hash": 8365439800722100027,
|
||||
"filename": "ocr_test_rotated_270.pdf"
|
||||
},
|
||||
"furniture": {
|
||||
@ -18,10 +18,7 @@
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -29,71 +26,170 @@
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
"texts": [],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "page_header",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 717.17,
|
||||
"t": 524.3,
|
||||
"r": 764.9,
|
||||
"b": 90.33,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
86
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"label": "table",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 690.24,
|
||||
"t": 523.08,
|
||||
"r": 709.83,
|
||||
"b": 442.39,
|
||||
"l": 277.42,
|
||||
"t": 722.96,
|
||||
"r": 524.35,
|
||||
"b": 240.9,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "package",
|
||||
"text": "package"
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"bbox": {
|
||||
"l": 443.33,
|
||||
"t": 312.0,
|
||||
"r": 490.33,
|
||||
"b": 328.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 262.33,
|
||||
"t": 296.67,
|
||||
"r": 365.0,
|
||||
"b": 344.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 98.0,
|
||||
"t": 296.67,
|
||||
"r": 203.67,
|
||||
"b": 344.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
],
|
||||
"num_rows": 1,
|
||||
"num_cols": 3,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"bbox": {
|
||||
"l": 98.0,
|
||||
"t": 296.67,
|
||||
"r": 203.67,
|
||||
"b": 344.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Yet another value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 262.33,
|
||||
"t": 296.67,
|
||||
"r": 365.0,
|
||||
"b": 344.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Some other value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
},
|
||||
{
|
||||
"bbox": {
|
||||
"l": 443.33,
|
||||
"t": 312.0,
|
||||
"r": 490.33,
|
||||
"b": 328.0,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 2,
|
||||
"end_col_offset_idx": 3,
|
||||
"text": "value",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 841.92,
|
||||
"height": 595.2
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"page_no": 1
|
||||
}
|
||||
|
@ -1 +0,0 @@
|
||||
package
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,2 @@
|
||||
<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||
<text><loc_78><loc_370><loc_90><loc_438>package</text>
|
||||
<doctag><otsl><loc_72><loc_152><loc_273><loc_456></otsl>
|
||||
</doctag>
|
@ -4,7 +4,7 @@
|
||||
"name": "ocr_test_rotated_90",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 6989291015361162334,
|
||||
"binary_hash": 6752841177619701916,
|
||||
"filename": "ocr_test_rotated_90.pdf"
|
||||
},
|
||||
"furniture": {
|
||||
@ -18,10 +18,7 @@
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -29,71 +26,52 @@
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
"texts": [],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "page_header",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 77.1,
|
||||
"t": 506.07,
|
||||
"r": 126.08,
|
||||
"b": 71.88,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
86
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"label": "table",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 131.21,
|
||||
"t": 154.19,
|
||||
"r": 152.2,
|
||||
"b": 74.12,
|
||||
"l": 87.65,
|
||||
"t": 551.1,
|
||||
"r": 334.58,
|
||||
"b": 69.04,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
7
|
||||
0
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "package",
|
||||
"text": "package"
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [],
|
||||
"num_rows": 0,
|
||||
"num_cols": 0,
|
||||
"grid": []
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 841.92,
|
||||
"height": 595.2
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"page_no": 1
|
||||
}
|
||||
|
@ -1 +0,0 @@
|
||||
package
|
File diff suppressed because it is too large
Load Diff
BIN
tests/data_scanned/ocr_test.pdf
vendored
BIN
tests/data_scanned/ocr_test.pdf
vendored
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_180.pdf
vendored
BIN
tests/data_scanned/ocr_test_rotated_180.pdf
vendored
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_270.pdf
vendored
BIN
tests/data_scanned/ocr_test_rotated_270.pdf
vendored
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_90.pdf
vendored
BIN
tests/data_scanned/ocr_test_rotated_90.pdf
vendored
Binary file not shown.
@ -73,8 +73,8 @@ def test_e2e_conversions():
|
||||
|
||||
# only works on mac
|
||||
if "darwin" == sys.platform:
|
||||
engines.append((OcrMacOptions(), True))
|
||||
engines.append((OcrMacOptions(force_full_page_ocr=True), True))
|
||||
engines.append((OcrMacOptions(), False))
|
||||
engines.append((OcrMacOptions(force_full_page_ocr=True), False))
|
||||
|
||||
for ocr_options, supports_rotation in engines:
|
||||
print(
|
||||
|
Loading…
Reference in New Issue
Block a user