mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
fix(layout,table): orientation-aware layout and table detection
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
parent
a47fd8372d
commit
8ffa01bc9f
@ -1,8 +1,8 @@
|
||||
import copy
|
||||
import logging
|
||||
import warnings
|
||||
from copy import deepcopy
|
||||
from collections.abc import Iterable
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@ -19,7 +19,7 @@ from docling.models.base_model import BasePageModel
|
||||
from docling.models.utils.hf_model_download import download_hf_model
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
||||
from docling.utils.orientation import detect_orientation
|
||||
from docling.utils.orientation import detect_orientation, rotate_bounding_box
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docling.utils.visualization import draw_clusters
|
||||
|
||||
@ -105,7 +105,6 @@ class LayoutModel(BasePageModel):
|
||||
self,
|
||||
conv_res,
|
||||
page,
|
||||
page_orientation: int,
|
||||
clusters,
|
||||
mode_prefix: str,
|
||||
show: bool = False,
|
||||
@ -119,10 +118,6 @@ class LayoutModel(BasePageModel):
|
||||
page_image = deepcopy(page.image)
|
||||
scale_x = page_image.width / page.size.width
|
||||
scale_y = page_image.height / page.size.height
|
||||
if page_orientation:
|
||||
page_image = page_image.rotate(-page_orientation, expand=True)
|
||||
if abs(page_orientation) in [90, 270]:
|
||||
scale_x, scale_y = scale_y, scale_x
|
||||
# Filter clusters for left and right images
|
||||
exclude_labels = {
|
||||
DocItemLabel.FORM,
|
||||
@ -138,9 +133,6 @@ class LayoutModel(BasePageModel):
|
||||
# Draw clusters on both images
|
||||
draw_clusters(left_image, left_clusters, scale_x, scale_y)
|
||||
draw_clusters(right_image, right_clusters, scale_x, scale_y)
|
||||
if page_orientation:
|
||||
left_image = left_image.rotate(page_orientation, expand=True)
|
||||
right_image = right_image.rotate(page_orientation, expand=True)
|
||||
# Combine the images side by side
|
||||
combined_width = left_image.width * 2
|
||||
combined_height = left_image.height
|
||||
@ -183,11 +175,16 @@ class LayoutModel(BasePageModel):
|
||||
.replace(" ", "_")
|
||||
.replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
bbox = BoundingBox.model_validate(pred_item)
|
||||
if page_orientation:
|
||||
bbox = rotate_bounding_box(
|
||||
bbox, page_orientation, page_image.size
|
||||
).to_bounding_box()
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
label=label,
|
||||
confidence=pred_item["confidence"],
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
bbox=bbox,
|
||||
cells=[],
|
||||
)
|
||||
clusters.append(cluster)
|
||||
@ -196,7 +193,6 @@ class LayoutModel(BasePageModel):
|
||||
self.draw_clusters_and_cells_side_by_side(
|
||||
conv_res,
|
||||
page,
|
||||
page_orientation,
|
||||
clusters,
|
||||
mode_prefix="raw",
|
||||
)
|
||||
@ -234,7 +230,6 @@ class LayoutModel(BasePageModel):
|
||||
self.draw_clusters_and_cells_side_by_side(
|
||||
conv_res,
|
||||
page,
|
||||
page_orientation,
|
||||
processed_clusters,
|
||||
mode_prefix="postprocessed",
|
||||
)
|
||||
|
@ -1,8 +1,7 @@
|
||||
import copy
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Iterable, Optional, Tuple, cast
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
@ -11,6 +10,7 @@ from docling_core.types.doc.page import (
|
||||
TextCellUnit,
|
||||
)
|
||||
from PIL import ImageDraw
|
||||
from PIL.Image import Image
|
||||
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
@ -23,6 +23,7 @@ from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.utils.hf_model_download import download_hf_model
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.orientation import detect_orientation, rotate_bounding_box
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
@ -30,6 +31,8 @@ class TableStructureModel(BasePageModel):
|
||||
_model_repo_folder = "ds4sd--docling-models"
|
||||
_model_path = "model_artifacts/tableformer"
|
||||
|
||||
_table_labels = {DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
@ -186,31 +189,48 @@ class TableStructureModel(BasePageModel):
|
||||
page.predictions.tablestructure = (
|
||||
TableStructurePrediction()
|
||||
) # dummy
|
||||
|
||||
in_tables = [
|
||||
(
|
||||
cluster,
|
||||
[
|
||||
round(cluster.bbox.l) * self.scale,
|
||||
round(cluster.bbox.t) * self.scale,
|
||||
round(cluster.bbox.r) * self.scale,
|
||||
round(cluster.bbox.b) * self.scale,
|
||||
],
|
||||
)
|
||||
cells_orientation = detect_orientation(page.cells)
|
||||
# Keep only table bboxes
|
||||
in_tables_clusters = [
|
||||
cluster
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label
|
||||
in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
|
||||
if cluster.label in self._table_labels
|
||||
]
|
||||
if not len(in_tables):
|
||||
|
||||
if not len(in_tables_clusters):
|
||||
yield page
|
||||
continue
|
||||
|
||||
# Rotate and scale table image
|
||||
page_im = cast(Image, page.get_image())
|
||||
scaled_page_im: Image = cast(
|
||||
Image, page.get_image(scale=self.scale)
|
||||
)
|
||||
if cells_orientation:
|
||||
scaled_page_im = scaled_page_im.rotate(
|
||||
-cells_orientation, expand=True
|
||||
)
|
||||
page_input = {
|
||||
"width": page.size.width * self.scale,
|
||||
"height": page.size.height * self.scale,
|
||||
"image": numpy.asarray(page.get_image(scale=self.scale)),
|
||||
"width": scaled_page_im.size[0],
|
||||
"height": scaled_page_im.size[1],
|
||||
"image": numpy.asarray(scaled_page_im),
|
||||
}
|
||||
|
||||
# Rotate and scale table cells
|
||||
in_tables = [
|
||||
(
|
||||
c,
|
||||
[
|
||||
round(x) * self.scale
|
||||
for x in _rotate_bbox(
|
||||
c.bbox,
|
||||
orientation=-cells_orientation,
|
||||
im_size=page_im.size,
|
||||
)
|
||||
.to_top_left_origin(page_im.size[1])
|
||||
.as_tuple()
|
||||
],
|
||||
)
|
||||
for c in in_tables_clusters
|
||||
]
|
||||
table_clusters, table_bboxes = zip(*in_tables)
|
||||
|
||||
if len(table_bboxes):
|
||||
@ -238,11 +258,16 @@ class TableStructureModel(BasePageModel):
|
||||
scale=self.scale
|
||||
)
|
||||
)
|
||||
new_bbox = _rotate_bbox(
|
||||
new_cell.to_bounding_box(),
|
||||
orientation=-cells_orientation,
|
||||
im_size=scaled_page_im.size,
|
||||
).model_dump()
|
||||
tokens.append(
|
||||
{
|
||||
"id": new_cell.index,
|
||||
"text": new_cell.text,
|
||||
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
||||
"bbox": new_bbox,
|
||||
}
|
||||
)
|
||||
page_input["tokens"] = tokens
|
||||
@ -302,3 +327,11 @@ class TableStructureModel(BasePageModel):
|
||||
)
|
||||
|
||||
yield page
|
||||
|
||||
|
||||
def _rotate_bbox(
    bbox: BoundingBox, *, orientation: int, im_size: Tuple[int, int]
) -> BoundingBox:
    """Rotate *bbox* by *orientation*, or hand it back untouched for no rotation.

    Args:
        bbox: Box to rotate.
        orientation: Angle forwarded to ``rotate_bounding_box``; a falsy value
            (0) means the box is returned as-is.
        im_size: Image size forwarded to ``rotate_bounding_box``
            (presumably ``(width, height)`` -- confirm against that helper).

    Returns:
        The result of ``rotate_bounding_box`` converted with
        ``to_bounding_box()``, or the very same ``bbox`` object when
        ``orientation`` is falsy.
    """
    if not orientation:
        # No rotation requested: skip the rotate/convert round-trip.
        return bbox
    rotated = rotate_bounding_box(bbox, orientation, im_size)
    return rotated.to_bounding_box()
|
||||
|
@ -27,7 +27,6 @@ from docling.utils.ocr_utils import (
|
||||
parse_tesseract_orientation,
|
||||
tesseract_box_to_bounding_rectangle,
|
||||
)
|
||||
from docling.utils.orientation import Box
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -1,4 +1,4 @@
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
@ -43,7 +43,9 @@ def tesseract_box_to_bounding_rectangle(
|
||||
orientation: int,
|
||||
im_size: Tuple[int, int],
|
||||
) -> BoundingRectangle:
|
||||
# box is in the top, left, height, width format, top left coordinates
|
||||
# bbox is in the top, left, height, width format, top left coordinates
|
||||
# We detected the tesseract on the document rotated with minus orientation, we have
|
||||
# to apply an orientation angle
|
||||
rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
|
||||
rect = BoundingRectangle(
|
||||
r_x0=rect.r_x0 / scale,
|
||||
@ -54,7 +56,7 @@ def tesseract_box_to_bounding_rectangle(
|
||||
r_y2=rect.r_y2 / scale,
|
||||
r_x3=rect.r_x3 / scale,
|
||||
r_y3=rect.r_y3 / scale,
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
coord_origin=rect.coord_origin,
|
||||
)
|
||||
if original_offset is not None:
|
||||
if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
|
||||
|
@ -1,13 +1,15 @@
|
||||
from collections import Counter
|
||||
from operator import itemgetter
|
||||
from typing import Tuple
|
||||
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
# Both names appear in SOURCE with the same four right-angle orientations
# (in degrees); both are kept so any reference to either keeps resolving.
_ORIENTATIONS = [0, 90, 180, 270]
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]


def _clipped_orientation(angle: float) -> int:
    """Snap *angle* (degrees) to the nearest entry of ``CLIPPED_ORIENTATIONS``.

    Distance is measured around the circle, so an angle just below a full
    turn (e.g. 359) snaps to 0 instead of 270. When two orientations are
    equally close, the smaller orientation value wins.

    Args:
        angle: Angle in degrees; values outside ``[0, 360)`` are accepted.

    Returns:
        The closest value among ``CLIPPED_ORIENTATIONS``.
    """

    def _circular_distance(orientation: int) -> float:
        delta = abs(angle - orientation) % 360
        # The short way around the circle may be in the other direction.
        return min(delta, 360 - delta)

    # Tuples compare by distance first, then by orientation (tie-break).
    return min((_circular_distance(o), o) for o in CLIPPED_ORIENTATIONS)[1]
|
||||
|
||||
|
||||
def detect_orientation(cells: list[TextCell]) -> int:
|
||||
@ -15,12 +17,6 @@ def detect_orientation(cells: list[TextCell]) -> int:
|
||||
return 0
|
||||
orientation_counter = Counter(_clipped_orientation(c.rect.angle_360) for c in cells)
|
||||
return max(orientation_counter.items(), key=itemgetter(1))[0]
|
||||
from typing import Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
|
||||
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
||||
|
||||
|
||||
def rotate_bounding_box(
|
||||
|
@ -1,3 +1,9 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_12><loc_82><loc_85><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_12><loc_39><loc_67><loc_87></location>
|
||||
<row_0><col_0><body></col_0><col_1><col_header>Column 0</col_1><col_2><col_header>Column 1</col_2><col_3><col_header>Column 2</col_3></row_0>
|
||||
<row_1><col_0><row_header>this is row 0</col_0><col_1><body>some cells</col_1><col_2><body>have content</col_2><col_3><body>and</col_3></row_1>
|
||||
<row_2><col_0><row_header>and row 1</col_0><col_1><body></col_1><col_2><body>other</col_2><col_3><body>have</col_3></row_2>
|
||||
<row_3><col_0><row_header>and last row 2</col_0><col_1><body>nothing</col_1><col_2><body></col_2><col_3><body>inside</col_3></row_3>
|
||||
</table>
|
||||
</document>
|
@ -1 +1,5 @@
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
||||
| | Column 0 | Column 1 | Column 2 |
|
||||
|----------------|------------|--------------|------------|
|
||||
| this is row 0 | some cells | have content | and |
|
||||
| and row 1 | | other | have |
|
||||
| and last row 2 | nothing | | inside |
|
@ -1,3 +0,0 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
|
||||
</document>
|
@ -1 +0,0 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -1 +0,0 @@
|
||||
package
|
@ -1 +0,0 @@
|
||||
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]
|
@ -1,5 +1,9 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph>
|
||||
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_33><loc_13><loc_88><loc_61></location>
|
||||
<row_0><col_0><col_header>inside</col_0><col_1><body></col_1><col_2><col_header>nothing</col_2><col_3><col_header>and last row 2</col_3></row_0>
|
||||
<row_1><col_0><body>have</col_0><col_1><body>other</col_1><col_2><body></col_2><col_3><body>and row 1</col_3></row_1>
|
||||
<row_2><col_0><body>and</col_0><col_1><body>have content</col_1><col_2><body>some cells</col_2><col_3><body>this is row 0</col_3></row_2>
|
||||
<row_3><col_0><body>Column 2</col_0><col_1><body>Column 1</col_1><col_2><body>Column 0</col_2><col_3><body></col_3></row_3>
|
||||
</table>
|
||||
</document>
|
@ -1,5 +1,5 @@
|
||||
package
|
||||
|
||||
JSON and Markdown in an easy self contained
|
||||
|
||||
Docling bundles PDF document conversion to
|
||||
| inside | | nothing | and last row 2 |
|
||||
|----------|--------------|------------|------------------|
|
||||
| have | other | | and row 1 |
|
||||
| and | have content | some cells | this is row 0 |
|
||||
| Column 2 | Column 1 | Column 0 | |
|
@ -1,3 +1,9 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_39><loc_33><loc_87><loc_88></location>
|
||||
<row_0><col_0><body>and last row 2</col_0><col_1><body>and row 1</col_1><col_2><body>this is row 0</col_2><col_3><body></col_3></row_0>
|
||||
<row_1><col_0><body>nothing</col_0><col_1><body></col_1><col_2><body>some cells</col_2><col_3><body>Column 0</col_3></row_1>
|
||||
<row_2><col_0><body></col_0><col_1><body>other</col_1><col_2><body>have content</col_2><col_3><body>Column 1</col_3></row_2>
|
||||
<row_3><col_0><body>inside</col_0><col_1><body>have</col_1><col_2><body>and</col_2><col_3><body>Column 2</col_3></row_3>
|
||||
</table>
|
||||
</document>
|
@ -1 +1,5 @@
|
||||
package
|
||||
| and last row 2 | and row 1 | this is row 0 | |
|
||||
|------------------|-------------|-----------------|----------|
|
||||
| nothing | | some cells | Column 0 |
|
||||
| | other | have content | Column 1 |
|
||||
| inside | have | and | Column 2 |
|
@ -1,4 +1,9 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_9><loc_12><loc_11><loc_85></location>Docling bundles PDF document conversion to</paragraph>
|
||||
<paragraph><location><page_1><loc_12><loc_12><loc_15><loc_85></location><location><page_1><loc_12><loc_12><loc_15><loc_85></location>JSON and Markdown in an easy self contained package</paragraph>
|
||||
<table>
|
||||
<location><page_1><loc_13><loc_12><loc_61><loc_67></location>
|
||||
<row_0><col_0><body>Column 2</col_0><col_1><body>and</col_1><col_2><body>have</col_2><col_3><body>inside</col_3></row_0>
|
||||
<row_1><col_0><body>Column 1</col_0><col_1><body>have content</col_1><col_2><body>other</col_2><col_3><body></col_3></row_1>
|
||||
<row_2><col_0><body>Column 0</col_0><col_1><body>some cells</col_1><col_2><body></col_2><col_3><body>nothing</col_3></row_2>
|
||||
<row_3><col_0><body></col_0><col_1><body>this is row 0</col_1><col_2><body>and row 1</col_2><col_3><body>and last row 2</col_3></row_3>
|
||||
</table>
|
||||
</document>
|
@ -27,53 +27,468 @@
|
||||
"file-info": {
|
||||
"filename": "ocr_test_rotated_90.pdf",
|
||||
"filename-prov": null,
|
||||
"document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6",
|
||||
"document-hash": "2fb20caf4f54c878a0b454b496010d92adc6ae1b7f10fbd9ba1ba26260f818a8",
|
||||
"#-pages": 1,
|
||||
"collection-name": null,
|
||||
"description": null,
|
||||
"page-hashes": [
|
||||
{
|
||||
"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3",
|
||||
"hash": "56c847ad7c5ab9f0346a325510af001ab66a9bb45f65ffc7bbfc60c929def7d2",
|
||||
"model": "default",
|
||||
"page": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"main-text": [
|
||||
{
|
||||
"name": "Table",
|
||||
"type": "table",
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
131.21306574279092,
|
||||
74.12495603322407,
|
||||
152.19606490864376,
|
||||
154.19400205373182
|
||||
75.13359832763672,
|
||||
102.99908447265625,
|
||||
361.18695068359375,
|
||||
562.1403198242188
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
7
|
||||
0
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "package",
|
||||
"type": "paragraph",
|
||||
"text": "",
|
||||
"type": "table",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
"#-cols": 4,
|
||||
"#-rows": 4,
|
||||
"data": [
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
105.0718660651769,
|
||||
304.7354643560275,
|
||||
119.73306194406335,
|
||||
369.59883715876185
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "Column 2",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
172.26899264661517,
|
||||
324.3168597625203,
|
||||
188.15195177751215,
|
||||
352.46511670018316
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "and",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
240.68788382926402,
|
||||
321.869185135892,
|
||||
256.570842960161,
|
||||
356.13662847492196
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "have",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
312.772072637728,
|
||||
319.42151173034614,
|
||||
326.21150018118874,
|
||||
359.8081389276117
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
0,
|
||||
3
|
||||
]
|
||||
],
|
||||
"text": "inside",
|
||||
"type": "body",
|
||||
"col": 3,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
3,
|
||||
4
|
||||
],
|
||||
"row": 0,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
105.0718660651769,
|
||||
419.77616156495424,
|
||||
119.73306194406335,
|
||||
483.4156981046677
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "Column 1",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
172.26898999097682,
|
||||
408.7616301134671,
|
||||
185.70842261785268,
|
||||
495.6540658231026
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "have content",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
240.68788377535307,
|
||||
433.23837164942523,
|
||||
255.34907711253194,
|
||||
468.729651251476
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "other",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 1,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": null,
|
||||
"spans": [
|
||||
[
|
||||
1,
|
||||
3
|
||||
]
|
||||
],
|
||||
"text": "",
|
||||
"type": "body"
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": [
|
||||
105.07186605295925,
|
||||
532.3691850430223,
|
||||
119.73306193184567,
|
||||
597.2325578457567
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "Column 0",
|
||||
"type": "body",
|
||||
"col": 0,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
0,
|
||||
1
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
172.26899069197702,
|
||||
529.9215107729757,
|
||||
186.93018720629036,
|
||||
600.9040699770771
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "some cells",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": null,
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "",
|
||||
"type": "body"
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
311.49999737299976,
|
||||
536.775000315586,
|
||||
332.5000022770002,
|
||||
592.9083316144141
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"text": "nothing",
|
||||
"type": "body",
|
||||
"col": 3,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
3,
|
||||
4
|
||||
],
|
||||
"row": 2,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
2,
|
||||
3
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"bbox": null,
|
||||
"spans": [
|
||||
[
|
||||
3,
|
||||
0
|
||||
]
|
||||
],
|
||||
"text": "",
|
||||
"type": "body"
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
172.2689900422697,
|
||||
638.8430233885732,
|
||||
186.93018846286373,
|
||||
719.6162777831045
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
3,
|
||||
1
|
||||
]
|
||||
],
|
||||
"text": "this is row 0",
|
||||
"type": "body",
|
||||
"col": 1,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
1,
|
||||
2
|
||||
],
|
||||
"row": 3,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
3,
|
||||
4
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
240.68788248006402,
|
||||
647.4098827174411,
|
||||
255.34907835895044,
|
||||
712.2732555201754
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
3,
|
||||
2
|
||||
]
|
||||
],
|
||||
"text": "and row 1",
|
||||
"type": "body",
|
||||
"col": 2,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
2,
|
||||
3
|
||||
],
|
||||
"row": 3,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
3,
|
||||
4
|
||||
]
|
||||
},
|
||||
{
|
||||
"bbox": [
|
||||
313.9938353514431,
|
||||
633.9476737903873,
|
||||
327.43326861374595,
|
||||
725.735464724632
|
||||
],
|
||||
"spans": [
|
||||
[
|
||||
3,
|
||||
3
|
||||
]
|
||||
],
|
||||
"text": "and last row 2",
|
||||
"type": "body",
|
||||
"col": 3,
|
||||
"col-header": false,
|
||||
"col-span": [
|
||||
3,
|
||||
4
|
||||
],
|
||||
"row": 3,
|
||||
"row-header": false,
|
||||
"row-span": [
|
||||
3,
|
||||
4
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"model": null,
|
||||
"bounding-box": null
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [],
|
||||
"bitmaps": null,
|
||||
"equations": [],
|
||||
"footnotes": [],
|
||||
"page-dimensions": [
|
||||
{
|
||||
"height": 595.201171875,
|
||||
"height": 842.0,
|
||||
"page": 1,
|
||||
"width": 841.9216918945312
|
||||
"width": 595.0
|
||||
}
|
||||
],
|
||||
"page-footers": [],
|
||||
|
@ -1,3 +1,5 @@
|
||||
Docling bundles PDF document conversion to
|
||||
|
||||
JSON and Markdown in an easy self contained package
|
||||
| Column 2 | and | have | inside |
|
||||
|------------|---------------|-----------|----------------|
|
||||
| Column 1 | have content | other | |
|
||||
| Column 0 | some cells | | nothing |
|
||||
| | this is row 0 | and row 1 | and last row 2 |
|
@ -1 +1,5 @@
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
||||
| | Column 0 | Column 1 | Column 2 |
|
||||
|----------------|------------|--------------|------------|
|
||||
| this is row 0 | some cells | have content | and |
|
||||
| and row 1 | | other | have |
|
||||
| and last row 2 | nothing | | inside |
|
@ -1,5 +1,5 @@
|
||||
package
|
||||
|
||||
JSON and Markdown in an easy self contained
|
||||
|
||||
Docling bundles PDF document conversion to
|
||||
| inside | | nothing | and last row 2 |
|
||||
|----------|--------------|------------|------------------|
|
||||
| have | other | | and row 1 |
|
||||
| and | have content | some cells | this is row 0 |
|
||||
| Column 2 | Column 1 | Column 0 | |
|
@ -1 +1,5 @@
|
||||
package
|
||||
| and last row 2 | and row 1 | this is row 0 | |
|
||||
|------------------|-------------|-----------------|----------|
|
||||
| nothing | | some cells | Column 0 |
|
||||
| | other | have content | Column 1 |
|
||||
| inside | have | and | Column 2 |
|
@ -1,3 +1,5 @@
|
||||
Docling bundles PDF document conversion to
|
||||
|
||||
JSON and Markdown in an easy self contained package
|
||||
| Column 2 | and | have | inside |
|
||||
|------------|---------------|-----------|----------------|
|
||||
| Column 1 | have content | other | |
|
||||
| Column 0 | some cells | | nothing |
|
||||
| | this is row 0 | and row 1 | and last row 2 |
|
BIN
tests/data_scanned/ocr_test.pdf
vendored
BIN
tests/data_scanned/ocr_test.pdf
vendored
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_180.pdf
vendored
BIN
tests/data_scanned/ocr_test_rotated_180.pdf
vendored
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_270.pdf
vendored
BIN
tests/data_scanned/ocr_test_rotated_270.pdf
vendored
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_90.pdf
vendored
BIN
tests/data_scanned/ocr_test_rotated_90.pdf
vendored
Binary file not shown.
Loading…
Reference in New Issue
Block a user