feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)

* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Small fix

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Correctly compute PDF boxes from pymupdf

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use different OCR engine order

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add type hints and fix mypy

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* One more test fix

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove with pypdfium2_lock from caller sites

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix typing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-06-13 19:01:55 +02:00
committed by GitHub
parent 0432a31b2f
commit 7d3302cb48
50 changed files with 339091 additions and 330047 deletions

View File

@@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [