feat: Add content_layer property to items to address body, furniture and other roles (#735)

* feat: Pass predicted page-headers and page-footers through to DoclingDocument furniture

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: Update all test GT

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update all test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update all test cases again

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock to final docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-02-10 12:07:49 +01:00
committed by GitHub
parent 3e26597995
commit cf78d5b7b9
43 changed files with 2082 additions and 198 deletions

View File

@@ -1 +1 @@
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]

View File

@@ -1 +1 @@
{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.6796646118164, "t": 764.9216918945312, "r": 504.87200927734375, "b": 689.012451171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.6796646118164, "t": 764.9216918945312, "r": 504.87200927734375, "b": 689.012451171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@@ -1 +1 @@
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.99999977896756, "r": 504.8720051760782, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896756, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573796, "r": 504.8720051760782, "b": 124.83139494707741, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.797125232046, "r": 153.088934155825, "b": 152.90926970226084, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]