Update to final version

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-20 11:42:35 +02:00
parent 48ee8a1291
commit 926e32037d
17 changed files with 1673 additions and 1604 deletions

View File

@ -12,7 +12,7 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling_core.types.doc.document import ContentLayer from docling_core.types.doc.document import ContentLayer
from docling_ibm_models.listitem_normalizer.list_marker_processor import ( from docling_ibm_models.list_item_normalizer.list_marker_processor import (
ListItemMarkerProcessor, ListItemMarkerProcessor,
) )
from docling_ibm_models.reading_order.reading_order_rb import ( from docling_ibm_models.reading_order.reading_order_rb import (
@ -97,7 +97,7 @@ class ReadingOrderModel:
if c_label == DocItemLabel.LIST_ITEM: if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item # TODO: Infer if this is a numbered or a bullet list item
l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov) l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
self.list_item_processor.process_listitem(l_item) self.list_item_processor.process_list_item(l_item)
elif c_label == DocItemLabel.SECTION_HEADER: elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=doc_item, text=c_text, prov=c_prov) doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
else: else:
@ -306,7 +306,7 @@ class ReadingOrderModel:
new_item = out_doc.add_list_item( new_item = out_doc.add_list_item(
text=cap_text, enumerated=False, prov=prov, parent=current_list text=cap_text, enumerated=False, prov=prov, parent=current_list
) )
self.list_item_processor.process_listitem(new_item) self.list_item_processor.process_list_item(new_item)
elif label == DocItemLabel.SECTION_HEADER: elif label == DocItemLabel.SECTION_HEADER:
current_list = None current_list = None

View File

@ -45,8 +45,8 @@ requires-python = '>=3.9,<4.0'
dependencies = [ dependencies = [
'pydantic (>=2.0.0,<3.0.0)', 'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.29.0,<3.0.0)', 'docling-core[chunking] (>=2.29.0,<3.0.0)',
"docling-ibm-models>=3.4.4,<4.0.0",
'docling-parse (>=4.0.0,<5.0.0)', 'docling-parse (>=4.0.0,<5.0.0)',
'docling-ibm-models (>=3.6.0,<4)',
'filetype (>=1.2.0,<2.0.0)', 'filetype (>=1.2.0,<2.0.0)',
'pypdfium2 (>=4.30.0,<5.0.0)', 'pypdfium2 (>=4.30.0,<5.0.0)',
'pydantic-settings (>=2.3.0,<3.0.0)', 'pydantic-settings (>=2.3.0,<3.0.0)',
@ -145,9 +145,6 @@ constraints = [
package = true package = true
default-groups = "all" default-groups = "all"
[tool.uv.sources]
docling-ibm-models = { git = "https://github.com/docling-project/docling-ibm-models.git", rev = "dev/add-listitem-marker-identifier" }
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
include = ["docling*"] include = ["docling*"]

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "2203.01017v2", "name": "2203.01017v2",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
@ -17863,7 +17863,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/1", "self_ref": "#/tables/1",
@ -18753,7 +18754,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/2", "self_ref": "#/tables/2",
@ -20117,7 +20119,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/3", "self_ref": "#/tables/3",
@ -22266,7 +22269,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/4", "self_ref": "#/tables/4",
@ -22927,7 +22931,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/5", "self_ref": "#/tables/5",
@ -24050,7 +24055,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/6", "self_ref": "#/tables/6",
@ -26307,7 +26313,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/7", "self_ref": "#/tables/7",
@ -27600,7 +27607,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/8", "self_ref": "#/tables/8",
@ -27635,7 +27643,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/9", "self_ref": "#/tables/9",
@ -27670,7 +27679,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/10", "self_ref": "#/tables/10",
@ -27705,7 +27715,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/11", "self_ref": "#/tables/11",
@ -27740,7 +27751,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/12", "self_ref": "#/tables/12",
@ -27783,7 +27795,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/13", "self_ref": "#/tables/13",
@ -27818,7 +27831,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/14", "self_ref": "#/tables/14",
@ -27853,7 +27867,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/15", "self_ref": "#/tables/15",
@ -27888,7 +27903,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/16", "self_ref": "#/tables/16",
@ -27931,7 +27947,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/17", "self_ref": "#/tables/17",
@ -27966,7 +27983,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/18", "self_ref": "#/tables/18",
@ -28001,7 +28019,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/19", "self_ref": "#/tables/19",
@ -28036,7 +28055,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/20", "self_ref": "#/tables/20",
@ -28071,7 +28091,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/21", "self_ref": "#/tables/21",
@ -28106,7 +28127,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/22", "self_ref": "#/tables/22",
@ -28141,7 +28163,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/23", "self_ref": "#/tables/23",
@ -28176,7 +28199,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/24", "self_ref": "#/tables/24",
@ -28211,7 +28235,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/25", "self_ref": "#/tables/25",
@ -28246,7 +28271,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/26", "self_ref": "#/tables/26",
@ -28281,7 +28307,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/27", "self_ref": "#/tables/27",
@ -28324,7 +28351,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/28", "self_ref": "#/tables/28",
@ -28359,7 +28387,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/29", "self_ref": "#/tables/29",
@ -28394,7 +28423,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/30", "self_ref": "#/tables/30",
@ -28429,7 +28459,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/31", "self_ref": "#/tables/31",
@ -28464,7 +28495,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/32", "self_ref": "#/tables/32",
@ -28499,7 +28531,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/33", "self_ref": "#/tables/33",
@ -28542,7 +28575,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/34", "self_ref": "#/tables/34",
@ -28577,7 +28611,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/35", "self_ref": "#/tables/35",
@ -28612,7 +28647,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/36", "self_ref": "#/tables/36",
@ -28647,7 +28683,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/37", "self_ref": "#/tables/37",
@ -28682,7 +28719,8 @@
"num_rows": 0, "num_rows": 0,
"num_cols": 0, "num_cols": 0,
"grid": [] "grid": []
} },
"annotations": []
} }
], ],
"key_value_items": [], "key_value_items": [],

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "2206.01062", "name": "2206.01062",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
@ -23491,7 +23491,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/1", "self_ref": "#/tables/1",
@ -26654,7 +26655,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/2", "self_ref": "#/tables/2",
@ -29187,7 +29189,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/3", "self_ref": "#/tables/3",
@ -31574,7 +31577,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/4", "self_ref": "#/tables/4",
@ -34177,7 +34181,8 @@
} }
] ]
] ]
} },
"annotations": []
} }
], ],
"key_value_items": [], "key_value_items": [],

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "2305.03393v1-pg9", "name": "2305.03393v1-pg9",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
@ -2104,7 +2104,8 @@
} }
] ]
] ]
} },
"annotations": []
} }
], ],
"key_value_items": [], "key_value_items": [],

View File

@ -60,6 +60,8 @@
<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header> <page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
<page_header><loc_389><loc_59><loc_393><loc_64>7</page_header> <page_header><loc_389><loc_59><loc_393><loc_64>7</page_header>
<picture><loc_135><loc_103><loc_367><loc_177><caption><loc_110><loc_79><loc_393><loc_98>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption></picture> <picture><loc_135><loc_103><loc_367><loc_177><caption><loc_110><loc_79><loc_393><loc_98>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption></picture>
<unordered_list><list_item><loc_273><loc_172><loc_349><loc_176>4 - 2d merges: "C", "L", "U", "X"</list_item>
</unordered_list>
<section_header_level_1><loc_110><loc_193><loc_202><loc_198>4.2 Language Syntax</section_header_level_1> <section_header_level_1><loc_110><loc_193><loc_202><loc_198>4.2 Language Syntax</section_header_level_1>
<text><loc_110><loc_205><loc_297><loc_211>The OTSL representation follows these syntax rules:</text> <text><loc_110><loc_205><loc_297><loc_211>The OTSL representation follows these syntax rules:</text>
<unordered_list><list_item><loc_114><loc_219><loc_393><loc_232>Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</list_item> <unordered_list><list_item><loc_114><loc_219><loc_393><loc_232>Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</list_item>

File diff suppressed because it is too large Load Diff

View File

@ -84,6 +84,8 @@ Fig. 3. OTSL description of table structure: A - table example; B - graphical re
<!-- image --> <!-- image -->
- 4 - 2d merges: "C", "L", "U", "X"
## 4.2 Language Syntax ## 4.2 Language Syntax
The OTSL representation follows these syntax rules: The OTSL representation follows these syntax rules:

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "amt_handbook_sample", "name": "amt_handbook_sample",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "code_and_formula", "name": "code_and_formula",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "multi_page", "name": "multi_page",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "picture_classification", "name": "picture_classification",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "redp5110_sampled", "name": "redp5110_sampled",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",
@ -12471,7 +12471,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/1", "self_ref": "#/tables/1",
@ -13096,7 +13097,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/2", "self_ref": "#/tables/2",
@ -15356,7 +15358,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/3", "self_ref": "#/tables/3",
@ -15713,7 +15716,8 @@
} }
] ]
] ]
} },
"annotations": []
}, },
{ {
"self_ref": "#/tables/4", "self_ref": "#/tables/4",
@ -16918,7 +16922,8 @@
} }
] ]
] ]
} },
"annotations": []
} }
], ],
"key_value_items": [], "key_value_items": [],

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "right_to_left_01", "name": "right_to_left_01",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "right_to_left_02", "name": "right_to_left_02",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.3.0", "version": "1.4.0",
"name": "right_to_left_03", "name": "right_to_left_03",
"origin": { "origin": {
"mimetype": "application/pdf", "mimetype": "application/pdf",

2199
uv.lock generated

File diff suppressed because it is too large Load Diff