feat: Integrate ListItemMarkerProcessor into document assembly (#1825)

* Integrate ListItemMarkerProcessor into document assembly

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update to final version

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update all test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade deps

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-07-01 10:04:58 +02:00
committed by GitHub
parent bdfee4e2d0
commit 56a0e104f7
24 changed files with 739 additions and 1675 deletions

View File

@@ -12,6 +12,9 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer
from docling_ibm_models.list_item_normalizer.list_marker_processor import (
ListItemMarkerProcessor,
)
from docling_ibm_models.reading_order.reading_order_rb import (
PageElement as ReadingOrderPageElement,
ReadingOrderPredictor,
@@ -40,6 +43,7 @@ class ReadingOrderModel:
def __init__(self, options: ReadingOrderOptions):
self.options = options
self.ro_model = ReadingOrderPredictor()
self.list_item_processor = ListItemMarkerProcessor()
def _assembled_to_readingorder_elements(
self, conv_res: ConversionResult
@@ -92,7 +96,8 @@ class ReadingOrderModel:
)
if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
self.list_item_processor.process_list_item(l_item)
elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
else:
@@ -301,6 +306,8 @@ class ReadingOrderModel:
new_item = out_doc.add_list_item(
text=cap_text, enumerated=False, prov=prov, parent=current_list
)
self.list_item_processor.process_list_item(new_item)
elif label == DocItemLabel.SECTION_HEADER:
current_list = None