mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 03:55:00 +00:00
added the reading-order model
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
c2ae1cc4ca
commit
5e351e9d86
45
docling/models/ds_ro_model.py
Normal file
45
docling/models/ds_ro_model.py
Normal file
@ -0,0 +1,45 @@
|
||||
|
||||
from typing import List, Dict
|
||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||
from docling_ibm_models.reading_order.reading_order_rb import PageElement, ReadingOrderPredictor
|
||||
|
||||
class ReadingOrderRbOptions(BaseModel):
|
||||
model_config = ConfigDict(protected_namespaces=())
|
||||
|
||||
class ReadingOrderRbModel:
|
||||
|
||||
def __init__(self, options: ReadingOrderRbOptions):
|
||||
self.options = options
|
||||
|
||||
self.model = ReadingOrderPredictor()
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||
|
||||
with TimeRecorder(conv_res, "ReadingOrderRbModel", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
pred_elements: Dict[int, List[PageElement]] = {}
|
||||
|
||||
for element in conv_res.assembled.elements:
|
||||
|
||||
page_no = element.page_no
|
||||
page_height = page_no_to_page[element.page_no].size.height
|
||||
|
||||
bbox = element.cluster.bbox.to_bottom_left_origin(
|
||||
page_height=page_height
|
||||
)
|
||||
|
||||
if page_no not in pred_elements:
|
||||
pred_elements[page_no] = []
|
||||
|
||||
pred_elements[prov.page_no].append(
|
||||
PageElement(
|
||||
page_no=page_no,
|
||||
cid=len(true_elements[page_no]),
|
||||
pid=0,
|
||||
label=element.label,
|
||||
bbox=bbox
|
||||
)
|
||||
)
|
||||
|
||||
for page_no,elements in pred_elements.items():
|
||||
sorted_elements, to_captions, to_footnotes = self.model.predict_page(page_elements=elements)
|
Loading…
Reference in New Issue
Block a user