feat: export document pages as multimodal output (#54)

* feat: export document pages as multimodal output
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* create a single parquet output
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add loading into HF datasets library
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* renaming
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* cleanup
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-09 13:18:24 +00:00 · 2024-09-03 15:05:35 +02:00
parent 69e5d951a3
commit 1de2e4f924
5 changed files with 1025 additions and 7 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):

        return out_bbox

+    def normalized(self, page_size: PageSize) -> "BoundingBox":
+        """Return a copy of this box scaled to page-relative coordinates.
+
+        The horizontal edges (``l``, ``r``) are divided by ``page_size.width``
+        and the vertical edges (``t``, ``b``) by ``page_size.height``. The
+        receiver is not modified; all other fields are carried over by the
+        deep copy.
+        """
+        scaled = copy.deepcopy(self)
+
+        # Horizontal edges are expressed relative to the page width.
+        scaled.l = scaled.l / page_size.width
+        scaled.r = scaled.r / page_size.width
+
+        # Vertical edges are expressed relative to the page height.
+        scaled.t = scaled.t / page_size.height
+        scaled.b = scaled.b / page_size.height
+
+        return scaled
+
    def as_tuple(self):
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -0,0 +1,193 @@
+import logging
+from typing import Any, Dict, Iterable, List, Tuple
+
+from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
+from docling.datamodel.document import ConvertedDocument, Page
+
+_log = logging.getLogger(__name__)
+
+
+def _export_table_to_html(table: Table):
+    """Render ``table`` as a minimal HTML ``<table>`` string.
+
+    TODO: this is flagged as internal, because we will move it
+    to the docling-core package.
+
+    The ``table.data`` grid is walked row by row (``table.num_rows`` x
+    ``table.num_cols``). A cell that carries span information is emitted only
+    at the first row/column of its span, with ``rowspan``/``colspan``
+    attributes when it covers more than one row/column. A cell without span
+    information is emitted as a plain 1x1 cell. Cells labelled as column
+    headers use ``<th>``; every other cell uses ``<td>``.
+
+    Args:
+        table: Table whose ``num_rows``, ``num_cols`` and ``data`` are read.
+
+    Returns:
+        The HTML markup as a single string. Cell text is stripped and
+        HTML-escaped so that ``<``, ``>`` and ``&`` in the content cannot
+        break the markup.
+    """
+    # Imported locally so the helper does not depend on a module-level import.
+    import html
+
+    def _get_tablecell_span(cell: TableCell, ix):
+        """Return ``(extent, first, last)`` of the cell's span along axis ``ix``.
+
+        ``ix`` is 0 for rows and 1 for columns. Without span information
+        (``None`` or empty) the cell counts as extent 1 with no known bounds.
+        """
+        span = {s[ix] for s in (cell.spans or [])}
+        if not span:
+            return 1, None, None
+        return len(span), min(span), max(span)
+
+    column_header_labels = ("col_header", "col_multi_header")
+
+    # Collect fragments and join once instead of growing a string in the loop.
+    parts = ["<table>"]
+    for i in range(table.num_rows):
+        parts.append("<tr>")
+        for j in range(table.num_cols):
+            cell: TableCell = table.data[i][j]
+
+            rowspan, rowstart, _ = _get_tablecell_span(cell, 0)
+            colspan, colstart, _ = _get_tablecell_span(cell, 1)
+
+            # Emit a spanning cell only at the first row/column of its span;
+            # the remaining grid positions it covers are skipped.
+            if rowstart is not None and rowstart != i:
+                continue
+            if colstart is not None and colstart != j:
+                continue
+
+            celltag = "th" if cell.obj_type in column_header_labels else "td"
+
+            opening_tag = celltag
+            if rowspan > 1:
+                opening_tag += f' rowspan="{rowspan}"'
+            if colspan > 1:
+                opening_tag += f' colspan="{colspan}"'
+
+            # Escape markup-significant characters in the cell text.
+            content = html.escape(cell.text.strip(), quote=False)
+            parts.append(f"<{opening_tag}>{content}</{celltag}>")
+        parts.append("</tr>")
+    parts.append("</table>")
+
+    return "".join(parts)
+
+
+def generate_multimodal_pages(
+    doc_result: ConvertedDocument,
+) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
+    """Yield one multimodal record per page of a converted document.
+
+    The items of ``doc_result.output.main_text`` are walked in order and
+    grouped by the page number of their first provenance entry. Whenever an
+    item belongs to a later page than the one being collected, the collected
+    page is emitted; the last page is emitted after the loop.
+
+    Args:
+        doc_result: Conversion result providing ``output`` (the document) and
+            ``pages``.
+
+    Yields:
+        A tuple ``(content_text, content_md, page_cells, page_segments, page)``:
+
+        * ``content_text``: text of the page's items, each followed by a space.
+        * ``content_md``: Markdown export of the page's ``main_text`` range.
+        * ``page_cells``: one dict per entry of ``page.cells`` with the keys
+          ``text``, ``bbox``, ``ocr`` and ``ocr_confidence``.
+        * ``page_segments``: one dict per item with a known label, with the
+          keys ``index_in_doc``, ``label``, ``text``, ``bbox`` and ``data``.
+        * ``page``: the matching entry of ``doc_result.pages``.
+
+        Bounding boxes are converted with ``to_top_left_origin`` and then
+        divided by the page size.
+    """
+
+    # Maps item ``obj_type`` values to the label written into the segments.
+    label_to_doclaynet = {
+        "title": "title",
+        "table-of-contents": "document_index",
+        "subtitle-level-1": "section_header",
+        "checkbox-selected": "checkbox_selected",
+        "checkbox-unselected": "checkbox_unselected",
+        "caption": "caption",
+        "page-header": "page_header",
+        "page-footer": "page_footer",
+        "footnote": "footnote",
+        "table": "table",
+        "formula": "formula",
+        "list-item": "list_item",
+        "code": "code",
+        "figure": "picture",
+        "picture": "picture",
+        "reference": "text",
+        "paragraph": "text",
+        "text": "text",
+    }
+
+    # State of the page currently being collected; read by ``_process_page``.
+    content_text = ""
+    page_no = 0
+    start_ix = 0
+    end_ix = 0
+    doc_items: List[Tuple[int, BaseCell]] = []
+
+    doc = doc_result.output
+
+    def _process_page_segments(doc_items: List[Tuple[int, BaseCell]], page: Page):
+        """Build the segment dicts for the ``(index, item)`` pairs of a page."""
+        segments = []
+
+        for ix, item in doc_items:
+            label = label_to_doclaynet.get(item.obj_type)
+
+            # Items whose type has no mapped label are left out of the segments.
+            if label is None:
+                continue
+
+            bbox = BoundingBox.from_tuple(
+                item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
+            )
+            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
+                page_size=page.size
+            )
+
+            new_segment = {
+                "index_in_doc": ix,
+                "label": label,
+                "text": item.text if item.text is not None else "",
+                "bbox": new_bbox.as_tuple(),
+                "data": [],
+            }
+
+            if isinstance(item, Table):
+                table_html = _export_table_to_html(item)
+                new_segment["data"].append(
+                    {
+                        "html_seq": table_html,
+                        "otsl_seq": "",
+                    }
+                )
+
+            segments.append(new_segment)
+
+        return segments
+
+    def _process_page_cells(page: Page):
+        """Build the cell dicts for every entry of ``page.cells``."""
+        cells = []
+        for cell in page.cells:
+            new_bbox = cell.bbox.to_top_left_origin(
+                page_height=page.size.height
+            ).normalized(page_size=page.size)
+            is_ocr = isinstance(cell, OcrCell)
+            # Non-OCR cells are reported with full confidence.
+            ocr_confidence = cell.confidence if is_ocr else 1.0
+            cells.append(
+                {
+                    "text": cell.text,
+                    "bbox": new_bbox.as_tuple(),
+                    "ocr": is_ocr,
+                    "ocr_confidence": ocr_confidence,
+                }
+            )
+        return cells
+
+    def _process_page():
+        """Assemble the record for the page currently being collected.
+
+        Reads the enclosing function's current state (``page_no``,
+        ``start_ix``, ``end_ix``, ``doc_items``, ``content_text``), so it has
+        to be called before that state is reset for the next page.
+        """
+        # ``doc_result.pages`` is indexed with ``page_no - 1``.
+        page_ix = page_no - 1
+        page = doc_result.pages[page_ix]
+
+        page_cells = _process_page_cells(page=page)
+        page_segments = _process_page_segments(doc_items=doc_items, page=page)
+        # NOTE(review): ``end_ix`` is the index of the last item on the page.
+        # If ``export_to_markdown`` treats ``main_text_stop`` as an exclusive
+        # bound, that item is missing from ``content_md`` - confirm against
+        # docling-core.
+        content_md = doc.export_to_markdown(
+            main_text_start=start_ix, main_text_stop=end_ix
+        )
+
+        return content_text, content_md, page_cells, page_segments, page
+
+    for ix, orig_item in enumerate(doc.main_text):
+
+        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
+        # Items without provenance cannot be assigned to a page.
+        if item is None or not item.prov:
+            _log.debug("Skipping item %s", orig_item)
+            continue
+
+        item_page = item.prov[0].page
+
+        # Page is complete
+        if page_no > 0 and item_page > page_no:
+            yield _process_page()
+
+            start_ix = ix
+            doc_items = []
+            content_text = ""
+
+        page_no = item_page
+        end_ix = ix
+        doc_items.append((ix, item))
+        if item.text is not None and item.text != "":
+            content_text += item.text + " "
+
+    # Emit the last collected page, if any item was collected at all.
+    if doc_items:
+        yield _process_page()