diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index d6695e79..71238b8d 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
return out_bbox
+    def normalized(self, page_size: PageSize) -> "BoundingBox":
+        """Return a deep copy of this box scaled by the given page size.
+
+        The horizontal edges ``l`` and ``r`` are divided by
+        ``page_size.width`` and the vertical edges ``t`` and ``b`` by
+        ``page_size.height``. The receiver itself is not modified.
+        """
+        scaled = copy.deepcopy(self)
+        page_w, page_h = page_size.width, page_size.height
+
+        # Horizontal edges are relative to the page width ...
+        scaled.l = scaled.l / page_w
+        scaled.r = scaled.r / page_w
+        # ... vertical edges to the page height.
+        scaled.t = scaled.t / page_h
+        scaled.b = scaled.b / page_h
+
+        return scaled
+
def as_tuple(self):
if self.coord_origin == CoordOrigin.TOPLEFT:
return (self.l, self.t, self.r, self.b)
diff --git a/docling/utils/export.py b/docling/utils/export.py
new file mode 100644
index 00000000..31b17ea8
--- /dev/null
+++ b/docling/utils/export.py
@@ -0,0 +1,193 @@
+import logging
+from typing import Any, Dict, Iterable, List, Tuple
+
+from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
+from docling.datamodel.document import ConvertedDocument, Page
+
+_log = logging.getLogger(__name__)  # module-level logger, named after this module
+
+
+def _export_table_to_html(table: Table):
+    """Render a table as an HTML ``<table>`` string.
+
+    The grid ``table.data[i][j]`` is visited row by row. A cell that carries
+    span entries is written only at the smallest row and column index found
+    in its spans, with ``rowspan``/``colspan`` attributes when it covers more
+    than one row/column; every other grid position of that cell is skipped.
+    Column-header cells are written as ``<th>``, all other cells as ``<td>``.
+
+    Args:
+        table: Object exposing ``num_rows``, ``num_cols`` and a ``data`` grid
+            whose cells provide ``text``, ``obj_type`` and ``spans``.
+
+    Returns:
+        The markup ``<table>...</table>`` containing one ``<tr>`` per row.
+        Cell text is stripped of surrounding whitespace and inserted
+        verbatim (it is not HTML-escaped).
+    """
+
+    # TODO: this is flagged as internal, because we will move it
+    # to the docling-core package.
+
+    def _get_tablecell_span(cell: TableCell, ix):
+        """Return ``(extent, first, last)`` of the cell's spans on axis ``ix``.
+
+        ``extent`` is the number of distinct indices on that axis. A cell
+        without span entries yields ``(1, None, None)``.
+        """
+        span = {s[ix] for s in cell.spans}
+        if not span:
+            return 1, None, None
+        return len(span), min(span), max(span)
+
+    rows_html = []
+    for i in range(table.num_rows):
+        cells_html = []
+        for j in range(table.num_cols):
+            cell: TableCell = table.data[i][j]
+
+            rowspan, rowstart, _ = _get_tablecell_span(cell, 0)
+            colspan, colstart, _ = _get_tablecell_span(cell, 1)
+
+            # Emit a spanning cell only at the first row/column of its span;
+            # the other grid positions it covers produce no markup.
+            if rowstart is not None and rowstart != i:
+                continue
+            if colstart is not None and colstart != j:
+                continue
+
+            content = cell.text.strip()
+
+            # Only column headers become <th>; row headers stay <td>.
+            if cell.obj_type in ("col_header", "col_multi_header"):
+                celltag = "th"
+            else:
+                celltag = "td"
+
+            opening_tag = celltag
+            if rowspan > 1:
+                opening_tag += f' rowspan="{rowspan}"'
+            if colspan > 1:
+                opening_tag += f' colspan="{colspan}"'
+
+            cells_html.append(f"<{opening_tag}>{content}</{celltag}>")
+
+        row_markup = "".join(cells_html)
+        rows_html.append(f"<tr>{row_markup}</tr>")
+
+    body = "".join(rows_html)
+    return f"<table>{body}</table>"
+
+
+def generate_multimodal_pages(
+    doc_result: ConvertedDocument,
+) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
+    """Walk the document's main text and yield one record per page.
+
+    Items of ``doc_result.output.main_text`` are grouped by the page number
+    of their first provenance entry; items that resolve to ``None`` or have
+    no provenance are skipped. A group is emitted as soon as an item on a
+    later page is seen, and once more after the last item.
+
+    Yields:
+        Tuples ``(content_text, content_md, page_cells, page_segments, page)``:
+
+        * ``content_text``: every non-empty item text followed by a space.
+        * ``content_md``: ``export_to_markdown`` output for the group's
+          ``main_text`` index range.
+        * ``page_cells``: one dict per page cell (text, bbox, OCR flag and
+          confidence).
+        * ``page_segments``: one dict per item whose type maps to a known
+          label (index, label, text, bbox, extra data).
+        * ``page``: the ``Page`` taken from ``doc_result.pages``.
+
+        All bounding boxes are converted to a top-left origin and divided by
+        the page size before being turned into tuples.
+    """
+
+    doclaynet_labels = {
+        "title": "title",
+        "table-of-contents": "document_index",
+        "subtitle-level-1": "section_header",
+        "checkbox-selected": "checkbox_selected",
+        "checkbox-unselected": "checkbox_unselected",
+        "caption": "caption",
+        "page-header": "page_header",
+        "page-footer": "page_footer",
+        "footnote": "footnote",
+        "table": "table",
+        "formula": "formula",
+        "list-item": "list_item",
+        "code": "code",
+        "figure": "picture",
+        "picture": "picture",
+        "reference": "text",
+        "paragraph": "text",
+        "text": "text",
+    }
+
+    doc = doc_result.output
+
+    def _normalized_box(bbox: BoundingBox, page: Page):
+        # Top-left origin first, then scale by the page size.
+        top_left = bbox.to_top_left_origin(page_height=page.size.height)
+        return top_left.normalized(page_size=page.size).as_tuple()
+
+    def _segments_for(entries: list[Tuple[int, BaseCell]], page: Page):
+        collected = []
+
+        for index_in_doc, entry in entries:
+            label = doclaynet_labels.get(entry.obj_type)
+            if label is None:
+                # Item types without a mapped label are left out.
+                continue
+
+            source_box = BoundingBox.from_tuple(
+                entry.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
+            )
+
+            segment = {
+                "index_in_doc": index_in_doc,
+                "label": label,
+                "text": "" if entry.text is None else entry.text,
+                "bbox": _normalized_box(source_box, page),
+                "data": [],
+            }
+
+            # Tables additionally carry their HTML rendering.
+            if isinstance(entry, Table):
+                segment["data"].append(
+                    {
+                        "html_seq": _export_table_to_html(entry),
+                        "otsl_seq": "",
+                    }
+                )
+
+            collected.append(segment)
+
+        return collected
+
+    def _cell_record(cell, page: Page):
+        from_ocr = isinstance(cell, OcrCell)
+        return {
+            "text": cell.text,
+            "bbox": _normalized_box(cell.bbox, page),
+            "ocr": from_ocr,
+            # Cells that are not OcrCell instances get a confidence of 1.0.
+            "ocr_confidence": cell.confidence if from_ocr else 1.0,
+        }
+
+    def _assemble(page_number, entries, text, first_ix, last_ix):
+        # Page numbers taken from provenance are offset by one against
+        # the index into doc_result.pages.
+        page = doc_result.pages[page_number - 1]
+
+        cells = [_cell_record(cell, page) for cell in page.cells]
+        segments = _segments_for(entries, page)
+        # NOTE(review): last_ix is the index of the group's last item;
+        # confirm whether export_to_markdown treats main_text_stop as
+        # inclusive or exclusive.
+        markdown = doc.export_to_markdown(
+            main_text_start=first_ix, main_text_stop=last_ix
+        )
+
+        return text, markdown, cells, segments, page
+
+    current_page = 0
+    first_ix = 0
+    last_ix = 0
+    pending = []
+    text_acc = ""
+
+    for ix, orig_item in enumerate(doc.main_text):
+        if isinstance(orig_item, Ref):
+            item = doc._resolve_ref(orig_item)
+        else:
+            item = orig_item
+
+        if item is None or item.prov is None or len(item.prov) == 0:
+            _log.debug(f"Skipping item {orig_item}")
+            continue
+
+        item_page = item.prov[0].page
+
+        # An item on a later page closes the group collected so far.
+        if 0 < current_page < item_page:
+            yield _assemble(current_page, pending, text_acc, first_ix, last_ix)
+
+            first_ix, pending, text_acc = ix, [], ""
+
+        current_page = item_page
+        last_ix = ix
+        pending.append((ix, item))
+        if item.text:
+            text_acc += item.text + " "
+
+    # Emit whatever is left after the last item.
+    if pending:
+        yield _assemble(current_page, pending, text_acc, first_ix, last_ix)
diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py
new file mode 100644
index 00000000..d0c1a0ac
--- /dev/null
+++ b/examples/export_multimodal.py
@@ -0,0 +1,87 @@
+import logging
+import time
+from pathlib import Path
+
+import pandas as pd
+
+from docling.datamodel.base_models import AssembleOptions, ConversionStatus
+from docling.datamodel.document import DocumentConversionInput
+from docling.document_converter import DocumentConverter
+from docling.utils.export import generate_multimodal_pages
+
+_log = logging.getLogger(__name__)  # logger for this example script
+
+IMAGE_RESOLUTION_SCALE = 2.0  # assigned to AssembleOptions.images_scale in main(); a scale of 1 is described there as 72 DPI
+
+
+def main():
+    """Convert the sample PDF and write its per-page records to parquet.
+
+    Each successfully converted document yields one row per page from
+    ``generate_multimodal_pages``; the rows are flattened with
+    ``pandas.json_normalize`` and stored as ``./scratch/<stem>.parquet``.
+    Documents that did not convert successfully are logged and skipped.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    output_dir = Path("./scratch")
+    input_files = DocumentConversionInput.from_paths(
+        [
+            Path("./test/data/2206.01062.pdf"),
+        ]
+    )
+
+    # Important: page images are only kept when AssembleOptions.images_scale
+    # is set; otherwise the DocumentConverter destroys them to free memory.
+    # The same value also defines the scale of the images, where scale=1
+    # corresponds to a standard 72 DPI image.
+    assemble_options = AssembleOptions()
+    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+    doc_converter = DocumentConverter(assemble_options=assemble_options)
+
+    started_at = time.time()
+
+    converted_docs = doc_converter.convert(input_files)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for doc in converted_docs:
+        if doc.status != ConversionStatus.SUCCESS:
+            _log.info(f"Document {doc.input.file} failed to convert.")
+            continue
+
+        doc_filename = doc.input.file.stem
+
+        page_rows = []
+        for (
+            content_text,
+            content_md,
+            page_cells,
+            page_segments,
+            page,
+        ) in generate_multimodal_pages(doc):
+            # NOTE(review): dpi is derived from the private attribute
+            # page._default_image_scale rather than IMAGE_RESOLUTION_SCALE;
+            # confirm it matches the scale of page.image.
+            dpi = page._default_image_scale * 72
+
+            image_info = {
+                "width": page.image.width,
+                "height": page.image.height,
+                "bytes": page.image.tobytes(),
+            }
+            extra_info = {
+                "page_num": page.page_no + 1,
+                "width_in_points": page.size.width,
+                "height_in_points": page.size.height,
+                "dpi": dpi,
+            }
+
+            page_rows.append(
+                {
+                    "document": doc.input.file.name,
+                    "hash": doc.input.document_hash,
+                    "page_hash": page.page_hash,
+                    "image": image_info,
+                    "cells": page_cells,
+                    "contents": content_text,
+                    "contents_md": content_md,
+                    "segments": page_segments,
+                    "extra": extra_info,
+                }
+            )
+
+        # Nested dicts become flattened columns before the parquet export.
+        frame = pd.json_normalize(page_rows)
+        frame.to_parquet(output_dir / f"{doc_filename}.parquet")
+
+    end_time = time.time() - started_at
+
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+
+
+if __name__ == "__main__":  # run the example only when executed as a script
+ main()
diff --git a/poetry.lock b/poetry.lock
index dc1c57f8..f7f54a0f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -26,8 +26,8 @@ files = [
lazy-object-proxy = ">=1.4.0"
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
wrapt = [
- {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
+ {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
]
[[package]]
@@ -2670,10 +2670,10 @@ files = [
[package.dependencies]
numpy = [
- {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
[[package]]
@@ -2727,9 +2727,9 @@ files = [
[package.dependencies]
numpy = [
- {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@@ -3080,6 +3080,57 @@ files = [
[package.extras]
tests = ["pytest"]
+[[package]]
+name = "pyarrow"
+version = "17.0.0"
+description = "Python library for Apache Arrow"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
+ {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
+ {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
+ {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
+ {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
+ {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
+ {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
+ {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
+ {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
+ {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
+ {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
+ {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
+ {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
+ {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"},
+ {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"},
+ {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"},
+ {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"},
+ {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"},
+ {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"},
+ {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"},
+ {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
+]
+
+[package.dependencies]
+numpy = ">=1.16.6"
+
+[package.extras]
+test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
+
[[package]]
name = "pybind11"
version = "2.13.5"
@@ -3184,8 +3235,8 @@ files = [
annotated-types = ">=0.4.0"
pydantic-core = "2.20.1"
typing-extensions = [
- {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
+ {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
]
[package.extras]
@@ -3352,8 +3403,8 @@ files = [
astroid = ">=2.15.8,<=2.17.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
- {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
{version = ">=0.2", markers = "python_version < \"3.11\""},
+ {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
]
isort = ">=4.2.5,<6"
mccabe = ">=0.6,<0.8"
@@ -4887,6 +4938,11 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
+ {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
+ {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
+ {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
+ {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
+ {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
]
[package.dependencies]
@@ -5245,4 +5301,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
-content-hash = "9f876a0fe3d1f350eb9279f6e8ec8c481d2ce196d1bdd9834f4b89c881658074"
+content-hash = "03159c4cb3c5e1fa19df5e67a54932836eb88c78494467dd385f88fa99866071"
diff --git a/pyproject.toml b/pyproject.toml
index fe465bc0..33964b57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
-docling-core = "^1.1.2"
+docling-core = "^1.1.3"
docling-ibm-models = "^1.1.3"
deepsearch-glm = "^0.19.1"
filetype = "^1.2.0"
@@ -36,6 +36,7 @@ docling-parse = "^1.1.3"
certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = "^1.14.1"
+pyarrow = "^17.0.0"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}