From 3e789dfbdd01098ef52de3c5664822c76c61096c Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Wed, 28 Aug 2024 11:40:55 +0200
Subject: [PATCH] feat: export document pages as multimodal output

Signed-off-by: Michele Dolfi
---
 docling/datamodel/base_models.py |   9 ++
 docling/utils/export.py          | 193 +++++++++++++++++++++++++++++++
 examples/export_multimodal.py    |  87 ++++++++++++++
 poetry.lock                      |  68 ++++++++++-
 pyproject.toml                   |   3 +-
 5 files changed, 353 insertions(+), 7 deletions(-)
 create mode 100644 docling/utils/export.py
 create mode 100644 examples/export_multimodal.py

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index d6695e79..71238b8d 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
 
         return out_bbox
 
+    def normalized(self, page_size: PageSize) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
     def as_tuple(self):
         if self.coord_origin == CoordOrigin.TOPLEFT:
             return (self.l, self.t, self.r, self.b)
diff --git a/docling/utils/export.py b/docling/utils/export.py
new file mode 100644
index 00000000..31b17ea8
--- /dev/null
+++ b/docling/utils/export.py
@@ -0,0 +1,193 @@
+import logging
+from typing import Any, Dict, Iterable, List, Tuple
+
+from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
+from docling.datamodel.document import ConvertedDocument, Page
+
+_log = logging.getLogger(__name__)
+
+
+def _export_table_to_html(table: Table):
+
+    # TODO: this is flagged as internal, because we will move it
+    # to the docling-core package.
+
+    def _get_tablecell_span(cell: TableCell, ix):
+        span = set([s[ix] for s in cell.spans])
+        if len(span) == 0:
+            return 1, None, None
+        return len(span), min(span), max(span)
+
+    body = ""
+    nrows = table.num_rows
+    ncols = table.num_cols
+
+    for i in range(nrows):
+        body += "<tr>"
+        for j in range(ncols):
+            cell: TableCell = table.data[i][j]
+
+            rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
+            colspan, colstart, colend = _get_tablecell_span(cell, 1)
+
+            if rowstart is not None and rowstart != i:
+                continue
+            if colstart is not None and colstart != j:
+                continue
+
+            if rowstart is None:
+                rowstart = i
+            if colstart is None:
+                colstart = j
+
+            content = cell.text.strip()
+            label = cell.obj_type
+            label_class = "body"
+            celltag = "td"
+            if label in ["row_header", "row_multi_header", "row_title"]:
+                label_class = "header"
+            elif label in ["col_header", "col_multi_header"]:
+                label_class = "header"
+                celltag = "th"
+
+            opening_tag = f"{celltag}"
+            if rowspan > 1:
+                opening_tag += f' rowspan="{rowspan}"'
+            if colspan > 1:
+                opening_tag += f' colspan="{colspan}"'
+
+            body += f"<{opening_tag}>{content}</{celltag}>"
+        body += "</tr>"
+    body = f"<table>{body}</table>"
" + + return body + + +def generate_multimodal_pages( + doc_result: ConvertedDocument, +) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: + + label_to_doclaynet = { + "title": "title", + "table-of-contents": "document_index", + "subtitle-level-1": "section_header", + "checkbox-selected": "checkbox_selected", + "checkbox-unselected": "checkbox_unselected", + "caption": "caption", + "page-header": "page_header", + "page-footer": "page_footer", + "footnote": "footnote", + "table": "table", + "formula": "formula", + "list-item": "list_item", + "code": "code", + "figure": "picture", + "picture": "picture", + "reference": "text", + "paragraph": "text", + "text": "text", + } + + content_text = "" + page_no = 0 + start_ix = 0 + end_ix = 0 + doc_items = [] + + doc = doc_result.output + + def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page): + segments = [] + + for ix, item in doc_items: + item_type = item.obj_type + label = label_to_doclaynet.get(item_type, None) + + if label is None: + continue + + bbox = BoundingBox.from_tuple( + item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT + ) + new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized( + page_size=page.size + ) + + new_segment = { + "index_in_doc": ix, + "label": label, + "text": item.text if item.text is not None else "", + "bbox": new_bbox.as_tuple(), + "data": [], + } + + if isinstance(item, Table): + table_html = _export_table_to_html(item) + new_segment["data"].append( + { + "html_seq": table_html, + "otsl_seq": "", + } + ) + + segments.append(new_segment) + + return segments + + def _process_page_cells(page: Page): + cells = [] + for cell in page.cells: + new_bbox = cell.bbox.to_top_left_origin( + page_height=page.size.height + ).normalized(page_size=page.size) + is_ocr = isinstance(cell, OcrCell) + ocr_confidence = cell.confidence if is_ocr else 1.0 + cells.append( + { + "text": cell.text, + "bbox": new_bbox.as_tuple(), + "ocr": is_ocr, + "ocr_confidence": ocr_confidence, + } + ) + return cells + + def _process_page(): + page_ix = page_no - 1 + page = doc_result.pages[page_ix] + + page_cells = _process_page_cells(page=page) + page_segments = _process_page_segments(doc_items=doc_items, page=page) + content_md = doc.export_to_markdown( + main_text_start=start_ix, main_text_stop=end_ix + ) + + return content_text, content_md, page_cells, page_segments, page + + for ix, orig_item in enumerate(doc.main_text): + + item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item + if item is None or item.prov is None or len(item.prov) == 0: + _log.debug(f"Skipping item {orig_item}") + continue + + item_page = item.prov[0].page + + # Page is complete + if page_no > 0 and item_page > page_no: + yield _process_page() + + start_ix = ix + doc_items = [] + content_text = "" + + page_no = item_page + end_ix = ix + doc_items.append((ix, item)) + if item.text is not None and item.text != "": + content_text += item.text + " " + + if len(doc_items) > 0: + yield _process_page() diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py new file mode 100644 index 00000000..d0c1a0ac --- /dev/null +++ b/examples/export_multimodal.py @@ -0,0 +1,87 @@ +import logging +import time +from pathlib import Path + +import pandas as pd + +from docling.datamodel.base_models import AssembleOptions, ConversionStatus +from docling.datamodel.document import DocumentConversionInput +from docling.document_converter import DocumentConverter +from docling.utils.export 
+
+_log = logging.getLogger(__name__)
+
+IMAGE_RESOLUTION_SCALE = 2.0
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./test/data/2206.01062.pdf"),
+    ]
+    output_dir = Path("./scratch")
+
+    input_files = DocumentConversionInput.from_paths(input_doc_paths)
+
+    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
+    # will discard them to free up memory.
+    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+    # scale=1 corresponds to a standard 72 DPI image.
+    assemble_options = AssembleOptions()
+    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+
+    doc_converter = DocumentConverter(assemble_options=assemble_options)
+
+    start_time = time.time()
+
+    converted_docs = doc_converter.convert(input_files)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for doc in converted_docs:
+        if doc.status != ConversionStatus.SUCCESS:
+            _log.info(f"Document {doc.input.file} failed to convert.")
+            continue
+
+        doc_filename = doc.input.file.stem
+
+        rows = []
+        for _pack in generate_multimodal_pages(doc):
+            content_text, content_md, page_cells, page_segments, page = _pack
+
+            dpi = page._default_image_scale * 72
+
+            rows.append(
+                {
+                    "document": doc.input.file.name,
+                    "hash": doc.input.document_hash,
+                    "page_hash": page.page_hash,
+                    "image": {
+                        "width": page.image.width,
+                        "height": page.image.height,
+                        "bytes": page.image.tobytes(),
+                    },
+                    "cells": page_cells,
+                    "contents": content_text,
+                    "contents_md": content_md,
+                    "segments": page_segments,
+                    "extra": {
+                        "page_num": page.page_no + 1,
+                        "width_in_points": page.size.width,
+                        "height_in_points": page.size.height,
+                        "dpi": dpi,
+                    },
+                }
+            )
+        df = pd.json_normalize(rows)
+
+        output_filename = output_dir / f"{doc_filename}.parquet"
+        df.to_parquet(output_filename)
+
+    end_time = time.time() - start_time
+
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/poetry.lock b/poetry.lock
index dc1c57f8..f7f54a0f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -26,8 +26,8 @@ files = [
 lazy-object-proxy = ">=1.4.0"
 typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
 wrapt = [
-    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
     {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
+    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
 ]
 
 [[package]]
@@ -2670,10 +2670,10 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
     {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
     {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 
 [[package]]
@@ -2727,9 +2727,9 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
     {version = ">=1.22.4", markers = "python_version < \"3.11\""},
     {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@@ -3080,6 +3080,57 @@ files = [
 
 [package.extras]
 tests = ["pytest"]
 
+[[package]]
+name =
"pyarrow" +version = "17.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash 
= "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pybind11" version = "2.13.5" @@ -3184,8 +3235,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -3352,8 +3403,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -4887,6 +4938,11 @@ files = [ {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, ] [package.dependencies] @@ -5245,4 +5301,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9f876a0fe3d1f350eb9279f6e8ec8c481d2ce196d1bdd9834f4b89c881658074" +content-hash = "03159c4cb3c5e1fa19df5e67a54932836eb88c78494467dd385f88fa99866071" diff --git a/pyproject.toml b/pyproject.toml index fe465bc0..33964b57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ packages = [{include = "docling"}] [tool.poetry.dependencies] python = "^3.10" pydantic = "^2.0.0" -docling-core = "^1.1.2" +docling-core = "^1.1.3" docling-ibm-models = "^1.1.3" deepsearch-glm = "^0.19.1" filetype = "^1.2.0" @@ -36,6 +36,7 @@ docling-parse = "^1.1.3" certifi = ">=2024.7.4" rtree = "^1.3.0" scipy = "^1.14.1" +pyarrow = "^17.0.0" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"}