diff --git a/docling/utils/export.py b/docling/utils/export.py
index 115f7646..2bba44f0 100644
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -129,7 +129,7 @@ def generate_multimodal_pages(
             }
 
             if isinstance(item, Table):
-                table_html = _export_table_to_html(item)
+                table_html = item.export_to_html()
                 new_segment["data"].append(
                     {
                         "html_seq": table_html,
diff --git a/examples/export_tables.py b/examples/export_tables.py
new file mode 100644
index 00000000..a0c605c1
--- /dev/null
+++ b/examples/export_tables.py
@@ -0,0 +1,89 @@
+import logging
+import time
+from pathlib import Path
+from typing import Tuple
+
+import pandas as pd
+
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def main():
+    """Convert the sample PDF and export each table found in it.
+
+    Every table of every successfully converted document is printed as
+    markdown and written to ``./scratch`` as both a CSV and an HTML file.
+
+    Raises:
+        RuntimeError: If at least one input document failed to convert.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./tests/data/2206.01062.pdf"),
+    ]
+    output_dir = Path("./scratch")
+
+    input_files = DocumentConversionInput.from_paths(input_doc_paths)
+
+    doc_converter = DocumentConverter()
+
+    start_time = time.time()
+
+    conv_results = doc_converter.convert(input_files)
+
+    success_count = 0
+    failure_count = 0
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for conv_res in conv_results:
+        if conv_res.status != ConversionStatus.SUCCESS:
+            _log.info(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+            continue
+
+        doc_filename = conv_res.input.file.stem
+
+        # Export tables; number them from 1 so the printed heading and the
+        # file names refer to the same table.
+        for table_no, table in enumerate(conv_res.output.tables, start=1):
+            table_df: pd.DataFrame = table.export_to_dataframe()
+            print(f"## Table {table_no}")
+            print(table_df.to_markdown())
+
+            # Save the table as csv
+            element_csv_filename = output_dir / f"{doc_filename}-table-{table_no}.csv"
+            _log.info(f"Saving CSV table to {element_csv_filename}")
+            table_df.to_csv(element_csv_filename)
+
+            # Save the table as html
+            element_html_filename = (
+                output_dir / f"{doc_filename}-table-{table_no}.html"
+            )
+            _log.info(f"Saving HTML table to {element_html_filename}")
+            # Explicit encoding: table text may contain non-ASCII characters and
+            # the platform default encoding is not guaranteed to be UTF-8.
+            with element_html_filename.open("w", encoding="utf-8") as fp:
+                fp.write(table.export_to_html())
+
+        success_count += 1
+
+    elapsed = time.time() - start_time
+
+    _log.info(
+        f"Processed {success_count + failure_count} documents "
+        f"({success_count} converted, {failure_count} failed) "
+        f"in {elapsed:.2f} seconds."
+    )
+
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/poetry.lock b/poetry.lock
index b8459ad6..5ce2f16a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -854,35 +854,35 @@ files = [
 
 [[package]]
 name = "deepsearch-glm"
-version = "0.21.0"
+version = "0.21.1"
 description = "Graph Language Models"
 optional = false
 python-versions = "<4.0,>=3.8"
 files = [
-    {file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a07f9ee8b9532f2f02ce363fefd4622178552032e2de8e4f540cab16852b3d6d"},
-    {file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:27561630487dc331c30238f94b5f00b4e5e1359bcd120ee7e5d9f9e3b4d824a1"},
-    {file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:05bfde0f2bea6f235deb66093c1553248374914bc793f6f1823e632d2da1a625"},
-    {file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ff5a7aa3ed12b1ad8f8cb290851d4ddeb87d3f486ac9a1e90f13d69ff40233e1"},
-    {file = "deepsearch_glm-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e721314ee7b9d7ce9303d38f4038e33758a55b004691e32a5821818c1f42aff"},
-    {file = "deepsearch_glm-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2605a90146336f9cff2659d90f3dc9eea52a08b72ef8da211323b197ae61c557"},
-    {file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:7fc8c07002ff8ade6deb1dc6d6bd3d07371433aa242cbc02c20349764b23269a"},
-    {file =
"deepsearch_glm-0.21.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:ba0cf927f4e9f2553e94349e29c07b4505b94deafa55cb65a19457ff83b8dc9e"}, - {file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:e69001e9b64ea5d45fd3e03ec7612f531ebcb0eee6f574cbe4976598d78ede3c"}, - {file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:f8846f9228065a89a0438453b067815f7ac28753217912b944d28ca0d68fae6a"}, - {file = "deepsearch_glm-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a7dcd0fe3911df1821e343946e24443126d3cf6e7b6c13d7dfdd437fdaf013a"}, - {file = "deepsearch_glm-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86d97bc0c19672bee7723153b143f3e6f65d97497a22b4da19488aab5224f77f"}, - {file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:d74caa7336d256c8091af4db99f2d5bd6f7cd9c9448b9a210e5512de67eaef54"}, - {file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:82a32f996f398425d62de3681ff7568ef18b9e0a8900c52ac9a780341f169073"}, - {file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:4c823e59e934716ccd93df309a37d56a7e75cffac9831cffb9a9a560b84feadc"}, - {file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b2c897bc2d57e0d21c86fe5706b5458d2a948e7016b26730f4e04a892f12d690"}, - {file = "deepsearch_glm-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4273fbda4cb25e949776ff81f60dde2ae278cab94a67babe7fab024f98dce993"}, - {file = "deepsearch_glm-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:117b0556e2a36582590d5627582265498bff66dae74658a67a02ef3a76956e34"}, - {file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:2ba54efd3661d7196f0a9828ebf642cb21bc2bd0594915e0486bd50b2ec0632c"}, - {file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = 
"sha256:197ca6dc60330ff90c90dc85507899307353a2a0620db40ee825e632644c99c5"}, - {file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:a49c63cef72c32b63a10fc85589721bf82580a8a42cfe0a5c901798f4d161fbf"}, - {file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:3802436d610c6e24a0ea9291a2cdee9c260fd4492438af08041ed460e3f92743"}, - {file = "deepsearch_glm-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1dd273c950877fb40d538ba16724efdedb82f3c9f15f9fc4407b9d60a832490"}, - {file = "deepsearch_glm-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9f0b0cbd2f773558f79d356603f1d49d913e52c8f9b5610b4603480df3c5804"}, + {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:b765d371ab0a4f57dd2532c651d7dc1b4a187395153e619a77b6f0d0f6aefb32"}, + {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:c69e055b98d0a22267a1d0b6139801aecc5b7386289b89f53f976ab723352728"}, + {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3eaa245e5ac4ab3e9d0c95a93e23f58d61d70f11431b76b6705fae358eb31c62"}, + {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63d195f6c5b30f4f908436589cffd4a5b9e18553c44c57fb635068a2afbd7fab"}, + {file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c9296a2e417a30bf030de0c7c2e2cce4773c58bead039d5e6fccbf7deb2269"}, + {file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:166b9958d3a8a98d0671a1e3fdf8083ded9ccf12c2ab80fb9709908a2cf81784"}, + {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:13bea2b4e8c04647ec743c3feb1ee66c784db542ab9dbed8dad7eb66fca74b70"}, + {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:c5b8b8e2207615ff99e535f00548c7b0b8e4ca4593e59edd83fcad98fc318284"}, 
+ {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ba74868243caf5ac850fff7c45c8a372c1cac0193431e22eb41888d45ac79719"}, + {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7815b06aa1c3953488496f191ce0265d0ee7bed5a6b96454a5f9d6f1add28f69"}, + {file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a7dd2a1e63cee47f6090ebfebc15f68d24f61d5f4f45a21f22120b2267798d"}, + {file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d52bd2934a27fdc9db5f2d0713dbeec0c94e5c5843d29996e85d641a11498ad0"}, + {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:fd4d0d4ff853e566b05769c704a4ea3c050c0cfc5721e4e2035e550fb2a8fe91"}, + {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:802a59a8a3bea1801bce848d58d19fcdbbcea27d9e2c23f163419d13cdec2345"}, + {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1ead7958bc044000a8d43cce53c9b82be0d341b0ca5cf7b39a0c09f9c4fd8ceb"}, + {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:312cf2b0b6560c8dfe5331a5a80a0ed5cb409d29ee6cc999a81696774d50f5e7"}, + {file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc3d6f6ca2cffbe5e112818c8aba9a783af8ab7cffff04624bfb5bf8d185b707"}, + {file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc537d5e9d108233b7e7249c6739292dc9c36a0f39c11e7f430700df35ff884"}, + {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4db0a700c08ff2d6285461dc5f4a68ccd36876a59b62131f847dc4be76a85989"}, + {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:f1041c44d1a4d1a43a324781795b03edfdfd8076c49a610c4dd384c86f2a6236"}, + {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_arm64.whl", hash 
= "sha256:efb0e9678fe07640bd9b6dc07651eaf1f8e5d5602e379b4cf78dbcddc62b50e9"}, + {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:f8d46922d74339ec7fd7a6933220ebc36b2ff39738ad9bb74ea55a198dd31b2f"}, + {file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2872de101ce6d262f57afd3f4d68452064c214c5ab001b7ac698a948e0725314"}, + {file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187da7dabc11317badbf6983ee508c367299eb39ed78938623206be6b21e41bd"}, ] [package.dependencies] @@ -960,20 +960,24 @@ name = "docling-core" version = "1.3.0" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_core-1.3.0-py3-none-any.whl", hash = "sha256:31779b9a5cce7e925d01d3b78fa8a835c531fa74646205ae2a8721f534eb8b27"}, - {file = "docling_core-1.3.0.tar.gz", hash = "sha256:beb55fb0018c912209bdf12958e4cf5a6c8bbe73fd097d03da25fc3979260fab"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -json-schema-for-humans = ">=1.0.0,<2.0.0" -jsonref = ">=1.1.0,<2.0.0" -jsonschema = ">=4.16.0,<5.0.0" -pandas = ">=2.2.2,<3.0.0" -pydantic = ">=2.6.0,<3.0.0" -pyproject-toml = ">=0.0.10,<0.0.11" -tabulate = ">=0.9.0,<0.10.0" +json-schema-for-humans = "^1.0.0" +jsonref = "^1.1.0" +jsonschema = "^4.16.0" +pandas = "^2.2.2" +pydantic = "^2.6.0" +pyproject-toml = "^0.0.10" +tabulate = "^0.9.0" + +[package.source] +type = "git" +url = "https://github.com/DS4SD/docling-core.git" +reference = "feat-table-exports" +resolved_reference = "66926e9c0ae476495cc51f5d05ee33ad357bc820" [[package]] name = "docling-ibm-models" @@ -7228,4 +7232,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = 
"f50b5f6158b688cc25f80253e3cec8e60d852d66a90fe8eb96798ea3c2372019" +content-hash = "9027f67f4847db1dc8384e4083e00c95c79cbe92999f7e0dd11c5fe50861294a" diff --git a/pyproject.toml b/pyproject.toml index 1a2c0a0a..b9b00a44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ packages = [{include = "docling"}] [tool.poetry.dependencies] python = "^3.10" pydantic = "^2.0.0" -docling-core = "^1.3.0" +docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "feat-table-exports"} docling-ibm-models = "^1.1.7" deepsearch-glm = "^0.21.0" filetype = "^1.2.0"