feat: expose docling-core table exporters and add examples

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-09-17 15:37:01 +02:00
parent 30a0ef69b4
commit a472f49d5d
4 changed files with 118 additions and 40 deletions

View File

@ -129,7 +129,7 @@ def generate_multimodal_pages(
}
if isinstance(item, Table):
table_html = _export_table_to_html(item)
table_html = item.export_to_html()
new_segment["data"].append(
{
"html_seq": table_html,

74
examples/export_tables.py Normal file
View File

@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple
import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def main():
    """Convert the sample PDF and export each of its tables as CSV and HTML.

    Every table of every successfully converted document is turned into a
    pandas DataFrame, printed to stdout as Markdown, and written to
    ``./scratch`` as ``<document stem>-table-<n>.csv`` and
    ``<document stem>-table-<n>.html``.

    Raises:
        RuntimeError: If at least one input document did not convert
            successfully.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export tables
        for table_ix, table in enumerate(conv_res.output.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            # NOTE: the printed heading uses the zero-based index, while the
            # file names below are one-based (table_ix + 1).
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as csv
            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            _log.info(f"Saving CSV table to {element_csv_filename}")
            table_df.to_csv(element_csv_filename)

            # Save the table as html
            element_html_filename = (
                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            )
            _log.info(f"Saving HTML table to {element_html_filename}")
            # Pin the encoding: the text-mode default is the platform locale,
            # which can fail on (or garble) non-ASCII table content.
            with element_html_filename.open("w", encoding="utf-8") as fp:
                fp.write(table.export_to_html())

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()

80
poetry.lock generated
View File

@ -854,35 +854,35 @@ files = [
[[package]]
name = "deepsearch-glm"
version = "0.21.0"
version = "0.21.1"
description = "Graph Language Models"
optional = false
python-versions = "<4.0,>=3.8"
files = [
{file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a07f9ee8b9532f2f02ce363fefd4622178552032e2de8e4f540cab16852b3d6d"},
{file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:27561630487dc331c30238f94b5f00b4e5e1359bcd120ee7e5d9f9e3b4d824a1"},
{file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:05bfde0f2bea6f235deb66093c1553248374914bc793f6f1823e632d2da1a625"},
{file = "deepsearch_glm-0.21.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ff5a7aa3ed12b1ad8f8cb290851d4ddeb87d3f486ac9a1e90f13d69ff40233e1"},
{file = "deepsearch_glm-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e721314ee7b9d7ce9303d38f4038e33758a55b004691e32a5821818c1f42aff"},
{file = "deepsearch_glm-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2605a90146336f9cff2659d90f3dc9eea52a08b72ef8da211323b197ae61c557"},
{file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:7fc8c07002ff8ade6deb1dc6d6bd3d07371433aa242cbc02c20349764b23269a"},
{file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:ba0cf927f4e9f2553e94349e29c07b4505b94deafa55cb65a19457ff83b8dc9e"},
{file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:e69001e9b64ea5d45fd3e03ec7612f531ebcb0eee6f574cbe4976598d78ede3c"},
{file = "deepsearch_glm-0.21.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:f8846f9228065a89a0438453b067815f7ac28753217912b944d28ca0d68fae6a"},
{file = "deepsearch_glm-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a7dcd0fe3911df1821e343946e24443126d3cf6e7b6c13d7dfdd437fdaf013a"},
{file = "deepsearch_glm-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86d97bc0c19672bee7723153b143f3e6f65d97497a22b4da19488aab5224f77f"},
{file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:d74caa7336d256c8091af4db99f2d5bd6f7cd9c9448b9a210e5512de67eaef54"},
{file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:82a32f996f398425d62de3681ff7568ef18b9e0a8900c52ac9a780341f169073"},
{file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:4c823e59e934716ccd93df309a37d56a7e75cffac9831cffb9a9a560b84feadc"},
{file = "deepsearch_glm-0.21.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b2c897bc2d57e0d21c86fe5706b5458d2a948e7016b26730f4e04a892f12d690"},
{file = "deepsearch_glm-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4273fbda4cb25e949776ff81f60dde2ae278cab94a67babe7fab024f98dce993"},
{file = "deepsearch_glm-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:117b0556e2a36582590d5627582265498bff66dae74658a67a02ef3a76956e34"},
{file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:2ba54efd3661d7196f0a9828ebf642cb21bc2bd0594915e0486bd50b2ec0632c"},
{file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:197ca6dc60330ff90c90dc85507899307353a2a0620db40ee825e632644c99c5"},
{file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:a49c63cef72c32b63a10fc85589721bf82580a8a42cfe0a5c901798f4d161fbf"},
{file = "deepsearch_glm-0.21.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:3802436d610c6e24a0ea9291a2cdee9c260fd4492438af08041ed460e3f92743"},
{file = "deepsearch_glm-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1dd273c950877fb40d538ba16724efdedb82f3c9f15f9fc4407b9d60a832490"},
{file = "deepsearch_glm-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9f0b0cbd2f773558f79d356603f1d49d913e52c8f9b5610b4603480df3c5804"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:b765d371ab0a4f57dd2532c651d7dc1b4a187395153e619a77b6f0d0f6aefb32"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:c69e055b98d0a22267a1d0b6139801aecc5b7386289b89f53f976ab723352728"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3eaa245e5ac4ab3e9d0c95a93e23f58d61d70f11431b76b6705fae358eb31c62"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63d195f6c5b30f4f908436589cffd4a5b9e18553c44c57fb635068a2afbd7fab"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c9296a2e417a30bf030de0c7c2e2cce4773c58bead039d5e6fccbf7deb2269"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:166b9958d3a8a98d0671a1e3fdf8083ded9ccf12c2ab80fb9709908a2cf81784"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:13bea2b4e8c04647ec743c3feb1ee66c784db542ab9dbed8dad7eb66fca74b70"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:c5b8b8e2207615ff99e535f00548c7b0b8e4ca4593e59edd83fcad98fc318284"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ba74868243caf5ac850fff7c45c8a372c1cac0193431e22eb41888d45ac79719"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7815b06aa1c3953488496f191ce0265d0ee7bed5a6b96454a5f9d6f1add28f69"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a7dd2a1e63cee47f6090ebfebc15f68d24f61d5f4f45a21f22120b2267798d"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d52bd2934a27fdc9db5f2d0713dbeec0c94e5c5843d29996e85d641a11498ad0"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:fd4d0d4ff853e566b05769c704a4ea3c050c0cfc5721e4e2035e550fb2a8fe91"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:802a59a8a3bea1801bce848d58d19fcdbbcea27d9e2c23f163419d13cdec2345"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1ead7958bc044000a8d43cce53c9b82be0d341b0ca5cf7b39a0c09f9c4fd8ceb"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:312cf2b0b6560c8dfe5331a5a80a0ed5cb409d29ee6cc999a81696774d50f5e7"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc3d6f6ca2cffbe5e112818c8aba9a783af8ab7cffff04624bfb5bf8d185b707"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc537d5e9d108233b7e7249c6739292dc9c36a0f39c11e7f430700df35ff884"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4db0a700c08ff2d6285461dc5f4a68ccd36876a59b62131f847dc4be76a85989"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:f1041c44d1a4d1a43a324781795b03edfdfd8076c49a610c4dd384c86f2a6236"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:efb0e9678fe07640bd9b6dc07651eaf1f8e5d5602e379b4cf78dbcddc62b50e9"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:f8d46922d74339ec7fd7a6933220ebc36b2ff39738ad9bb74ea55a198dd31b2f"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2872de101ce6d262f57afd3f4d68452064c214c5ab001b7ac698a948e0725314"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187da7dabc11317badbf6983ee508c367299eb39ed78938623206be6b21e41bd"},
]
[package.dependencies]
@ -960,20 +960,24 @@ name = "docling-core"
version = "1.3.0"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-1.3.0-py3-none-any.whl", hash = "sha256:31779b9a5cce7e925d01d3b78fa8a835c531fa74646205ae2a8721f534eb8b27"},
{file = "docling_core-1.3.0.tar.gz", hash = "sha256:beb55fb0018c912209bdf12958e4cf5a6c8bbe73fd097d03da25fc3979260fab"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
json-schema-for-humans = ">=1.0.0,<2.0.0"
jsonref = ">=1.1.0,<2.0.0"
jsonschema = ">=4.16.0,<5.0.0"
pandas = ">=2.2.2,<3.0.0"
pydantic = ">=2.6.0,<3.0.0"
pyproject-toml = ">=0.0.10,<0.0.11"
tabulate = ">=0.9.0,<0.10.0"
json-schema-for-humans = "^1.0.0"
jsonref = "^1.1.0"
jsonschema = "^4.16.0"
pandas = "^2.2.2"
pydantic = "^2.6.0"
pyproject-toml = "^0.0.10"
tabulate = "^0.9.0"
[package.source]
type = "git"
url = "https://github.com/DS4SD/docling-core.git"
reference = "feat-table-exports"
resolved_reference = "66926e9c0ae476495cc51f5d05ee33ad357bc820"
[[package]]
name = "docling-ibm-models"
@ -7228,4 +7232,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "f50b5f6158b688cc25f80253e3cec8e60d852d66a90fe8eb96798ea3c2372019"
content-hash = "9027f67f4847db1dc8384e4083e00c95c79cbe92999f7e0dd11c5fe50861294a"

View File

@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
docling-core = "^1.3.0"
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "feat-table-exports"}
docling-ibm-models = "^1.1.7"
deepsearch-glm = "^0.21.0"
filetype = "^1.2.0"