feat: export document pages as multimodal output (#54)

* feat: export document pages as multimodal output

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* create a single parquet output

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add loading into HF datasets library

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* renaming

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* cleanup

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-09-03 15:05:35 +02:00
committed by GitHub
parent 69e5d951a3
commit 1de2e4f924
5 changed files with 1025 additions and 7 deletions

View File

@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
-docling-core = "^1.1.2"
+docling-core = "^1.1.3"
docling-ibm-models = "^1.1.3"
deepsearch-glm = "^0.19.1"
filetype = "^1.2.0"
@@ -36,6 +36,7 @@ docling-parse = "^1.1.3"
certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = "^1.14.1"
+pyarrow = "^17.0.0"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -51,6 +52,10 @@ types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
+[tool.poetry.group.examples.dependencies]
+datasets = "^2.21.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"