pin models and core, and adapt example

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-16 10:57:05 +02:00
parent df3ff47914
commit dd2982cce1
3 changed files with 25 additions and 129 deletions

View File

@ -3,7 +3,7 @@ from pathlib import Path
from typing import Any, Iterable from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem from docling_core.types.doc.document import PictureClassificationData, PictureItem, PictureClassificationClass
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
@ -38,8 +38,7 @@ class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
element.data.classification = PictureClassificationData( element.data.classification = PictureClassificationData(
provenance="example_classifier-0.0.1", provenance="example_classifier-0.0.1",
predicted_class="dummy", predicted_classes=[PictureClassificationClass(class_name="dummy", confidence=0.42)]
confidence=0.42,
) )
yield element yield element

143
poetry.lock generated
View File

@ -798,25 +798,6 @@ files = [
docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] docs = ["ipython", "matplotlib", "numpydoc", "sphinx"]
tests = ["pytest", "pytest-cov", "pytest-xdist"] tests = ["pytest", "pytest-cov", "pytest-xdist"]
[[package]]
name = "dataclasses-json"
version = "0.5.9"
description = "Easily serialize dataclasses to and from JSON"
optional = false
python-versions = ">=3.6"
files = [
{file = "dataclasses-json-0.5.9.tar.gz", hash = "sha256:e9ac87b73edc0141aafbce02b44e93553c3123ad574958f0fe52a534b6707e8e"},
{file = "dataclasses_json-0.5.9-py3-none-any.whl", hash = "sha256:1280542631df1c375b7bc92e5b86d39e06c44760d7e3571a537b3b8acabf2f0c"},
]
[package.dependencies]
marshmallow = ">=3.3.0,<4.0.0"
marshmallow-enum = ">=1.5.1,<2.0.0"
typing-inspect = ">=0.4.0"
[package.extras]
dev = ["flake8", "hypothesis", "ipython", "mypy (>=0.710)", "portray", "pytest (>=7.2.0)", "setuptools", "simplejson", "twine", "types-dataclasses", "wheel"]
[[package]] [[package]]
name = "datasets" name = "datasets"
version = "2.21.0" version = "2.21.0"
@ -917,7 +898,7 @@ files = []
develop = false develop = false
[package.dependencies] [package.dependencies]
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"} docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "33aa21408400c9c475db0f8c6be681b888388284"}
docutils = "!=0.21" docutils = "!=0.21"
matplotlib = "^3.7.1" matplotlib = "^3.7.1"
networkx = "^3.1" networkx = "^3.1"
@ -941,8 +922,8 @@ toolkit = ["deepsearch-toolkit (>=0.31.0)"]
[package.source] [package.source]
type = "git" type = "git"
url = "https://github.com/DS4SD/deepsearch-glm.git" url = "https://github.com/DS4SD/deepsearch-glm.git"
reference = "c185c4f985ccd29a470a1cddd3bec43880b739ee" reference = "8ab1b4372122c820a28badd3c6095c2ce2feaf61"
resolved_reference = "c185c4f985ccd29a470a1cddd3bec43880b739ee" resolved_reference = "8ab1b4372122c820a28badd3c6095c2ce2feaf61"
[[package]] [[package]]
name = "defusedxml" name = "defusedxml"
@ -991,7 +972,6 @@ files = []
develop = false develop = false
[package.dependencies] [package.dependencies]
json-schema-for-humans = "^1.0.0"
jsonref = "^1.1.0" jsonref = "^1.1.0"
jsonschema = "^4.16.0" jsonschema = "^4.16.0"
pandas = "^2.1.4" pandas = "^2.1.4"
@ -1002,29 +982,31 @@ tabulate = "^0.9.0"
[package.source] [package.source]
type = "git" type = "git"
url = "https://github.com/DS4SD/docling-core.git" url = "https://github.com/DS4SD/docling-core.git"
reference = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9" reference = "33aa21408400c9c475db0f8c6be681b888388284"
resolved_reference = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9" resolved_reference = "33aa21408400c9c475db0f8c6be681b888388284"
[[package]] [[package]]
name = "docling-ibm-models" name = "docling-ibm-models"
version = "2.0.0" version = "2.0.1"
description = "This package contains the AI models used by the Docling PDF conversion package" description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false optional = false
python-versions = "^3.10" python-versions = "<4.0,>=3.10"
files = [] files = [
develop = false {file = "docling_ibm_models-2.0.1-py3-none-any.whl", hash = "sha256:f81c6002b7e102aa79afb8287fce48872f27d1cffb088ea4d1fbebe490364a1d"},
{file = "docling_ibm_models-2.0.1.tar.gz", hash = "sha256:4fb0300022cfa0d0ac1fcbcb296c144e71ee9816654407f8a4d3a7b934f3065f"},
]
[package.dependencies] [package.dependencies]
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
jsonlines = "^3.1.0" jsonlines = ">=3.1.0,<4.0.0"
lxml = "^4.9.1" lxml = ">=4.9.1,<5.0.0"
mean_average_precision = "^2021.4.26.0" mean_average_precision = ">=2021.4.26.0,<2022.0.0.0"
numpy = [ numpy = [
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""}, {version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""},
] ]
opencv-python-headless = "^4.6.0.66" opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
Pillow = "^10.0.0" Pillow = ">=10.0.0,<11.0.0"
torch = [ torch = [
{version = ">=2.2.2,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=2.2.2,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""},
{version = ">=2.2.2,<2.3.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, {version = ">=2.2.2,<2.3.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""},
@ -1033,13 +1015,7 @@ torchvision = [
{version = ">=0,<1", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=0,<1", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""},
{version = ">=0.17.2,<0.18.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, {version = ">=0.17.2,<0.18.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""},
] ]
tqdm = "^4.64.0" tqdm = ">=4.64.0,<5.0.0"
[package.source]
type = "git"
url = "https://github.com/DS4SD/docling-ibm-models.git"
reference = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"
resolved_reference = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"
[[package]] [[package]]
name = "docling-parse" name = "docling-parse"
@ -1593,16 +1569,6 @@ files = [
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
] ]
[[package]]
name = "htmlmin"
version = "0.1.12"
description = "An HTML Minifier"
optional = false
python-versions = "*"
files = [
{file = "htmlmin-0.1.12.tar.gz", hash = "sha256:50c1ef4630374a5d723900096a961cff426dff46b48f34d194a81bbe14eca178"},
]
[[package]] [[package]]
name = "httpcore" name = "httpcore"
version = "1.0.6" version = "1.0.6"
@ -2015,29 +1981,6 @@ files = [
{file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
] ]
[[package]]
name = "json-schema-for-humans"
version = "1.0.2"
description = "Generate static HTML documentation from JSON schemas"
optional = false
python-versions = "<4.0,>=3.8"
files = [
{file = "json_schema_for_humans-1.0.2-py3-none-any.whl", hash = "sha256:d6ecb023b4f802b10b01abca1295a37e363d9f060e54c21aa2cddea44731c6e1"},
{file = "json_schema_for_humans-1.0.2.tar.gz", hash = "sha256:8bd807a2bac31650226e451ad3b9583c27ce916375d6938ac9d0251eb6549ad5"},
]
[package.dependencies]
click = ">=8.0.1,<9.0.0"
dataclasses-json = ">=0.5.6,<0.6.0"
htmlmin = ">=0.1.12,<0.2.0"
Jinja2 = ">3"
markdown2 = ">=2.4.1,<3.0.0"
MarkupSafe = ">=2.0,<3.0"
Pygments = ">=2.10.0,<3.0.0"
pytz = "*"
PyYAML = ">=5.4.1,<7"
requests = ">=2.31.0,<3.0.0"
[[package]] [[package]]
name = "jsonlines" name = "jsonlines"
version = "3.1.0" version = "3.1.0"
@ -2671,23 +2614,6 @@ profiling = ["gprof2dot"]
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
[[package]]
name = "markdown2"
version = "2.5.1"
description = "A fast and complete Python implementation of Markdown"
optional = false
python-versions = "<4,>=3.8"
files = [
{file = "markdown2-2.5.1-py2.py3-none-any.whl", hash = "sha256:190ae60a4bd0425c60c863bede18a9f3d45b1cbf3fbc9f40b4fac336ff2c520b"},
{file = "markdown2-2.5.1.tar.gz", hash = "sha256:12fc04ea5a87f7bb4b65acf5bf3af1183b20838cc7d543b74c92ec7eea4bbc74"},
]
[package.extras]
all = ["latex2mathml", "pygments (>=2.7.3)", "wavedrom"]
code-syntax-highlighting = ["pygments (>=2.7.3)"]
latex = ["latex2mathml"]
wavedrom = ["wavedrom"]
[[package]] [[package]]
name = "markupsafe" name = "markupsafe"
version = "2.1.5" version = "2.1.5"
@ -2776,20 +2702,6 @@ dev = ["marshmallow[tests]", "pre-commit (>=3.5,<4.0)", "tox"]
docs = ["alabaster (==1.0.0)", "autodocsumm (==0.2.13)", "sphinx (==8.0.2)", "sphinx-issues (==4.1.0)", "sphinx-version-warning (==1.1.2)"] docs = ["alabaster (==1.0.0)", "autodocsumm (==0.2.13)", "sphinx (==8.0.2)", "sphinx-issues (==4.1.0)", "sphinx-version-warning (==1.1.2)"]
tests = ["pytest", "pytz", "simplejson"] tests = ["pytest", "pytz", "simplejson"]
[[package]]
name = "marshmallow-enum"
version = "1.5.1"
description = "Enum field for Marshmallow"
optional = false
python-versions = "*"
files = [
{file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"},
{file = "marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072"},
]
[package.dependencies]
marshmallow = ">=2.0.0"
[[package]] [[package]]
name = "matplotlib" name = "matplotlib"
version = "3.9.2" version = "3.9.2"
@ -3797,9 +3709,9 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
] ]
[[package]] [[package]]
@ -3948,8 +3860,8 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
] ]
python-dateutil = ">=2.8.2" python-dateutil = ">=2.8.2"
pytz = ">=2020.1" pytz = ">=2020.1"
@ -7001,21 +6913,6 @@ files = [
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
] ]
[[package]]
name = "typing-inspect"
version = "0.9.0"
description = "Runtime inspection utilities for typing module."
optional = false
python-versions = "*"
files = [
{file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"},
{file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"},
]
[package.dependencies]
mypy-extensions = ">=0.3.0"
typing-extensions = ">=3.7.4"
[[package]] [[package]]
name = "tzdata" name = "tzdata"
version = "2024.2" version = "2024.2"
@ -7599,4 +7496,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "8a545ce70eb2001e47c79c102a494aa42d8f5efee5dfbf3dfd0acfb3fb0f8ec9" content-hash = "70620592368cfa1a6a8a7e32e1f98f5f9f253f0d99f7a8bdfb6c46a0363b2408"

View File

@ -37,9 +37,9 @@ torchvision = [
###################### ######################
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"} docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "33aa21408400c9c475db0f8c6be681b888388284"}
docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"} docling-ibm-models = "^2.0.1"
deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "c185c4f985ccd29a470a1cddd3bec43880b739ee"} deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "8ab1b4372122c820a28badd3c6095c2ce2feaf61"}
filetype = "^1.2.0" filetype = "^1.2.0"
pypdfium2 = "^4.30.0" pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0" pydantic-settings = "^2.3.0"