Update to docling-core v2.1.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2024-10-22 14:34:38 +02:00
parent b1a2af6d39
commit 578e30e23b
5 changed files with 22 additions and 23 deletions

poetry.lock (generated)

@@ -894,26 +894,22 @@ files = [
 [[package]]
 name = "docling-core"
-version = "2.0.1"
+version = "2.1.0"
 description = "A python library to define and validate data types in Docling."
 optional = false
-python-versions = "^3.9"
-files = []
-develop = false
+python-versions = "<4.0,>=3.9"
+files = [
+    {file = "docling_core-2.1.0-py3-none-any.whl", hash = "sha256:4ccf9c44f8d7cf663657283baea4c4a36e1c4d1fba7df6b70ebc2c16b58f11a4"},
+    {file = "docling_core-2.1.0.tar.gz", hash = "sha256:76ba3cb0a912db712aa89618746d279f1276b943c259dcf9d0b335a30cf7c99e"},
+]
 [package.dependencies]
-jsonref = "^1.1.0"
-jsonschema = "^4.16.0"
-pandas = "^2.1.4"
-pillow = "^10.3.0"
-pydantic = "^2.6.0"
-tabulate = "^0.9.0"
-[package.source]
-type = "git"
-url = "https://github.com/DS4SD/docling-core.git"
-reference = "dev/add-to_indented_text"
-resolved_reference = "bfcd07f0904e1b5afb5c31b3965be3221a74fe02"
+jsonref = ">=1.1.0,<2.0.0"
+jsonschema = ">=4.16.0,<5.0.0"
+pandas = ">=2.1.4,<3.0.0"
+pillow = ">=10.3.0,<11.0.0"
+pydantic = ">=2.6.0,<3.0.0"
+tabulate = ">=0.9.0,<0.10.0"
 [[package]]
 name = "docling-ibm-models"
@@ -5583,6 +5579,11 @@ files = [
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
     {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
     {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -7162,4 +7163,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "2c70610e1dbc3f4578e73fd4cef6c0018caed6c4018d9ab67df478bc6558394b"
+content-hash = "d62a5e631775394405a311a6085989175539e68cb24f456f9559e507473ec9b7"

pyproject.toml

@@ -37,8 +37,7 @@ torchvision = [
 ######################
 python = "^3.10"
 pydantic = "^2.0.0"
-#docling-core = "^2.0.1"
-docling-core = { git = "https://github.com/DS4SD/docling-core.git", branch = "dev/add-to_indented_text" }
+docling-core = "^2.1.0"
 docling-ibm-models = "^2.0.1"
 deepsearch-glm = "^0.26.1"
 filetype = "^1.2.0"
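With the lock updated, the dependency declaration itself moves from a git branch back to a released caret constraint. As a small illustration (assuming the widely used packaging library is available; the variable names are made up), Poetry's ^2.1.0 in pyproject.toml corresponds to the explicit range >=2.1.0,<3.0.0 recorded in poetry.lock:

# Illustration: the caret constraint "^2.1.0" is equivalent to ">=2.1.0,<3.0.0".
from packaging.specifiers import SpecifierSet

caret_as_range = SpecifierSet(">=2.1.0,<3.0.0")

for candidate in ("2.0.1", "2.1.0", "2.4.7", "3.0.0"):
    # Only 2.1.0 and later 2.x releases satisfy the constraint.
    print(candidate, "allowed" if candidate in caret_as_range else "rejected")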

Groundtruth Markdown export (file name not shown in this view)

@@ -5,6 +5,7 @@ order to compute the TED score. Inference timing results for all experiments wer
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
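The hunk above touches a regenerated groundtruth file: a Markdown export of the OTSL paper produced by docling itself, which changes slightly when the export code from docling-core 2.1.0 is used. Purely as a sketch, and assuming the v2 conversion API (DocumentConverter.convert followed by export_to_markdown) is what produces these files, regenerating such a groundtruth document might look like this; the function name and paths are placeholders:

# Sketch only: regenerate a Markdown groundtruth file from a source PDF.
# Assumes docling's DocumentConverter and DoclingDocument.export_to_markdown();
# the paths are placeholders, not the repository's actual test data layout.
from pathlib import Path

from docling.document_converter import DocumentConverter

def regenerate_markdown_groundtruth(pdf_path: Path, out_path: Path) -> None:
    converter = DocumentConverter()
    result = converter.convert(pdf_path)      # run the conversion pipeline
    out_path.write_text(result.document.export_to_markdown(), encoding="utf-8")

# Example call (hypothetical paths):
# regenerate_markdown_groundtruth(Path("paper.pdf"), Path("paper.md"))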

AsciiDoc backend test (file name not shown in this view)

@@ -51,6 +51,4 @@ def test_asciidocs_examples():
# print("\n\n", doc.export_to_markdown())
input("continue")
assert True

End-to-end conversion test (file name not shown in this view)

@@ -11,8 +11,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
-GENERATE_V1 = False
-GENERATE_V2 = False
+GENERATE_V1 = True
+GENERATE_V2 = True
 def get_pdf_paths():
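Flipping GENERATE_V1 and GENERATE_V2 to True presumably switches these tests from verifying conversion results against stored groundtruth to rewriting that groundtruth, which would explain the regenerated Markdown hunk above; one would normally expect the flags to return to False so that CI verifies rather than regenerates. A minimal sketch of that generate-vs-verify pattern, assuming a helper roughly along these lines (this is not the actual verify_utils implementation):

# Minimal sketch of a generate-vs-verify helper; names and behavior are assumptions.
from pathlib import Path

def verify_or_generate(actual_md: str, gt_path: Path, generate: bool) -> bool:
    if generate:
        # Regeneration mode: overwrite the stored groundtruth with the new export.
        gt_path.write_text(actual_md, encoding="utf-8")
        return True
    # Verification mode: compare the new export against the stored groundtruth.
    return actual_md == gt_path.read_text(encoding="utf-8")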