From 578e30e23bc3497d1b1e51f6ceaebe29df310c93 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 22 Oct 2024 14:34:38 +0200 Subject: [PATCH] Update to docling-core v2.1.0 Signed-off-by: Christoph Auer --- poetry.lock | 35 ++++++++++--------- pyproject.toml | 3 +- .../docling_v2/2305.03393v1-pg9.md | 1 + tests/test_backend_asciidoc.py | 2 -- tests/test_e2e_conversion.py | 4 +-- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2a39f2c0..b43be70c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -894,26 +894,22 @@ files = [ [[package]] name = "docling-core" -version = "2.0.1" +version = "2.1.0" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "^3.9" -files = [] -develop = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "docling_core-2.1.0-py3-none-any.whl", hash = "sha256:4ccf9c44f8d7cf663657283baea4c4a36e1c4d1fba7df6b70ebc2c16b58f11a4"}, + {file = "docling_core-2.1.0.tar.gz", hash = "sha256:76ba3cb0a912db712aa89618746d279f1276b943c259dcf9d0b335a30cf7c99e"}, +] [package.dependencies] -jsonref = "^1.1.0" -jsonschema = "^4.16.0" -pandas = "^2.1.4" -pillow = "^10.3.0" -pydantic = "^2.6.0" -tabulate = "^0.9.0" - -[package.source] -type = "git" -url = "https://github.com/DS4SD/docling-core.git" -reference = "dev/add-to_indented_text" -resolved_reference = "bfcd07f0904e1b5afb5c31b3965be3221a74fe02" +jsonref = ">=1.1.0,<2.0.0" +jsonschema = ">=4.16.0,<5.0.0" +pandas = ">=2.1.4,<3.0.0" +pillow = ">=10.3.0,<11.0.0" +pydantic = ">=2.6.0,<3.0.0" +tabulate = ">=0.9.0,<0.10.0" [[package]] name = "docling-ibm-models" @@ -5583,6 +5579,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -7162,4 +7163,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2c70610e1dbc3f4578e73fd4cef6c0018caed6c4018d9ab67df478bc6558394b" +content-hash = "d62a5e631775394405a311a6085989175539e68cb24f456f9559e507473ec9b7" diff --git a/pyproject.toml b/pyproject.toml index 05f22043..2284d7c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,8 +37,7 @@ torchvision = [ ###################### python = "^3.10" pydantic = "^2.0.0" -#docling-core = "^2.0.1" -docling-core = { git = "https://github.com/DS4SD/docling-core.git", branch = "dev/add-to_indented_text" } +docling-core = "^2.1.0" docling-ibm-models = "^2.0.1" deepsearch-glm = "^0.26.1" filetype = "^1.2.0" diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.md b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.md index 48a5fe4a..45466f7d 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.md +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.md @@ -5,6 +5,7 @@ order to compute the TED score. Inference timing results for all experiments wer We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML. Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart. + | # | # | Language | TEDs | TEDs | TEDs | mAP | Inference | |------------|------------|------------|-------------|-------------|-------------|-------------|-------------| | enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) | diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index 44442cc2..ab94d58c 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -51,6 +51,4 @@ def test_asciidocs_examples(): # print("\n\n", doc.export_to_markdown()) - input("continue") - assert True diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 1e166116..b8d20bb2 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -11,8 +11,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE_V1 = False -GENERATE_V2 = False +GENERATE_V1 = True +GENERATE_V2 = True def get_pdf_paths():