Update to docling-core v2.1.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2024-10-22 14:34:38 +02:00
parent b1a2af6d39
commit 578e30e23b
5 changed files with 22 additions and 23 deletions

poetry.lock (generated)

@@ -894,26 +894,22 @@ files = [
 [[package]]
 name = "docling-core"
-version = "2.0.1"
+version = "2.1.0"
 description = "A python library to define and validate data types in Docling."
 optional = false
-python-versions = "^3.9"
-files = []
-develop = false
+python-versions = "<4.0,>=3.9"
+files = [
+    {file = "docling_core-2.1.0-py3-none-any.whl", hash = "sha256:4ccf9c44f8d7cf663657283baea4c4a36e1c4d1fba7df6b70ebc2c16b58f11a4"},
+    {file = "docling_core-2.1.0.tar.gz", hash = "sha256:76ba3cb0a912db712aa89618746d279f1276b943c259dcf9d0b335a30cf7c99e"},
+]
 [package.dependencies]
-jsonref = "^1.1.0"
-jsonschema = "^4.16.0"
-pandas = "^2.1.4"
-pillow = "^10.3.0"
-pydantic = "^2.6.0"
-tabulate = "^0.9.0"
-[package.source]
-type = "git"
-url = "https://github.com/DS4SD/docling-core.git"
-reference = "dev/add-to_indented_text"
-resolved_reference = "bfcd07f0904e1b5afb5c31b3965be3221a74fe02"
+jsonref = ">=1.1.0,<2.0.0"
+jsonschema = ">=4.16.0,<5.0.0"
+pandas = ">=2.1.4,<3.0.0"
+pillow = ">=10.3.0,<11.0.0"
+pydantic = ">=2.6.0,<3.0.0"
+tabulate = ">=0.9.0,<0.10.0"
 [[package]]
 name = "docling-ibm-models"
@@ -5583,6 +5579,11 @@ files = [
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
     {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
     {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -7162,4 +7163,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "2c70610e1dbc3f4578e73fd4cef6c0018caed6c4018d9ab67df478bc6558394b"
+content-hash = "d62a5e631775394405a311a6085989175539e68cb24f456f9559e507473ec9b7"

pyproject.toml

@@ -37,8 +37,7 @@ torchvision = [
 ######################
 python = "^3.10"
 pydantic = "^2.0.0"
-#docling-core = "^2.0.1"
-docling-core = { git = "https://github.com/DS4SD/docling-core.git", branch = "dev/add-to_indented_text" }
+docling-core = "^2.1.0"
 docling-ibm-models = "^2.0.1"
 deepsearch-glm = "^0.26.1"
 filetype = "^1.2.0"
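With the lock updated, the dependency declaration itself moves from a git branch back to a released caret constraint. As a small illustration (assuming the widely used packaging library is available; the variable names are made up), Poetry's ^2.1.0 in pyproject.toml corresponds to the explicit range >=2.1.0,<3.0.0 recorded in poetry.lock:

# Illustration: the caret constraint "^2.1.0" is equivalent to ">=2.1.0,<3.0.0".
from packaging.specifiers import SpecifierSet

caret_as_range = SpecifierSet(">=2.1.0,<3.0.0")

for candidate in ("2.0.1", "2.1.0", "2.4.7", "3.0.0"):
    # Only 2.1.0 and later 2.x releases satisfy the constraint.
    print(candidate, "allowed" if candidate in caret_as_range else "rejected")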

Groundtruth Markdown export (file name not shown in this view)

@@ -5,6 +5,7 @@ order to compute the TED score. Inference timing results for all experiments wer
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
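The hunk above touches a regenerated groundtruth file: a Markdown export of the OTSL paper produced by docling itself, which changes slightly when the export code from docling-core 2.1.0 is used. Purely as a sketch, and assuming the v2 conversion API (DocumentConverter.convert followed by export_to_markdown) is what produces these files, regenerating such a groundtruth document might look like this; the function name and paths are placeholders:

# Sketch only: regenerate a Markdown groundtruth file from a source PDF.
# Assumes docling's DocumentConverter and DoclingDocument.export_to_markdown();
# the paths are placeholders, not the repository's actual test data layout.
from pathlib import Path

from docling.document_converter import DocumentConverter

def regenerate_markdown_groundtruth(pdf_path: Path, out_path: Path) -> None:
    converter = DocumentConverter()
    result = converter.convert(pdf_path)      # run the conversion pipeline
    out_path.write_text(result.document.export_to_markdown(), encoding="utf-8")

# Example call (hypothetical paths):
# regenerate_markdown_groundtruth(Path("paper.pdf"), Path("paper.md"))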

AsciiDoc backend test (file name not shown in this view)

@@ -51,6 +51,4 @@ def test_asciidocs_examples():
# print("\n\n", doc.export_to_markdown())
input("continue")
assert True

End-to-end conversion test (file name not shown in this view)

@@ -11,8 +11,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
-GENERATE_V1 = False
-GENERATE_V2 = False
+GENERATE_V1 = True
+GENERATE_V2 = True
 def get_pdf_paths():
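Flipping GENERATE_V1 and GENERATE_V2 to True presumably switches these tests from verifying conversion results against stored groundtruth to rewriting that groundtruth, which would explain the regenerated Markdown hunk above; one would normally expect the flags to return to False so that CI verifies rather than regenerates. A minimal sketch of that generate-vs-verify pattern, assuming a helper roughly along these lines (this is not the actual verify_utils implementation):

# Minimal sketch of a generate-vs-verify helper; names and behavior are assumptions.
from pathlib import Path

def verify_or_generate(actual_md: str, gt_path: Path, generate: bool) -> bool:
    if generate:
        # Regeneration mode: overwrite the stored groundtruth with the new export.
        gt_path.write_text(actual_md, encoding="utf-8")
        return True
    # Verification mode: compare the new export against the stored groundtruth.
    return actual_md == gt_path.read_text(encoding="utf-8")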