mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Update to docling-core v2.1.0
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
b1a2af6d39
commit
578e30e23b
35
poetry.lock
generated
35
poetry.lock
generated
@ -894,26 +894,22 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "2.0.1"
|
||||
version = "2.1.0"
|
||||
description = "A python library to define and validate data types in Docling."
|
||||
optional = false
|
||||
python-versions = "^3.9"
|
||||
files = []
|
||||
develop = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "docling_core-2.1.0-py3-none-any.whl", hash = "sha256:4ccf9c44f8d7cf663657283baea4c4a36e1c4d1fba7df6b70ebc2c16b58f11a4"},
|
||||
{file = "docling_core-2.1.0.tar.gz", hash = "sha256:76ba3cb0a912db712aa89618746d279f1276b943c259dcf9d0b335a30cf7c99e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
jsonref = "^1.1.0"
|
||||
jsonschema = "^4.16.0"
|
||||
pandas = "^2.1.4"
|
||||
pillow = "^10.3.0"
|
||||
pydantic = "^2.6.0"
|
||||
tabulate = "^0.9.0"
|
||||
|
||||
[package.source]
|
||||
type = "git"
|
||||
url = "https://github.com/DS4SD/docling-core.git"
|
||||
reference = "dev/add-to_indented_text"
|
||||
resolved_reference = "bfcd07f0904e1b5afb5c31b3965be3221a74fe02"
|
||||
jsonref = ">=1.1.0,<2.0.0"
|
||||
jsonschema = ">=4.16.0,<5.0.0"
|
||||
pandas = ">=2.1.4,<3.0.0"
|
||||
pillow = ">=10.3.0,<11.0.0"
|
||||
pydantic = ">=2.6.0,<3.0.0"
|
||||
tabulate = ">=0.9.0,<0.10.0"
|
||||
|
||||
[[package]]
|
||||
name = "docling-ibm-models"
|
||||
@ -5583,6 +5579,11 @@ files = [
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
|
||||
@ -7162,4 +7163,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "2c70610e1dbc3f4578e73fd4cef6c0018caed6c4018d9ab67df478bc6558394b"
|
||||
content-hash = "d62a5e631775394405a311a6085989175539e68cb24f456f9559e507473ec9b7"
|
||||
|
@ -37,8 +37,7 @@ torchvision = [
|
||||
######################
|
||||
python = "^3.10"
|
||||
pydantic = "^2.0.0"
|
||||
#docling-core = "^2.0.1"
|
||||
docling-core = { git = "https://github.com/DS4SD/docling-core.git", branch = "dev/add-to_indented_text" }
|
||||
docling-core = "^2.1.0"
|
||||
docling-ibm-models = "^2.0.1"
|
||||
deepsearch-glm = "^0.26.1"
|
||||
filetype = "^1.2.0"
|
||||
|
@ -5,6 +5,7 @@ order to compute the TED score. Inference timing results for all experiments wer
|
||||
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.
|
||||
|
||||
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
|
||||
|
||||
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|
||||
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
|
||||
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
|
||||
|
@ -51,6 +51,4 @@ def test_asciidocs_examples():
|
||||
|
||||
# print("\n\n", doc.export_to_markdown())
|
||||
|
||||
input("continue")
|
||||
|
||||
assert True
|
||||
|
@ -11,8 +11,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
GENERATE_V1 = False
|
||||
GENERATE_V2 = False
|
||||
GENERATE_V1 = True
|
||||
GENERATE_V2 = True
|
||||
|
||||
|
||||
def get_pdf_paths():
|
||||
|
Loading…
Reference in New Issue
Block a user