From 817a480038b98e4379372ade01088cf80ff8651d Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 3 Feb 2025 08:21:57 +0100 Subject: [PATCH] pin new docling-core and exploit it via assembler changes Signed-off-by: Michele Dolfi --- docling/utils/glm_utils.py | 4 +++ poetry.lock | 66 +++++++++++++++++++++++--------------- pyproject.toml | 2 +- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py index da29cdd1..b03a0e7b 100644 --- a/docling/utils/glm_utils.py +++ b/docling/utils/glm_utils.py @@ -307,6 +307,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: current_list = None doc.add_code(text=text, prov=prov) + elif label == DocItemLabel.FORMULA: + current_list = None + + doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov) else: current_list = None diff --git a/poetry.lock b/poetry.lock index e4882a71..f9dee268 100644 --- a/poetry.lock +++ b/poetry.lock @@ -182,8 +182,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -864,28 +864,33 @@ name = "docling-core" version = "2.16.1" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_core-2.16.1-py3-none-any.whl", hash = "sha256:d26af2f49e9f1f65ae5dfca972e206860339c1f91adfe427fa67d1cf95cce241"}, - {file = "docling_core-2.16.1.tar.gz", hash = "sha256:676f51fa5797c91a86ccbc1fdaa020effcde4cc86aa9b094a0d5d775636871ba"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -jsonref = ">=1.1.0,<2.0.0" -jsonschema = ">=4.16.0,<5.0.0" -pandas = ">=2.1.4,<3.0.0" -pillow = ">=10.3.0,<11.0.0" -pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0" +jsonref = "^1.1.0" +jsonschema = "^4.16.0" +latex2mathml = "^3.77.0" +pandas = "^2.1.4" +pillow = "^10.3.0" +pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2" pyyaml = ">=5.1,<7.0.0" -semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""} -tabulate = ">=0.9.0,<0.10.0" -transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""} -typer = ">=0.12.5,<0.13.0" -typing-extensions = ">=4.12.2,<5.0.0" +semchunk = {version = "^2.2.0", optional = true} +tabulate = "^0.9.0" +transformers = {version = "^4.34.0", optional = true} +typer = "^0.12.5" +typing-extensions = "^4.12.2" [package.extras] chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] +[package.source] +type = "git" +url = "https://github.com/DS4SD/docling-core" +reference = "feat-formula-as-img" +resolved_reference = "65519b08ca1301d783097a6d70186ebd4be9561a" + [[package]] name = "docling-ibm-models" version = "3.3.0" @@ -2167,6 +2172,17 @@ requests-toolbelt = ">=1.0.0,<2.0.0" [package.extras] langsmith-pyo3 = ["langsmith-pyo3 (>=0.1.0rc2,<0.2.0)"] +[[package]] +name = "latex2mathml" +version = "3.77.0" +description = "Pure Python library for LaTeX to MathML conversion" +optional = false +python-versions = ">=3.8.1,<4.0.0" +files = [ + {file = "latex2mathml-3.77.0-py3-none-any.whl", hash = "sha256:5531e18a2a9eae7c24e257118b6a444cbba253cd27ff3e81f1bd6c41e88e786e"}, + {file = "latex2mathml-3.77.0.tar.gz", hash = "sha256:e2f501d1878f2e489c3f6f12786bef74c62f712d2770f7f3c837eb20a55d0a1e"}, +] + [[package]] name = "lazy-loader" version = "0.4" @@ -2817,8 +2833,8 @@ files = [ [package.dependencies] multiprocess = [ - {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, + {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, ] pygments = ">=2.0" pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} @@ -3833,10 +3849,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, ] @@ -3859,10 +3875,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, ] @@ -4048,9 +4064,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4814,8 +4830,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -7825,4 +7841,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "336970505f4bae6b21f4cf358ebf6b5ef4fa42a4980358297e63bfea381b350a" +content-hash = "f90b6e8c1654f62ddbc3efd702176c9a39f0b8c48e3addd4c887e6395eda095f" diff --git a/pyproject.toml b/pyproject.toml index 6e3343f3..34df62e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ packages = [{include = "docling"}] ###################### python = "^3.9" pydantic = "^2.0.0" -docling-core = {version = "^2.16.1", extras = ["chunking"]} +docling-core = {git = "https://github.com/DS4SD/docling-core", rev = "feat-formula-as-img", extras = ["chunking"]} docling-ibm-models = "^3.3.0" deepsearch-glm = "^1.0.0" docling-parse = "^3.1.0"