Rebase from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2024-12-16 11:26:24 +01:00
parent 8cb7d8327a 31184ad516
commit c020f2cba3
9 changed files with 114 additions and 149 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## [v2.12.0](https://github.com/DS4SD/docling/releases/tag/v2.12.0) - 2024-12-13
+
+### Feature
+
+* Introduce support for GPU Accelerators ([#593](https://github.com/DS4SD/docling/issues/593)) ([`19fad92`](https://github.com/DS4SD/docling/commit/19fad9261cb61f732a0426393866c8c1a9efbf4f))
+
 ## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12

 ### Feature
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,15 +1,17 @@
 import logging
 import os
+import warnings
 from enum import Enum
 from pathlib import Path
 from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union

-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic_settings import (
    BaseSettings,
    PydanticBaseSettingsSource,
    SettingsConfigDict,
 )
+from typing_extensions import deprecated

 _log = logging.getLogger(__name__)

@@ -134,14 +136,8 @@ class EasyOcrOptions(OcrOptions):

    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]
-    use_gpu: Annotated[
-        int,
-        Field(
-            deprecated="Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
-            "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
-            "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
-        ),
-    ] = True
+
+    use_gpu: Optional[bool] = None

    model_storage_directory: Optional[str] = None
    download_enabled: bool = True
@@ -216,8 +212,8 @@ class PipelineOptions(BaseModel):
    create_legacy_output: bool = (
        True  # This default will be set to False on a future version of docling
    )
-    accelerator_options: AcceleratorOptions = AcceleratorOptions()
    document_timeout: Optional[float] = None
+    accelerator_options: AcceleratorOptions = AcceleratorOptions()


 class PdfPipelineOptions(PipelineOptions):
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from typing import Iterable

 import numpy
@@ -41,16 +42,25 @@ class EasyOcrModel(BaseOcrModel):
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                )

-            use_gpu = False
-            if self.options.use_gpu:
+            if self.options.use_gpu is None:
                device = decide_device(accelerator_options.device)
                # Enable easyocr GPU if running on CUDA, MPS
                use_gpu = any(
-                    filter(
-                        lambda x: str(x).lower() in device,
-                        [AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value],
-                    )
+                    [
+                        device.startswith(x)
+                        for x in [
+                            AcceleratorDevice.CUDA.value,
+                            AcceleratorDevice.MPS.value,
+                        ]
+                    ]
                )
+            else:
+                warnings.warn(
+                    "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
+                    "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
+                    "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
+                )
+                use_gpu = self.options.use_gpu

            self.reader = easyocr.Reader(
                lang_list=self.options.lang,
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -17,7 +17,7 @@ from docling.datamodel.base_models import (
    Page,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
@@ -51,8 +51,13 @@ class LayoutModel(BasePageModel):

    def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
        device = decide_device(accelerator_options.device)
+
        self.layout_predictor = LayoutPredictor(
-            artifacts_path, device, accelerator_options.num_threads
+            artifact_path=str(artifacts_path),
+            device=device,
+            num_threads=accelerator_options.num_threads,
+            base_threshold=0.6,
+            blacklist_classes={"Form", "Key-Value Region"},
        )

    def draw_clusters_and_cells_side_by_side(
@@ -87,7 +92,6 @@ class LayoutModel(BasePageModel):
            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
            DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),  # Rusty orange
        }
-
        # Filter clusters for left and right images
        exclude_labels = {
            DocItemLabel.FORM,
@@ -96,7 +100,6 @@ class LayoutModel(BasePageModel):
        }
        left_clusters = [c for c in clusters if c.label not in exclude_labels]
        right_clusters = [c for c in clusters if c.label in exclude_labels]
-
        # Create a deep copy of the original image for both sides
        left_image = copy.deepcopy(page.image)
        right_image = copy.deepcopy(page.image)
@@ -104,14 +107,12 @@ class LayoutModel(BasePageModel):
        # Function to draw clusters on an image
        def draw_clusters(image, clusters):
            draw = ImageDraw.Draw(image, "RGBA")
-
            # Create a smaller font for the labels
            try:
                font = ImageFont.truetype("arial.ttf", 12)
            except OSError:
                # Fallback to default font if arial is not available
                font = ImageFont.load_default()
-
            for c_tl in clusters:
                all_clusters = [c_tl, *c_tl.children]
                for c in all_clusters:
@@ -124,7 +125,6 @@ class LayoutModel(BasePageModel):
                            outline=None,
                            fill=cell_color,
                        )
-
                    # Draw cluster rectangle
                    x0, y0, x1, y1 = c.bbox.as_tuple()
                    cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
@@ -134,10 +134,8 @@ class LayoutModel(BasePageModel):
                        outline=cluster_outline_color,
                        fill=cluster_fill_color,
                    )
-
                    # Add label name and confidence
                    label_text = f"{c.label.name} ({c.confidence:.2f})"
-
                    # Create semi-transparent background for text
                    text_bbox = draw.textbbox((x0, y0), label_text, font=font)
                    text_bg_padding = 2
@@ -154,7 +152,6 @@ class LayoutModel(BasePageModel):
                        ],
                        fill=(255, 255, 255, 180),  # Semi-transparent white
                    )
-
                    # Draw text
                    draw.text(
                        (x0, y0),
@@ -166,14 +163,12 @@ class LayoutModel(BasePageModel):
        # Draw clusters on both images
        draw_clusters(left_image, left_clusters)
        draw_clusters(right_image, right_clusters)
-
        # Combine the images side by side
        combined_width = left_image.width * 2
        combined_height = left_image.height
        combined_image = Image.new("RGB", (combined_width, combined_height))
        combined_image.paste(left_image, (0, 0))
        combined_image.paste(right_image, (left_image.width, 0))
-
        if show:
            combined_image.show()
        else:
@@ -182,7 +177,6 @@ class LayoutModel(BasePageModel):
                / f"debug_{conv_res.input.file.stem}"
            )
            out_path.mkdir(parents=True, exist_ok=True)
-
            out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
            combined_image.save(str(out_file), format="png")

@@ -217,93 +211,6 @@ class LayoutModel(BasePageModel):
                        )
                        clusters.append(cluster)

-                    # DEBUG code:
-                    def draw_clusters_and_cells(
-                        clusters, mode_prefix: str, show: bool = False
-                    ):
-                        label_to_color = {
-                            DocItemLabel.TEXT: (255, 255, 153),  # Light Yellow
-                            DocItemLabel.CAPTION: (255, 204, 153),  # Light Orange
-                            DocItemLabel.LIST_ITEM: (153, 153, 255),  # Light Purple
-                            DocItemLabel.FORMULA: (192, 192, 192),  # Gray
-                            DocItemLabel.TABLE: (255, 204, 204),  # Light Pink
-                            DocItemLabel.PICTURE: (255, 255, 204),  # Light Beige
-                            DocItemLabel.SECTION_HEADER: (255, 153, 153),  # Light Red
-                            DocItemLabel.PAGE_HEADER: (204, 255, 204),  # Light Green
-                            DocItemLabel.PAGE_FOOTER: (
-                                204,
-                                255,
-                                204,
-                            ),  # Light Green (same as Page-Header)
-                            DocItemLabel.TITLE: (
-                                255,
-                                153,
-                                153,
-                            ),  # Light Red (same as Section-Header)
-                            DocItemLabel.FOOTNOTE: (200, 200, 255),  # Light Blue
-                            DocItemLabel.DOCUMENT_INDEX: (220, 220, 220),  # Light Gray
-                            DocItemLabel.CODE: (255, 223, 186),  # Peach
-                            DocItemLabel.CHECKBOX_SELECTED: (
-                                255,
-                                182,
-                                193,
-                            ),  # Pale Green
-                            DocItemLabel.CHECKBOX_UNSELECTED: (
-                                255,
-                                182,
-                                193,
-                            ),  # Light Pink
-                            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
-                            DocItemLabel.KEY_VALUE_REGION: (
-                                183,
-                                65,
-                                14,
-                            ),  # Rusty orange
-                        }
-
-                        image = copy.deepcopy(page.image)
-                        if image is not None:
-                            draw = ImageDraw.Draw(image, "RGBA")
-                            for c in clusters:
-                                cell_color = (0, 0, 0, 40)
-                                for tc in c.cells:  # [:1]:
-                                    cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
-                                    draw.rectangle(
-                                        [(cx0, cy0), (cx1, cy1)],
-                                        outline=None,
-                                        fill=cell_color,
-                                    )
-
-                                x0, y0, x1, y1 = c.bbox.as_tuple()
-                                cluster_fill_color = (
-                                    *list(label_to_color.get(c.label)),  # type: ignore
-                                    70,
-                                )
-                                cluster_outline_color = (
-                                    *list(label_to_color.get(c.label)),  # type: ignore
-                                    255,
-                                )
-                                draw.rectangle(
-                                    [(x0, y0), (x1, y1)],
-                                    outline=cluster_outline_color,
-                                    fill=cluster_fill_color,
-                                )
-
-                            if show:
-                                image.show()
-                            else:
-                                out_path: Path = (
-                                    Path(settings.debug.debug_output_path)
-                                    / f"debug_{conv_res.input.file.stem}"
-                                )
-                                out_path.mkdir(parents=True, exist_ok=True)
-
-                                out_file = (
-                                    out_path
-                                    / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
-                                )
-                                image.save(str(out_file), format="png")
-
                    if settings.debug.visualize_raw_layout:
                        self.draw_clusters_and_cells_side_by_side(
                            conv_res, page, clusters, mode_prefix="raw"
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -10,6 +10,7 @@ from PIL import ImageDraw
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
    AcceleratorOptions,
    TableFormerMode,
    TableStructureOptions,
@@ -44,6 +45,10 @@ class TableStructureModel(BasePageModel):

            device = decide_device(accelerator_options.device)

+            # Disable MPS here, until we know why it makes things slower.
+            if device == AcceleratorDevice.MPS.value:
+                device = AcceleratorDevice.CPU.value
+
            self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
            self.tm_config["model"]["save_dir"] = artifacts_path
            self.tm_model_type = self.tm_config["model"]["type"]
--- a/docling/utils/accelerator_utils.py
+++ b/docling/utils/accelerator_utils.py
@@ -21,9 +21,11 @@ def decide_device(accelerator_device: AcceleratorDevice) -> str:
    has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()

    if accelerator_device == AcceleratorDevice.AUTO:
-        # TODO: Enable MPS later
        if has_cuda:
            device = f"cuda:{cuda_index}"
+        elif has_mps:
+            device = "mps"
+
    else:
        if accelerator_device == AcceleratorDevice.CUDA:
            if has_cuda:
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -74,7 +74,7 @@ def main():
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
-    pipeline_options.ocr_options.lang = "es"
+    pipeline_options.ocr_options.lang = ["es"]
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4, device=Device.AUTO
    )
--- a/poetry.lock
+++ b/poetry.lock
@@ -888,41 +888,40 @@ files = [

 [[package]]
 name = "docling-core"
-version = "2.9.0"
+version = "2.10.0"
 description = "A python library to define and validate data types in Docling."
 optional = false
-python-versions = "^3.9"
-files = []
-develop = false
+python-versions = "<4.0,>=3.9"
+files = [
+    {file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"},
+    {file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"},
+]

 [package.dependencies]
-jsonref = "^1.1.0"
-jsonschema = "^4.16.0"
-pandas = "^2.1.4"
-pillow = "^10.3.0"
-pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2"
+jsonref = ">=1.1.0,<2.0.0"
+jsonschema = ">=4.16.0,<5.0.0"
+pandas = ">=2.1.4,<3.0.0"
+pillow = ">=10.3.0,<11.0.0"
+pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0"
 pyyaml = ">=5.1,<7.0.0"
-tabulate = "^0.9.0"
-typing-extensions = "^4.12.2"
+semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""}
+tabulate = ">=0.9.0,<0.10.0"
+transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""}
+typer = ">=0.12.5,<0.13.0"
+typing-extensions = ">=4.12.2,<5.0.0"

 [package.extras]
 chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]

-[package.source]
-type = "git"
-url = "ssh://git@github.com/DS4SD/docling-core.git"
-reference = "improve-doc-item-typing"
-resolved_reference = "948a1c56caef3fe1770d7c5cdc61cbf9bc026113"
-
 [[package]]
 name = "docling-ibm-models"
-version = "3.0.0"
+version = "3.1.0"
 description = "This package contains the AI models used by the Docling PDF conversion package"
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_ibm_models-3.0.0-py3-none-any.whl", hash = "sha256:61d1bc3fc36fbec687533f543e2f899117bc19e5b31ab03520af4b84e1f7327c"},
-    {file = "docling_ibm_models-3.0.0.tar.gz", hash = "sha256:2a4c064c6a58cfce039e9574c52cb3cab7decd103e20e9c5ccb7834e7fa04d4f"},
+    {file = "docling_ibm_models-3.1.0-py3-none-any.whl", hash = "sha256:a381a45dff16fdb2246b99c15a2e3d6ba880c573d48a1d6477d3ffb36bab807f"},
+    {file = "docling_ibm_models-3.1.0.tar.gz", hash = "sha256:65d734ffa490edc4e2301d296b6e893afa536c63b7daae7bbda781bd15b3431e"},
 ]

 [package.dependencies]
@@ -2822,6 +2821,32 @@ files = [
    {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
 ]

+[[package]]
+name = "mpire"
+version = "2.10.2"
+description = "A Python package for easy multiprocessing, but faster than multiprocessing"
+optional = false
+python-versions = "*"
+files = [
+    {file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"},
+    {file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"},
+]
+
+[package.dependencies]
+multiprocess = [
+    {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
+    {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
+]
+pygments = ">=2.0"
+pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
+tqdm = ">=4.27"
+
+[package.extras]
+dashboard = ["flask"]
+dill = ["multiprocess", "multiprocess (>=0.70.15)"]
+docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"]
+testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"]
+
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@@ -3765,10 +3790,10 @@ files = [
 numpy = [
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]

 [[package]]
@@ -3791,10 +3816,10 @@ files = [
 numpy = [
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]

 [[package]]
@@ -3975,8 +4000,8 @@ files = [
 [package.dependencies]
 numpy = [
    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@@ -6132,6 +6157,21 @@ files = [
 cryptography = ">=2.0"
 jeepney = ">=0.6"

+[[package]]
+name = "semchunk"
+version = "2.2.0"
+description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"},
+    {file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"},
+]
+
+[package.dependencies]
+mpire = {version = "*", extras = ["dill"]}
+tqdm = "*"
+
 [[package]]
 name = "semver"
 version = "2.13.0"
@@ -7573,4 +7613,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c81c99b768cfca5c58c7d41c553110bad65b16c6f527c4d3892a916dffc47a05"
+content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.11.0"  # DO NOT EDIT, updated automatically
+version = "2.12.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -25,12 +25,11 @@ packages = [{include = "docling"}]
 # actual dependencies:
 ######################
 python = "^3.9"
-docling-ibm-models = "^3.0.0"
+docling-core = { version = "^2.10.0", extras = ["chunking"] }
+pydantic = "^2.0.0"
+docling-ibm-models = "^3.1.0"
 deepsearch-glm = "^1.0.0"
 docling-parse = "^3.0.0"
-#docling-core = { version = "^2.9.0", extras = ["chunking"] }
-docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "improve-doc-item-typing" }
-pydantic = "^2.0.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"