diff --git a/CHANGELOG.md b/CHANGELOG.md index a422acd5..ea993f70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## [v2.12.0](https://github.com/DS4SD/docling/releases/tag/v2.12.0) - 2024-12-13 + +### Feature + +* Introduce support for GPU Accelerators ([#593](https://github.com/DS4SD/docling/issues/593)) ([`19fad92`](https://github.com/DS4SD/docling/commit/19fad9261cb61f732a0426393866c8c1a9efbf4f)) + ## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12 ### Feature diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 6916a83f..cf1689da 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,15 +1,17 @@ import logging import os +import warnings from enum import Enum from pathlib import Path from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pydantic_settings import ( BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict, ) +from typing_extensions import deprecated _log = logging.getLogger(__name__) @@ -134,14 +136,8 @@ class EasyOcrOptions(OcrOptions): kind: Literal["easyocr"] = "easyocr" lang: List[str] = ["fr", "de", "es", "en"] - use_gpu: Annotated[ - int, - Field( - deprecated="Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. " - "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used " - "to run EasyOCR. Otherwise, EasyOCR runs in CPU." - ), - ] = True + + use_gpu: Optional[bool] = None model_storage_directory: Optional[str] = None download_enabled: bool = True @@ -216,8 +212,8 @@ class PipelineOptions(BaseModel): create_legacy_output: bool = ( True # This default will be set to False on a future version of docling ) - accelerator_options: AcceleratorOptions = AcceleratorOptions() document_timeout: Optional[float] = None + accelerator_options: AcceleratorOptions = AcceleratorOptions() class PdfPipelineOptions(PipelineOptions): diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 4387cd82..5de1409c 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -1,4 +1,5 @@ import logging +import warnings from typing import Iterable import numpy @@ -41,16 +42,25 @@ class EasyOcrModel(BaseOcrModel): "Alternatively, Docling has support for other OCR engines. See the documentation." ) - use_gpu = False - if self.options.use_gpu: + if self.options.use_gpu is None: device = decide_device(accelerator_options.device) # Enable easyocr GPU if running on CUDA, MPS use_gpu = any( - filter( - lambda x: str(x).lower() in device, - [AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value], - ) + [ + device.startswith(x) + for x in [ + AcceleratorDevice.CUDA.value, + AcceleratorDevice.MPS.value, + ] + ] ) + else: + warnings.warn( + "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. " + "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used " + "to run EasyOCR. Otherwise, EasyOCR runs in CPU." + ) + use_gpu = self.options.use_gpu self.reader = easyocr.Reader( lang_list=self.options.lang, diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 2caa3866..3dc83eba 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -17,7 +17,7 @@ from docling.datamodel.base_models import ( Page, ) from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import AcceleratorOptions +from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.utils.accelerator_utils import decide_device @@ -51,8 +51,13 @@ class LayoutModel(BasePageModel): def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions): device = decide_device(accelerator_options.device) + self.layout_predictor = LayoutPredictor( - artifacts_path, device, accelerator_options.num_threads + artifact_path=str(artifacts_path), + device=device, + num_threads=accelerator_options.num_threads, + base_threshold=0.6, + blacklist_classes={"Form", "Key-Value Region"}, ) def draw_clusters_and_cells_side_by_side( @@ -87,7 +92,6 @@ class LayoutModel(BasePageModel): DocItemLabel.FORM: (200, 255, 255), # Light Cyan DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange } - # Filter clusters for left and right images exclude_labels = { DocItemLabel.FORM, @@ -96,7 +100,6 @@ class LayoutModel(BasePageModel): } left_clusters = [c for c in clusters if c.label not in exclude_labels] right_clusters = [c for c in clusters if c.label in exclude_labels] - # Create a deep copy of the original image for both sides left_image = copy.deepcopy(page.image) right_image = copy.deepcopy(page.image) @@ -104,14 +107,12 @@ class LayoutModel(BasePageModel): # Function to draw clusters on an image def draw_clusters(image, clusters): draw = ImageDraw.Draw(image, "RGBA") - # Create a smaller font for the labels try: font = ImageFont.truetype("arial.ttf", 12) except OSError: # Fallback to default font if arial is not available font = ImageFont.load_default() - for c_tl in clusters: all_clusters = [c_tl, *c_tl.children] for c in all_clusters: @@ -124,7 +125,6 @@ class LayoutModel(BasePageModel): outline=None, fill=cell_color, ) - # Draw cluster rectangle x0, y0, x1, y1 = c.bbox.as_tuple() cluster_fill_color = (*list(label_to_color.get(c.label)), 70) @@ -134,10 +134,8 @@ class LayoutModel(BasePageModel): outline=cluster_outline_color, fill=cluster_fill_color, ) - # Add label name and confidence label_text = f"{c.label.name} ({c.confidence:.2f})" - # Create semi-transparent background for text text_bbox = draw.textbbox((x0, y0), label_text, font=font) text_bg_padding = 2 @@ -154,7 +152,6 @@ class LayoutModel(BasePageModel): ], fill=(255, 255, 255, 180), # Semi-transparent white ) - # Draw text draw.text( (x0, y0), @@ -166,14 +163,12 @@ class LayoutModel(BasePageModel): # Draw clusters on both images draw_clusters(left_image, left_clusters) draw_clusters(right_image, right_clusters) - # Combine the images side by side combined_width = left_image.width * 2 combined_height = left_image.height combined_image = Image.new("RGB", (combined_width, combined_height)) combined_image.paste(left_image, (0, 0)) combined_image.paste(right_image, (left_image.width, 0)) - if show: combined_image.show() else: @@ -182,7 +177,6 @@ class LayoutModel(BasePageModel): / f"debug_{conv_res.input.file.stem}" ) out_path.mkdir(parents=True, exist_ok=True) - out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png" combined_image.save(str(out_file), format="png") @@ -217,93 +211,6 @@ class LayoutModel(BasePageModel): ) clusters.append(cluster) - # DEBUG code: - def draw_clusters_and_cells( - clusters, mode_prefix: str, show: bool = False - ): - label_to_color = { - DocItemLabel.TEXT: (255, 255, 153), # Light Yellow - DocItemLabel.CAPTION: (255, 204, 153), # Light Orange - DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple - DocItemLabel.FORMULA: (192, 192, 192), # Gray - DocItemLabel.TABLE: (255, 204, 204), # Light Pink - DocItemLabel.PICTURE: (255, 255, 204), # Light Beige - DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red - DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green - DocItemLabel.PAGE_FOOTER: ( - 204, - 255, - 204, - ), # Light Green (same as Page-Header) - DocItemLabel.TITLE: ( - 255, - 153, - 153, - ), # Light Red (same as Section-Header) - DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue - DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray - DocItemLabel.CODE: (255, 223, 186), # Peach - DocItemLabel.CHECKBOX_SELECTED: ( - 255, - 182, - 193, - ), # Pale Green - DocItemLabel.CHECKBOX_UNSELECTED: ( - 255, - 182, - 193, - ), # Light Pink - DocItemLabel.FORM: (200, 255, 255), # Light Cyan - DocItemLabel.KEY_VALUE_REGION: ( - 183, - 65, - 14, - ), # Rusty orange - } - - image = copy.deepcopy(page.image) - if image is not None: - draw = ImageDraw.Draw(image, "RGBA") - for c in clusters: - cell_color = (0, 0, 0, 40) - for tc in c.cells: # [:1]: - cx0, cy0, cx1, cy1 = tc.bbox.as_tuple() - draw.rectangle( - [(cx0, cy0), (cx1, cy1)], - outline=None, - fill=cell_color, - ) - - x0, y0, x1, y1 = c.bbox.as_tuple() - cluster_fill_color = ( - *list(label_to_color.get(c.label)), # type: ignore - 70, - ) - cluster_outline_color = ( - *list(label_to_color.get(c.label)), # type: ignore - 255, - ) - draw.rectangle( - [(x0, y0), (x1, y1)], - outline=cluster_outline_color, - fill=cluster_fill_color, - ) - - if show: - image.show() - else: - out_path: Path = ( - Path(settings.debug.debug_output_path) - / f"debug_{conv_res.input.file.stem}" - ) - out_path.mkdir(parents=True, exist_ok=True) - - out_file = ( - out_path - / f"{mode_prefix}_layout_page_{page.page_no:05}.png" - ) - image.save(str(out_file), format="png") - if settings.debug.visualize_raw_layout: self.draw_clusters_and_cells_side_by_side( conv_res, page, clusters, mode_prefix="raw" diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 94f347c1..ba306449 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -10,6 +10,7 @@ from PIL import ImageDraw from docling.datamodel.base_models import Page, Table, TableStructurePrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( + AcceleratorDevice, AcceleratorOptions, TableFormerMode, TableStructureOptions, @@ -44,6 +45,10 @@ class TableStructureModel(BasePageModel): device = decide_device(accelerator_options.device) + # Disable MPS here, until we know why it makes things slower. + if device == AcceleratorDevice.MPS.value: + device = AcceleratorDevice.CPU.value + self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json") self.tm_config["model"]["save_dir"] = artifacts_path self.tm_model_type = self.tm_config["model"]["type"] diff --git a/docling/utils/accelerator_utils.py b/docling/utils/accelerator_utils.py index 4ed1cbf4..59b04796 100644 --- a/docling/utils/accelerator_utils.py +++ b/docling/utils/accelerator_utils.py @@ -21,9 +21,11 @@ def decide_device(accelerator_device: AcceleratorDevice) -> str: has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available() if accelerator_device == AcceleratorDevice.AUTO: - # TODO: Enable MPS later if has_cuda: device = f"cuda:{cuda_index}" + elif has_mps: + device = "mps" + else: if accelerator_device == AcceleratorDevice.CUDA: if has_cuda: diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 12893e22..a7efa975 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -74,7 +74,7 @@ def main(): pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.ocr_options.lang = "es" + pipeline_options.ocr_options.lang = ["es"] pipeline_options.accelerator_options = AcceleratorOptions( num_threads=4, device=Device.AUTO ) diff --git a/poetry.lock b/poetry.lock index cedeb768..c9ebd4f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -888,41 +888,40 @@ files = [ [[package]] name = "docling-core" -version = "2.9.0" +version = "2.10.0" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "^3.9" -files = [] -develop = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"}, + {file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"}, +] [package.dependencies] -jsonref = "^1.1.0" -jsonschema = "^4.16.0" -pandas = "^2.1.4" -pillow = "^10.3.0" -pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2" +jsonref = ">=1.1.0,<2.0.0" +jsonschema = ">=4.16.0,<5.0.0" +pandas = ">=2.1.4,<3.0.0" +pillow = ">=10.3.0,<11.0.0" +pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0" pyyaml = ">=5.1,<7.0.0" -tabulate = "^0.9.0" -typing-extensions = "^4.12.2" +semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""} +tabulate = ">=0.9.0,<0.10.0" +transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""} +typer = ">=0.12.5,<0.13.0" +typing-extensions = ">=4.12.2,<5.0.0" [package.extras] chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] -[package.source] -type = "git" -url = "ssh://git@github.com/DS4SD/docling-core.git" -reference = "improve-doc-item-typing" -resolved_reference = "948a1c56caef3fe1770d7c5cdc61cbf9bc026113" - [[package]] name = "docling-ibm-models" -version = "3.0.0" +version = "3.1.0" description = "This package contains the AI models used by the Docling PDF conversion package" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_ibm_models-3.0.0-py3-none-any.whl", hash = "sha256:61d1bc3fc36fbec687533f543e2f899117bc19e5b31ab03520af4b84e1f7327c"}, - {file = "docling_ibm_models-3.0.0.tar.gz", hash = "sha256:2a4c064c6a58cfce039e9574c52cb3cab7decd103e20e9c5ccb7834e7fa04d4f"}, + {file = "docling_ibm_models-3.1.0-py3-none-any.whl", hash = "sha256:a381a45dff16fdb2246b99c15a2e3d6ba880c573d48a1d6477d3ffb36bab807f"}, + {file = "docling_ibm_models-3.1.0.tar.gz", hash = "sha256:65d734ffa490edc4e2301d296b6e893afa536c63b7daae7bbda781bd15b3431e"}, ] [package.dependencies] @@ -2822,6 +2821,32 @@ files = [ {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"}, ] +[[package]] +name = "mpire" +version = "2.10.2" +description = "A Python package for easy multiprocessing, but faster than multiprocessing" +optional = false +python-versions = "*" +files = [ + {file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"}, + {file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"}, +] + +[package.dependencies] +multiprocess = [ + {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, + {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, +] +pygments = ">=2.0" +pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} +tqdm = ">=4.27" + +[package.extras] +dashboard = ["flask"] +dill = ["multiprocess", "multiprocess (>=0.70.15)"] +docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"] +testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"] + [[package]] name = "mpmath" version = "1.3.0" @@ -3765,10 +3790,10 @@ files = [ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3791,10 +3816,10 @@ files = [ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3975,8 +4000,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -6132,6 +6157,21 @@ files = [ cryptography = ">=2.0" jeepney = ">=0.6" +[[package]] +name = "semchunk" +version = "2.2.0" +description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks." +optional = false +python-versions = ">=3.9" +files = [ + {file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"}, + {file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"}, +] + +[package.dependencies] +mpire = {version = "*", extras = ["dill"]} +tqdm = "*" + [[package]] name = "semver" version = "2.13.0" @@ -7573,4 +7613,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c81c99b768cfca5c58c7d41c553110bad65b16c6f527c4d3892a916dffc47a05" +content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e" diff --git a/pyproject.toml b/pyproject.toml index 6e514ebb..f3d2efdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.11.0" # DO NOT EDIT, updated automatically +version = "2.12.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT" @@ -25,12 +25,11 @@ packages = [{include = "docling"}] # actual dependencies: ###################### python = "^3.9" -docling-ibm-models = "^3.0.0" +docling-core = { version = "^2.10.0", extras = ["chunking"] } +pydantic = "^2.0.0" +docling-ibm-models = "^3.1.0" deepsearch-glm = "^1.0.0" docling-parse = "^3.0.0" -#docling-core = { version = "^2.9.0", extras = ["chunking"] } -docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "improve-doc-item-typing" } -pydantic = "^2.0.0" filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0"