Rebase from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-12-16 11:26:24 +01:00
commit c020f2cba3
9 changed files with 114 additions and 149 deletions

View File

@ -1,3 +1,9 @@
## [v2.12.0](https://github.com/DS4SD/docling/releases/tag/v2.12.0) - 2024-12-13
### Feature
* Introduce support for GPU Accelerators ([#593](https://github.com/DS4SD/docling/issues/593)) ([`19fad92`](https://github.com/DS4SD/docling/commit/19fad9261cb61f732a0426393866c8c1a9efbf4f))
## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12
### Feature

View File

@ -1,15 +1,17 @@
import logging
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)
from typing_extensions import deprecated
_log = logging.getLogger(__name__)
@ -134,14 +136,8 @@ class EasyOcrOptions(OcrOptions):
kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: Annotated[
int,
Field(
deprecated="Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
),
] = True
use_gpu: Optional[bool] = None
model_storage_directory: Optional[str] = None
download_enabled: bool = True
@ -216,8 +212,8 @@ class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This default will be set to False on a future version of docling
)
accelerator_options: AcceleratorOptions = AcceleratorOptions()
document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions()
class PdfPipelineOptions(PipelineOptions):

View File

@ -1,4 +1,5 @@
import logging
import warnings
from typing import Iterable
import numpy
@ -41,16 +42,25 @@ class EasyOcrModel(BaseOcrModel):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)
use_gpu = False
if self.options.use_gpu:
if self.options.use_gpu is None:
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
filter(
lambda x: str(x).lower() in device,
[AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value],
)
[
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
]
)
else:
warnings.warn(
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
)
use_gpu = self.options.use_gpu
self.reader = easyocr.Reader(
lang_list=self.options.lang,

View File

@ -17,7 +17,7 @@ from docling.datamodel.base_models import (
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorOptions
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
@ -51,8 +51,13 @@ class LayoutModel(BasePageModel):
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
device = decide_device(accelerator_options.device)
self.layout_predictor = LayoutPredictor(
artifacts_path, device, accelerator_options.num_threads
artifact_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
base_threshold=0.6,
blacklist_classes={"Form", "Key-Value Region"},
)
def draw_clusters_and_cells_side_by_side(
@ -87,7 +92,6 @@ class LayoutModel(BasePageModel):
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
}
# Filter clusters for left and right images
exclude_labels = {
DocItemLabel.FORM,
@ -96,7 +100,6 @@ class LayoutModel(BasePageModel):
}
left_clusters = [c for c in clusters if c.label not in exclude_labels]
right_clusters = [c for c in clusters if c.label in exclude_labels]
# Create a deep copy of the original image for both sides
left_image = copy.deepcopy(page.image)
right_image = copy.deepcopy(page.image)
@ -104,14 +107,12 @@ class LayoutModel(BasePageModel):
# Function to draw clusters on an image
def draw_clusters(image, clusters):
draw = ImageDraw.Draw(image, "RGBA")
# Create a smaller font for the labels
try:
font = ImageFont.truetype("arial.ttf", 12)
except OSError:
# Fallback to default font if arial is not available
font = ImageFont.load_default()
for c_tl in clusters:
all_clusters = [c_tl, *c_tl.children]
for c in all_clusters:
@ -124,7 +125,6 @@ class LayoutModel(BasePageModel):
outline=None,
fill=cell_color,
)
# Draw cluster rectangle
x0, y0, x1, y1 = c.bbox.as_tuple()
cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
@ -134,10 +134,8 @@ class LayoutModel(BasePageModel):
outline=cluster_outline_color,
fill=cluster_fill_color,
)
# Add label name and confidence
label_text = f"{c.label.name} ({c.confidence:.2f})"
# Create semi-transparent background for text
text_bbox = draw.textbbox((x0, y0), label_text, font=font)
text_bg_padding = 2
@ -154,7 +152,6 @@ class LayoutModel(BasePageModel):
],
fill=(255, 255, 255, 180), # Semi-transparent white
)
# Draw text
draw.text(
(x0, y0),
@ -166,14 +163,12 @@ class LayoutModel(BasePageModel):
# Draw clusters on both images
draw_clusters(left_image, left_clusters)
draw_clusters(right_image, right_clusters)
# Combine the images side by side
combined_width = left_image.width * 2
combined_height = left_image.height
combined_image = Image.new("RGB", (combined_width, combined_height))
combined_image.paste(left_image, (0, 0))
combined_image.paste(right_image, (left_image.width, 0))
if show:
combined_image.show()
else:
@ -182,7 +177,6 @@ class LayoutModel(BasePageModel):
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
combined_image.save(str(out_file), format="png")
@ -217,93 +211,6 @@ class LayoutModel(BasePageModel):
)
clusters.append(cluster)
# DEBUG code:
def draw_clusters_and_cells(
clusters, mode_prefix: str, show: bool = False
):
label_to_color = {
DocItemLabel.TEXT: (255, 255, 153), # Light Yellow
DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
DocItemLabel.FORMULA: (192, 192, 192), # Gray
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
DocItemLabel.PICTURE: (255, 255, 204), # Light Beige
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
DocItemLabel.PAGE_FOOTER: (
204,
255,
204,
), # Light Green (same as Page-Header)
DocItemLabel.TITLE: (
255,
153,
153,
), # Light Red (same as Section-Header)
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
DocItemLabel.CODE: (255, 223, 186), # Peach
DocItemLabel.CHECKBOX_SELECTED: (
255,
182,
193,
), # Pale Green
DocItemLabel.CHECKBOX_UNSELECTED: (
255,
182,
193,
), # Light Pink
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
DocItemLabel.KEY_VALUE_REGION: (
183,
65,
14,
), # Rusty orange
}
image = copy.deepcopy(page.image)
if image is not None:
draw = ImageDraw.Draw(image, "RGBA")
for c in clusters:
cell_color = (0, 0, 0, 40)
for tc in c.cells: # [:1]:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
draw.rectangle(
[(cx0, cy0), (cx1, cy1)],
outline=None,
fill=cell_color,
)
x0, y0, x1, y1 = c.bbox.as_tuple()
cluster_fill_color = (
*list(label_to_color.get(c.label)), # type: ignore
70,
)
cluster_outline_color = (
*list(label_to_color.get(c.label)), # type: ignore
255,
)
draw.rectangle(
[(x0, y0), (x1, y1)],
outline=cluster_outline_color,
fill=cluster_fill_color,
)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = (
out_path
/ f"{mode_prefix}_layout_page_{page.page_no:05}.png"
)
image.save(str(out_file), format="png")
if settings.debug.visualize_raw_layout:
self.draw_clusters_and_cells_side_by_side(
conv_res, page, clusters, mode_prefix="raw"

View File

@ -10,6 +10,7 @@ from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
@ -44,6 +45,10 @@ class TableStructureModel(BasePageModel):
device = decide_device(accelerator_options.device)
# Disable MPS here, until we know why it makes things slower.
if device == AcceleratorDevice.MPS.value:
device = AcceleratorDevice.CPU.value
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]

View File

@ -21,9 +21,11 @@ def decide_device(accelerator_device: AcceleratorDevice) -> str:
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
if accelerator_device == AcceleratorDevice.AUTO:
# TODO: Enable MPS later
if has_cuda:
device = f"cuda:{cuda_index}"
elif has_mps:
device = "mps"
else:
if accelerator_device == AcceleratorDevice.CUDA:
if has_cuda:

View File

@ -74,7 +74,7 @@ def main():
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = "es"
pipeline_options.ocr_options.lang = ["es"]
pipeline_options.accelerator_options = AcceleratorOptions(
num_threads=4, device=Device.AUTO
)

92
poetry.lock generated
View File

@ -888,41 +888,40 @@ files = [
[[package]]
name = "docling-core"
version = "2.9.0"
version = "2.10.0"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "^3.9"
files = []
develop = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"},
{file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"},
]
[package.dependencies]
jsonref = "^1.1.0"
jsonschema = "^4.16.0"
pandas = "^2.1.4"
pillow = "^10.3.0"
pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2"
jsonref = ">=1.1.0,<2.0.0"
jsonschema = ">=4.16.0,<5.0.0"
pandas = ">=2.1.4,<3.0.0"
pillow = ">=10.3.0,<11.0.0"
pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0"
pyyaml = ">=5.1,<7.0.0"
tabulate = "^0.9.0"
typing-extensions = "^4.12.2"
semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""}
tabulate = ">=0.9.0,<0.10.0"
transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""}
typer = ">=0.12.5,<0.13.0"
typing-extensions = ">=4.12.2,<5.0.0"
[package.extras]
chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/docling-core.git"
reference = "improve-doc-item-typing"
resolved_reference = "948a1c56caef3fe1770d7c5cdc61cbf9bc026113"
[[package]]
name = "docling-ibm-models"
version = "3.0.0"
version = "3.1.0"
description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_ibm_models-3.0.0-py3-none-any.whl", hash = "sha256:61d1bc3fc36fbec687533f543e2f899117bc19e5b31ab03520af4b84e1f7327c"},
{file = "docling_ibm_models-3.0.0.tar.gz", hash = "sha256:2a4c064c6a58cfce039e9574c52cb3cab7decd103e20e9c5ccb7834e7fa04d4f"},
{file = "docling_ibm_models-3.1.0-py3-none-any.whl", hash = "sha256:a381a45dff16fdb2246b99c15a2e3d6ba880c573d48a1d6477d3ffb36bab807f"},
{file = "docling_ibm_models-3.1.0.tar.gz", hash = "sha256:65d734ffa490edc4e2301d296b6e893afa536c63b7daae7bbda781bd15b3431e"},
]
[package.dependencies]
@ -2822,6 +2821,32 @@ files = [
{file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
]
[[package]]
name = "mpire"
version = "2.10.2"
description = "A Python package for easy multiprocessing, but faster than multiprocessing"
optional = false
python-versions = "*"
files = [
{file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"},
{file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"},
]
[package.dependencies]
multiprocess = [
{version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
]
pygments = ">=2.0"
pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
tqdm = ">=4.27"
[package.extras]
dashboard = ["flask"]
dill = ["multiprocess", "multiprocess (>=0.70.15)"]
docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"]
testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"]
[[package]]
name = "mpmath"
version = "1.3.0"
@ -3765,10 +3790,10 @@ files = [
numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
[[package]]
@ -3791,10 +3816,10 @@ files = [
numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
[[package]]
@ -3975,8 +4000,8 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -6132,6 +6157,21 @@ files = [
cryptography = ">=2.0"
jeepney = ">=0.6"
[[package]]
name = "semchunk"
version = "2.2.0"
description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks."
optional = false
python-versions = ">=3.9"
files = [
{file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"},
{file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"},
]
[package.dependencies]
mpire = {version = "*", extras = ["dill"]}
tqdm = "*"
[[package]]
name = "semver"
version = "2.13.0"
@ -7573,4 +7613,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c81c99b768cfca5c58c7d41c553110bad65b16c6f527c4d3892a916dffc47a05"
content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e"

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.11.0" # DO NOT EDIT, updated automatically
version = "2.12.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
@ -25,12 +25,11 @@ packages = [{include = "docling"}]
# actual dependencies:
######################
python = "^3.9"
docling-ibm-models = "^3.0.0"
docling-core = { version = "^2.10.0", extras = ["chunking"] }
pydantic = "^2.0.0"
docling-ibm-models = "^3.1.0"
deepsearch-glm = "^1.0.0"
docling-parse = "^3.0.0"
#docling-core = { version = "^2.9.0", extras = ["chunking"] }
docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "improve-doc-item-typing" }
pydantic = "^2.0.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"