feat: Introduce support for GPU Accelerators (#593)

* Upgraded Layout Postprocessing, sending old code back to ERZ

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Implement hierarchical cluster layout processing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Pass nested cluster processing through full pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Pass nested clusters through GLM as payload

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Move to_docling_document from ds-glm to this repo

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Clean up imports again

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* feat(Accelerator): Introduce options to control the num_threads and device from the API, envvars, and CLI.
- Introduce the AcceleratorOptions and AcceleratorDevice classes and use them to set the device on which the models run.
- Introduce accelerator_utils with a function to decide the device and resolve the AUTO setting.
- Refactor how the docling-ibm-models are called to match the new init signatures of the models.
- Translate the accelerator options into the specific inputs of third-party models.
- Extend the docling CLI with parameters to set the num_threads and device.
- Add new unit tests.
- Write a new example showing how to use the accelerator options (a usage sketch follows this list).
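
A minimal usage sketch of the new options. The converter entry points below follow Docling's existing examples; the `pipeline_options.accelerator_options` field name and the input path are assumptions for illustration.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

# Request 8 intra-op threads and let Docling resolve AUTO to CUDA, MPS or CPU.
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.AUTO
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("input.pdf")  # hypothetical input document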

* fix: Improve the pydantic objects in the pipeline_options and imports.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: TableStructureModel: Refactor the artifacts path to use the new structure for the fast/accurate models

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* Updated test ground-truth

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Updated test ground-truth (again), bugfix for empty layout

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Do a proper check to set the device in EasyOCR and RapidOCR.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* Rollback changes from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update test gt

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove unused debug settings

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Review fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Nail the accelerator defaults for MPS

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Author: Nikos Livathinos
Date: 2024-12-13 17:45:22 +01:00
Committed by: GitHub
Parent: 365a1e7b98
Commit: 19fad9261c
38 changed files with 384 additions and 93 deletions


@@ -1,4 +1,5 @@
import logging
import warnings
from typing import Iterable
import numpy
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class EasyOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: EasyOcrOptions):
def __init__(
self,
enabled: bool,
options: EasyOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: EasyOcrOptions
@@ -31,11 +42,32 @@ class EasyOcrModel(BaseOcrModel):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)
if self.options.use_gpu is None:
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
[
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
]
)
else:
warnings.warn(
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
)
use_gpu = self.options.use_gpu
self.reader = easyocr.Reader(
lang_list=self.options.lang,
gpu=self.options.use_gpu,
gpu=use_gpu,
model_storage_directory=self.options.model_storage_directory,
download_enabled=self.options.download_enabled,
verbose=False,
)
def __call__(

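The AUTO resolution itself happens in docling.utils.accelerator_utils.decide_device, which is not shown in this diff. The sketch below is only an assumption of how such a helper could resolve the requested device with PyTorch's availability checks; the real implementation may differ.

import torch

from docling.datamodel.pipeline_options import AcceleratorDevice


def decide_device(accelerator_device: AcceleratorDevice) -> str:
    # Hypothetical sketch: map the requested AcceleratorDevice to a torch-style device string.
    if accelerator_device == AcceleratorDevice.CUDA:
        return "cuda:0" if torch.cuda.is_available() else "cpu"
    if accelerator_device == AcceleratorDevice.MPS:
        return "mps" if torch.backends.mps.is_available() else "cpu"
    if accelerator_device == AcceleratorDevice.AUTO:
        # Prefer CUDA, then MPS, then fall back to CPU.
        if torch.cuda.is_available():
            return "cuda:0"
        if torch.backends.mps.is_available():
            return "mps"
    return "cpu"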

@@ -9,6 +9,7 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
import docling.utils.layout_utils as lu
from docling.datamodel.base_models import (
BoundingBox,
Cell,
@@ -17,9 +18,10 @@ from docling.datamodel.base_models import (
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA
def __init__(self, artifacts_path: Path):
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
device = decide_device(accelerator_options.device)
self.layout_predictor = LayoutPredictor(
artifact_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
base_threshold=0.6,
blacklist_classes={"Form", "Key-Value Region"},
)
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2

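For illustration only, the refactored constructor would be fed like this; the artifacts path and option values are hypothetical, and in the real pipeline the options arrive via PdfPipelineOptions.

from pathlib import Path

from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.models.layout_model import LayoutModel

# Hypothetical location of the downloaded layout artifacts.
artifacts_path = Path("~/.cache/docling/models/layout").expanduser()

accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.CUDA,  # decide_device() turns this into e.g. "cuda:0" when available
    num_threads=4,
)

layout_model = LayoutModel(
    artifacts_path=artifacts_path,
    accelerator_options=accelerator_options,
)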

@@ -6,16 +6,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
RapidOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class RapidOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: RapidOcrOptions):
def __init__(
self,
enabled: bool,
options: RapidOcrOptions,
accelerator_options: AcceleratorOptions,
):
super().__init__(enabled=enabled, options=options)
self.options: RapidOcrOptions
@@ -30,52 +40,21 @@ class RapidOcrModel(BaseOcrModel):
"Alternatively, Docling has support for other OCR engines. See the documentation."
)
# This configuration option will be revamped while introducing device settings for all models.
# For the moment we will default to auto and let onnx-runtime pick the best.
cls_use_cuda = True
rec_use_cuda = True
det_use_cuda = True
det_use_dml = True
cls_use_dml = True
rec_use_dml = True
# # Same as Defaults in RapidOCR
# cls_use_cuda = False
# rec_use_cuda = False
# det_use_cuda = False
# det_use_dml = False
# cls_use_dml = False
# rec_use_dml = False
# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
# if self.options.device == self.options.Device.AUTO:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True
# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
# elif self.options.device == self.options.Device.CUDA:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
# elif self.options.device == self.options.Device.DIRECTML:
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True
# Decide the accelerator devices
device = decide_device(accelerator_options.device)
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
intra_op_num_threads = accelerator_options.num_threads
self.reader = RapidOCR(
text_score=self.options.text_score,
cls_use_cuda=cls_use_cuda,
rec_use_cuda=rec_use_cuda,
det_use_cuda=det_use_cuda,
det_use_dml=det_use_dml,
cls_use_dml=cls_use_dml,
rec_use_dml=rec_use_dml,
cls_use_cuda=use_cuda,
rec_use_cuda=use_cuda,
det_use_cuda=use_cuda,
det_use_dml=use_dml,
cls_use_dml=use_dml,
rec_use_dml=use_dml,
intra_op_num_threads=intra_op_num_threads,
print_verbose=self.options.print_verbose,
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,

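The hunk above collapses the six per-stage CUDA/DirectML switches into two booleans. A small sketch of how those booleans come out for each requested device, assuming decide_device returns lower-case torch-style strings as in the earlier sketch:

from docling.datamodel.pipeline_options import AcceleratorDevice
from docling.utils.accelerator_utils import decide_device

for requested in (
    AcceleratorDevice.AUTO,
    AcceleratorDevice.CUDA,
    AcceleratorDevice.MPS,
    AcceleratorDevice.CPU,
):
    device = decide_device(requested)
    use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
    # DirectML is only attempted when the user asked for AUTO, mirroring the hunk above.
    use_dml = requested == AcceleratorDevice.AUTO
    print(f"{requested.value}: device={device} use_cuda={use_cuda} use_dml={use_dml}")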

@@ -9,15 +9,25 @@ from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):
def __init__(
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
self,
enabled: bool,
artifacts_path: Path,
options: TableStructureOptions,
accelerator_options: AcceleratorOptions,
):
self.options = options
self.do_cell_matching = self.options.do_cell_matching
@@ -26,16 +36,26 @@ class TableStructureModel(BasePageModel):
self.enabled = enabled
if self.enabled:
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
artifacts_path = artifacts_path / "accurate"
else:
artifacts_path = artifacts_path / "fast"
# Third Party
import docling_ibm_models.tableformer.common as c
device = decide_device(accelerator_options.device)
# Disable MPS here, until we know why it makes things slower.
if device == AcceleratorDevice.MPS.value:
device = AcceleratorDevice.CPU.value
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]
self.tf_predictor = TFPredictor(self.tm_config)
self.tf_predictor = TFPredictor(
self.tm_config, device, accelerator_options.num_threads
)
self.scale = 2.0 # Scale up table input images to 144 dpi
def draw_table_and_cells(
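
Put together, the renamed "accurate" artifacts folder is selected through TableFormerMode.ACCURATE. A sketch of the corresponding pipeline options, with the field names introduced in this change assumed:

from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TableFormerMode,
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # -> "accurate" subfolder
pipeline_options.accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.MPS,  # TableFormer silently falls back to CPU for MPS, per the hunk above
    num_threads=8,
)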