mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-16 16:48:21 +00:00
feat: Introduce support for GPU Accelerators (#593)
* Upgraded Layout Postprocessing, sending old code back to ERZ Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Implement hierachical cluster layout processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested cluster processing through full pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested clusters through GLM as payload Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move to_docling_document from ds-glm to this repo Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat(Accelerator): Introduce options to control the num_threads and device from API, envvars, CLI. - Introduce the AcceleratorOptions, AcceleratorDevice and use them to set the device where the models run. - Introduce the accelerator_utils with function to decide the device and resolve the AUTO setting. - Refactor the way how the docling-ibm-models are called to match the new init signature of models. - Translate the accelerator options to the specific inputs for third-party models. - Extend the docling CLI with parameters to set the num_threads and device. - Add new unit tests. - Write new example how to use the accelerator options. * fix: Improve the pydantic objects in the pipeline_options and imports. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Updated test ground-truth Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test ground-truth (again), bugfix for empty layout Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Do proper check to set the device in EasyOCR, RapidOCR. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Rollback changes from main Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test gt Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove unused debug settings Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Review fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Nail the accelerator defaults for MPS Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import warnings
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
EasyOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
options: EasyOcrOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: EasyOcrOptions
|
||||
|
||||
@@ -31,11 +42,32 @@ class EasyOcrModel(BaseOcrModel):
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||
)
|
||||
|
||||
if self.options.use_gpu is None:
|
||||
device = decide_device(accelerator_options.device)
|
||||
# Enable easyocr GPU if running on CUDA, MPS
|
||||
use_gpu = any(
|
||||
[
|
||||
device.startswith(x)
|
||||
for x in [
|
||||
AcceleratorDevice.CUDA.value,
|
||||
AcceleratorDevice.MPS.value,
|
||||
]
|
||||
]
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
|
||||
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
|
||||
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
|
||||
)
|
||||
use_gpu = self.options.use_gpu
|
||||
|
||||
self.reader = easyocr.Reader(
|
||||
lang_list=self.options.lang,
|
||||
gpu=self.options.use_gpu,
|
||||
gpu=use_gpu,
|
||||
model_storage_directory=self.options.model_storage_directory,
|
||||
download_enabled=self.options.download_enabled,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
def __call__(
|
||||
|
||||
@@ -9,6 +9,7 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
import docling.utils.layout_utils as lu
|
||||
from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Cell,
|
||||
@@ -17,9 +18,10 @@ from docling.datamodel.base_models import (
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils import layout_utils as lu
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
|
||||
FIGURE_LABEL = DocItemLabel.PICTURE
|
||||
FORMULA_LABEL = DocItemLabel.FORMULA
|
||||
|
||||
def __init__(self, artifacts_path: Path):
|
||||
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
|
||||
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
|
||||
device = decide_device(accelerator_options.device)
|
||||
|
||||
self.layout_predictor = LayoutPredictor(
|
||||
artifact_path=str(artifacts_path),
|
||||
device=device,
|
||||
num_threads=accelerator_options.num_threads,
|
||||
base_threshold=0.6,
|
||||
blacklist_classes={"Form", "Key-Value Region"},
|
||||
)
|
||||
|
||||
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
||||
MIN_INTERSECTION = 0.2
|
||||
|
||||
@@ -6,16 +6,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import RapidOcrOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
RapidOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RapidOcrModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: RapidOcrOptions):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
options: RapidOcrOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: RapidOcrOptions
|
||||
|
||||
@@ -30,52 +40,21 @@ class RapidOcrModel(BaseOcrModel):
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||
)
|
||||
|
||||
# This configuration option will be revamped while introducing device settings for all models.
|
||||
# For the moment we will default to auto and let onnx-runtime pick the best.
|
||||
cls_use_cuda = True
|
||||
rec_use_cuda = True
|
||||
det_use_cuda = True
|
||||
det_use_dml = True
|
||||
cls_use_dml = True
|
||||
rec_use_dml = True
|
||||
|
||||
# # Same as Defaults in RapidOCR
|
||||
# cls_use_cuda = False
|
||||
# rec_use_cuda = False
|
||||
# det_use_cuda = False
|
||||
# det_use_dml = False
|
||||
# cls_use_dml = False
|
||||
# rec_use_dml = False
|
||||
|
||||
# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
|
||||
# if self.options.device == self.options.Device.AUTO:
|
||||
# cls_use_cuda = True
|
||||
# rec_use_cuda = True
|
||||
# det_use_cuda = True
|
||||
# det_use_dml = True
|
||||
# cls_use_dml = True
|
||||
# rec_use_dml = True
|
||||
|
||||
# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
|
||||
# elif self.options.device == self.options.Device.CUDA:
|
||||
# cls_use_cuda = True
|
||||
# rec_use_cuda = True
|
||||
# det_use_cuda = True
|
||||
|
||||
# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
|
||||
# elif self.options.device == self.options.Device.DIRECTML:
|
||||
# det_use_dml = True
|
||||
# cls_use_dml = True
|
||||
# rec_use_dml = True
|
||||
# Decide the accelerator devices
|
||||
device = decide_device(accelerator_options.device)
|
||||
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
|
||||
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
|
||||
intra_op_num_threads = accelerator_options.num_threads
|
||||
|
||||
self.reader = RapidOCR(
|
||||
text_score=self.options.text_score,
|
||||
cls_use_cuda=cls_use_cuda,
|
||||
rec_use_cuda=rec_use_cuda,
|
||||
det_use_cuda=det_use_cuda,
|
||||
det_use_dml=det_use_dml,
|
||||
cls_use_dml=cls_use_dml,
|
||||
rec_use_dml=rec_use_dml,
|
||||
cls_use_cuda=use_cuda,
|
||||
rec_use_cuda=use_cuda,
|
||||
det_use_cuda=use_cuda,
|
||||
det_use_dml=use_dml,
|
||||
cls_use_dml=use_dml,
|
||||
rec_use_dml=use_dml,
|
||||
intra_op_num_threads=intra_op_num_threads,
|
||||
print_verbose=self.options.print_verbose,
|
||||
det_model_path=self.options.det_model_path,
|
||||
cls_model_path=self.options.cls_model_path,
|
||||
|
||||
@@ -9,15 +9,25 @@ from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
TableFormerMode,
|
||||
TableStructureOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
||||
def __init__(
|
||||
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
|
||||
self,
|
||||
enabled: bool,
|
||||
artifacts_path: Path,
|
||||
options: TableStructureOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
self.options = options
|
||||
self.do_cell_matching = self.options.do_cell_matching
|
||||
@@ -26,16 +36,26 @@ class TableStructureModel(BasePageModel):
|
||||
self.enabled = enabled
|
||||
if self.enabled:
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "fat"
|
||||
artifacts_path = artifacts_path / "accurate"
|
||||
else:
|
||||
artifacts_path = artifacts_path / "fast"
|
||||
|
||||
# Third Party
|
||||
import docling_ibm_models.tableformer.common as c
|
||||
|
||||
device = decide_device(accelerator_options.device)
|
||||
|
||||
# Disable MPS here, until we know why it makes things slower.
|
||||
if device == AcceleratorDevice.MPS.value:
|
||||
device = AcceleratorDevice.CPU.value
|
||||
|
||||
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
||||
self.tm_config["model"]["save_dir"] = artifacts_path
|
||||
self.tm_model_type = self.tm_config["model"]["type"]
|
||||
|
||||
self.tf_predictor = TFPredictor(self.tm_config)
|
||||
self.tf_predictor = TFPredictor(
|
||||
self.tm_config, device, accelerator_options.num_threads
|
||||
)
|
||||
self.scale = 2.0 # Scale up table input images to 144 dpi
|
||||
|
||||
def draw_table_and_cells(
|
||||
|
||||
Reference in New Issue
Block a user