Refactor and fix profiling codes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-29 10:54:17 +01:00
parent 0814f32ae4
commit 79f90b46dc
15 changed files with 97 additions and 83 deletions

View File

@ -3,10 +3,9 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
import filetype
import numpy as np
from docling_core.types.doc import (
DocItem,
DocItemLabel,
@ -53,6 +52,7 @@ from docling.datamodel.base_models import (
Page,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING:
@ -180,29 +180,6 @@ class DocumentFormat(str, Enum):
V1 = "v1"
class ProfilingScope(str, Enum):
PAGE = "page"
DOCUMENT = "document"
class ProfilingItem(BaseModel):
scope: ProfilingScope
count: int = 0
times: List[float] = []
def avg(self) -> float:
return np.average(self.times) # type: ignore
def std(self) -> float:
return np.std(self.times) # type: ignore
def mean(self) -> float:
return np.mean(self.times) # type: ignore
def percentile(self, perc: float) -> float:
return np.percentile(self.times, perc) # type: ignore
class ConversionResult(BaseModel):
input: InputDocument

View File

@ -1,4 +1,5 @@
import sys
from pathlib import Path
from pydantic import BaseModel
from pydantic_settings import BaseSettings
@ -34,6 +35,9 @@ class DebugSettings(BaseModel):
profile_pipeline_timings: bool = False
# Path used to output debug information.
debug_output_path: str = str(Path.cwd())
class AppSettings(BaseSettings):
perf: BatchConcurrencySettings

View File

@ -1,12 +1,10 @@
import time
from abc import ABC, abstractmethod
from typing import Any, Callable, Iterable, Type
from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope
from docling.datamodel.settings import settings
from docling.datamodel.document import ConversionResult
class BasePageModel(ABC):
@ -28,28 +26,3 @@ class BaseEnrichmentModel(ABC):
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]:
pass
class TimeRecorder:
def __init__(
self,
conv_res: ConversionResult,
key: str,
scope: ProfilingScope = ProfilingScope.PAGE,
):
if settings.debug.profile_pipeline_timings:
if key not in conv_res.timings.keys():
conv_res.timings[key] = ProfilingItem(scope=scope)
self.conv_res = conv_res
self.key = key
def __enter__(self):
if settings.debug.profile_pipeline_timings:
self.start = time.monotonic()
return self
def __exit__(self, *args):
if settings.debug.profile_pipeline_timings:
elapsed = time.monotonic() - self.start
self.conv_res.timings[self.key].times.append(elapsed)
self.conv_res.timings[self.key].count += 1

View File

@ -27,7 +27,7 @@ from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.models.base_model import TimeRecorder
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash
@ -227,7 +227,7 @@ class GlmModel:
return ds_doc
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm"):
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)

View File

@ -6,11 +6,11 @@ import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult, ProfilingItem
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import ProfilingItem, TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -18,8 +18,9 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel, TimeRecorder
from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -13,8 +13,9 @@ from docling.datamodel.base_models import (
TextElement,
)
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel, TimeRecorder
from docling.models.base_model import BasePageModel
from docling.models.layout_model import LayoutModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -5,7 +5,8 @@ from pydantic import BaseModel
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel, TimeRecorder
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):

View File

@ -11,7 +11,8 @@ from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel, TimeRecorder
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):

View File

@ -11,8 +11,8 @@ from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -7,8 +7,8 @@ from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -15,15 +15,11 @@ from docling.datamodel.base_models import (
ErrorItem,
Page,
)
from docling.datamodel.document import (
ConversionResult,
InputDocument,
ProfilingItem,
ProfilingScope,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel, TimeRecorder
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.profiling import ProfilingItem, ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
@ -40,13 +36,14 @@ class BasePipeline(ABC):
_log.info(f"Processing document {in_doc.file.name}")
try:
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(conv_res)
conv_res = self._assemble_document(conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(conv_res)
conv_res.status = self._determine_status(conv_res)
with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(conv_res)
conv_res = self._assemble_document(conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(conv_res)
conv_res.status = self._determine_status(conv_res)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:

View File

@ -5,10 +5,10 @@ from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.base_model import TimeRecorder
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -7,14 +7,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
@ -28,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
@ -121,7 +121,7 @@ class StandardPdfPipeline(PaginatedPipeline):
return None
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "init_page"):
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()

View File

@ -0,0 +1,59 @@
import time
from enum import Enum
from typing import TYPE_CHECKING, List
import numpy as np
from pydantic import BaseModel
from docling.datamodel.settings import settings
if TYPE_CHECKING:
from docling.datamodel.document import ConversionResult
class ProfilingScope(str, Enum):
PAGE = "page"
DOCUMENT = "document"
class ProfilingItem(BaseModel):
scope: ProfilingScope
count: int = 0
times: List[float] = []
def avg(self) -> float:
return np.average(self.times) # type: ignore
def std(self) -> float:
return np.std(self.times) # type: ignore
def mean(self) -> float:
return np.mean(self.times) # type: ignore
def percentile(self, perc: float) -> float:
return np.percentile(self.times, perc) # type: ignore
class TimeRecorder:
def __init__(
self,
conv_res: "ConversionResult",
key: str,
scope: ProfilingScope = ProfilingScope.PAGE,
):
if settings.debug.profile_pipeline_timings:
if key not in conv_res.timings.keys():
conv_res.timings[key] = ProfilingItem(scope=scope)
self.conv_res = conv_res
self.key = key
def __enter__(self):
if settings.debug.profile_pipeline_timings:
self.start = time.monotonic()
return self
def __exit__(self, *args):
if settings.debug.profile_pipeline_timings:
elapsed = time.monotonic() - self.start
self.conv_res.timings[self.key].times.append(elapsed)
self.conv_res.timings[self.key].count += 1