mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 21:44:32 +00:00
Refactor and fix profiling codes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
0814f32ae4
commit
79f90b46dc
@ -3,10 +3,9 @@ import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
||||
|
||||
import filetype
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
@ -53,6 +52,7 @@ from docling.datamodel.base_models import (
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling.utils.utils import create_file_hash, create_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -180,29 +180,6 @@ class DocumentFormat(str, Enum):
|
||||
V1 = "v1"
|
||||
|
||||
|
||||
class ProfilingScope(str, Enum):
|
||||
PAGE = "page"
|
||||
DOCUMENT = "document"
|
||||
|
||||
|
||||
class ProfilingItem(BaseModel):
|
||||
scope: ProfilingScope
|
||||
count: int = 0
|
||||
times: List[float] = []
|
||||
|
||||
def avg(self) -> float:
|
||||
return np.average(self.times) # type: ignore
|
||||
|
||||
def std(self) -> float:
|
||||
return np.std(self.times) # type: ignore
|
||||
|
||||
def mean(self) -> float:
|
||||
return np.mean(self.times) # type: ignore
|
||||
|
||||
def percentile(self, perc: float) -> float:
|
||||
return np.percentile(self.times, perc) # type: ignore
|
||||
|
||||
|
||||
class ConversionResult(BaseModel):
|
||||
input: InputDocument
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic_settings import BaseSettings
|
||||
@ -34,6 +35,9 @@ class DebugSettings(BaseModel):
|
||||
|
||||
profile_pipeline_timings: bool = False
|
||||
|
||||
# Path used to output debug information.
|
||||
debug_output_path: str = str(Path.cwd())
|
||||
|
||||
|
||||
class AppSettings(BaseSettings):
|
||||
perf: BatchConcurrencySettings
|
||||
|
@ -1,12 +1,10 @@
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Iterable, Type
|
||||
from typing import Any, Iterable
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.datamodel.document import ConversionResult
|
||||
|
||||
|
||||
class BasePageModel(ABC):
|
||||
@ -28,28 +26,3 @@ class BaseEnrichmentModel(ABC):
|
||||
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
||||
) -> Iterable[Any]:
|
||||
pass
|
||||
|
||||
|
||||
class TimeRecorder:
|
||||
def __init__(
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
key: str,
|
||||
scope: ProfilingScope = ProfilingScope.PAGE,
|
||||
):
|
||||
if settings.debug.profile_pipeline_timings:
|
||||
if key not in conv_res.timings.keys():
|
||||
conv_res.timings[key] = ProfilingItem(scope=scope)
|
||||
self.conv_res = conv_res
|
||||
self.key = key
|
||||
|
||||
def __enter__(self):
|
||||
if settings.debug.profile_pipeline_timings:
|
||||
self.start = time.monotonic()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
if settings.debug.profile_pipeline_timings:
|
||||
elapsed = time.monotonic() - self.start
|
||||
self.conv_res.timings[self.key].times.append(elapsed)
|
||||
self.conv_res.timings[self.key].count += 1
|
||||
|
@ -27,7 +27,7 @@ from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docling.utils.utils import create_hash
|
||||
|
||||
|
||||
@ -227,7 +227,7 @@ class GlmModel:
|
||||
return ds_doc
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||
with TimeRecorder(conv_res, "glm"):
|
||||
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
||||
ds_doc = self._to_legacy_document(conv_res)
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
|
@ -6,11 +6,11 @@ import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult, ProfilingItem
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import ProfilingItem, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -18,8 +18,9 @@ from docling.datamodel.base_models import (
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils import layout_utils as lu
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -13,8 +13,9 @@ from docling.datamodel.base_models import (
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -5,7 +5,8 @@ from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
|
@ -11,7 +11,8 @@ from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
||||
|
@ -11,8 +11,8 @@ from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -7,8 +7,8 @@ from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -15,15 +15,11 @@ from docling.datamodel.base_models import (
|
||||
ErrorItem,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
ProfilingItem,
|
||||
ProfilingScope,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BaseEnrichmentModel, TimeRecorder
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.utils.profiling import ProfilingItem, ProfilingScope, TimeRecorder
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -40,13 +36,14 @@ class BasePipeline(ABC):
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
try:
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
conv_res = self._build_document(conv_res)
|
||||
conv_res = self._assemble_document(conv_res)
|
||||
# From this stage, all operations should rely only on conv_res.output
|
||||
conv_res = self._enrich_document(conv_res)
|
||||
conv_res.status = self._determine_status(conv_res)
|
||||
with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
conv_res = self._build_document(conv_res)
|
||||
conv_res = self._assemble_document(conv_res)
|
||||
# From this stage, all operations should rely only on conv_res.output
|
||||
conv_res = self._enrich_document(conv_res)
|
||||
conv_res.status = self._determine_status(conv_res)
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
if raises_on_error:
|
||||
|
@ -5,10 +5,10 @@ from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -7,14 +7,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
@ -28,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -121,7 +121,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
return None
|
||||
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
with TimeRecorder(conv_res, "init_page"):
|
||||
with TimeRecorder(conv_res, "page_init"):
|
||||
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||
if page._backend is not None and page._backend.is_valid():
|
||||
page.size = page._backend.get_size()
|
||||
|
59
docling/utils/profiling.py
Normal file
59
docling/utils/profiling.py
Normal file
@ -0,0 +1,59 @@
|
||||
import time
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, List
|
||||
|
||||
import numpy as np
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.settings import settings
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import ConversionResult
|
||||
|
||||
|
||||
class ProfilingScope(str, Enum):
|
||||
PAGE = "page"
|
||||
DOCUMENT = "document"
|
||||
|
||||
|
||||
class ProfilingItem(BaseModel):
|
||||
scope: ProfilingScope
|
||||
count: int = 0
|
||||
times: List[float] = []
|
||||
|
||||
def avg(self) -> float:
|
||||
return np.average(self.times) # type: ignore
|
||||
|
||||
def std(self) -> float:
|
||||
return np.std(self.times) # type: ignore
|
||||
|
||||
def mean(self) -> float:
|
||||
return np.mean(self.times) # type: ignore
|
||||
|
||||
def percentile(self, perc: float) -> float:
|
||||
return np.percentile(self.times, perc) # type: ignore
|
||||
|
||||
|
||||
class TimeRecorder:
|
||||
def __init__(
|
||||
self,
|
||||
conv_res: "ConversionResult",
|
||||
key: str,
|
||||
scope: ProfilingScope = ProfilingScope.PAGE,
|
||||
):
|
||||
if settings.debug.profile_pipeline_timings:
|
||||
if key not in conv_res.timings.keys():
|
||||
conv_res.timings[key] = ProfilingItem(scope=scope)
|
||||
self.conv_res = conv_res
|
||||
self.key = key
|
||||
|
||||
def __enter__(self):
|
||||
if settings.debug.profile_pipeline_timings:
|
||||
self.start = time.monotonic()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
if settings.debug.profile_pipeline_timings:
|
||||
elapsed = time.monotonic() - self.start
|
||||
self.conv_res.timings[self.key].times.append(elapsed)
|
||||
self.conv_res.timings[self.key].count += 1
|
Loading…
Reference in New Issue
Block a user