Refactor and fix profiling codes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-29 10:54:17 +01:00
parent 0814f32ae4
commit 79f90b46dc
15 changed files with 97 additions and 83 deletions

View File

@ -3,10 +3,9 @@ import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
import filetype import filetype
import numpy as np
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem, DocItem,
DocItemLabel, DocItemLabel,
@ -53,6 +52,7 @@ from docling.datamodel.base_models import (
Page, Page,
) )
from docling.datamodel.settings import DocumentLimits from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash from docling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING: if TYPE_CHECKING:
@ -180,29 +180,6 @@ class DocumentFormat(str, Enum):
V1 = "v1" V1 = "v1"
class ProfilingScope(str, Enum):
PAGE = "page"
DOCUMENT = "document"
class ProfilingItem(BaseModel):
scope: ProfilingScope
count: int = 0
times: List[float] = []
def avg(self) -> float:
return np.average(self.times) # type: ignore
def std(self) -> float:
return np.std(self.times) # type: ignore
def mean(self) -> float:
return np.mean(self.times) # type: ignore
def percentile(self, perc: float) -> float:
return np.percentile(self.times, perc) # type: ignore
class ConversionResult(BaseModel): class ConversionResult(BaseModel):
input: InputDocument input: InputDocument

View File

@ -1,4 +1,5 @@
import sys import sys
from pathlib import Path
from pydantic import BaseModel from pydantic import BaseModel
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
@ -34,6 +35,9 @@ class DebugSettings(BaseModel):
profile_pipeline_timings: bool = False profile_pipeline_timings: bool = False
# Path used to output debug information.
debug_output_path: str = str(Path.cwd())
class AppSettings(BaseSettings): class AppSettings(BaseSettings):
perf: BatchConcurrencySettings perf: BatchConcurrencySettings

View File

@ -1,12 +1,10 @@
import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Callable, Iterable, Type from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
class BasePageModel(ABC): class BasePageModel(ABC):
@ -28,28 +26,3 @@ class BaseEnrichmentModel(ABC):
self, doc: DoclingDocument, element_batch: Iterable[NodeItem] self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]: ) -> Iterable[Any]:
pass pass
class TimeRecorder:
def __init__(
self,
conv_res: ConversionResult,
key: str,
scope: ProfilingScope = ProfilingScope.PAGE,
):
if settings.debug.profile_pipeline_timings:
if key not in conv_res.timings.keys():
conv_res.timings[key] = ProfilingItem(scope=scope)
self.conv_res = conv_res
self.key = key
def __enter__(self):
if settings.debug.profile_pipeline_timings:
self.start = time.monotonic()
return self
def __exit__(self, *args):
if settings.debug.profile_pipeline_timings:
elapsed = time.monotonic() - self.start
self.conv_res.timings[self.key].times.append(elapsed)
self.conv_res.timings[self.key].count += 1

View File

@ -27,7 +27,7 @@ from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.models.base_model import TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash from docling.utils.utils import create_hash
@ -227,7 +227,7 @@ class GlmModel:
return ds_doc return ds_doc
def __call__(self, conv_res: ConversionResult) -> DoclingDocument: def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm"): with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res) ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True) ds_doc_dict = ds_doc.model_dump(by_alias=True)

View File

@ -6,11 +6,11 @@ import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult, ProfilingItem from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import ProfilingItem, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -18,8 +18,9 @@ from docling.datamodel.base_models import (
) )
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel, TimeRecorder from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu from docling.utils import layout_utils as lu
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -13,8 +13,9 @@ from docling.datamodel.base_models import (
TextElement, TextElement,
) )
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel, TimeRecorder from docling.models.base_model import BasePageModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -5,7 +5,8 @@ from pydantic import BaseModel
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel, TimeRecorder from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel): class PagePreprocessingOptions(BaseModel):

View File

@ -11,7 +11,8 @@ from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel, TimeRecorder from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel): class TableStructureModel(BasePageModel):

View File

@ -11,8 +11,8 @@ from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -7,8 +7,8 @@ from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -15,15 +15,11 @@ from docling.datamodel.base_models import (
ErrorItem, ErrorItem,
Page, Page,
) )
from docling.datamodel.document import ( from docling.datamodel.document import ConversionResult, InputDocument
ConversionResult,
InputDocument,
ProfilingItem,
ProfilingScope,
)
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel, TimeRecorder from docling.models.base_model import BaseEnrichmentModel
from docling.utils.profiling import ProfilingItem, ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify from docling.utils.utils import chunkify
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -40,13 +36,14 @@ class BasePipeline(ABC):
_log.info(f"Processing document {in_doc.file.name}") _log.info(f"Processing document {in_doc.file.name}")
try: try:
# These steps are building and assembling the structure of the with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
# output DoclingDocument # These steps are building and assembling the structure of the
conv_res = self._build_document(conv_res) # output DoclingDocument
conv_res = self._assemble_document(conv_res) conv_res = self._build_document(conv_res)
# From this stage, all operations should rely only on conv_res.output conv_res = self._assemble_document(conv_res)
conv_res = self._enrich_document(conv_res) # From this stage, all operations should rely only on conv_res.output
conv_res.status = self._determine_status(conv_res) conv_res = self._enrich_document(conv_res)
conv_res.status = self._determine_status(conv_res)
except Exception as e: except Exception as e:
conv_res.status = ConversionStatus.FAILURE conv_res.status = ConversionStatus.FAILURE
if raises_on_error: if raises_on_error:

View File

@ -5,10 +5,10 @@ from docling.backend.abstract_backend import (
DeclarativeDocumentBackend, DeclarativeDocumentBackend,
) )
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.base_model import TimeRecorder
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -7,14 +7,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.models.base_model import TimeRecorder
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
@ -28,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -121,7 +121,7 @@ class StandardPdfPipeline(PaginatedPipeline):
return None return None
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "init_page"): with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid(): if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size() page.size = page._backend.get_size()

View File

@ -0,0 +1,59 @@
import time
from enum import Enum
from typing import TYPE_CHECKING, List
import numpy as np
from pydantic import BaseModel
from docling.datamodel.settings import settings
if TYPE_CHECKING:
from docling.datamodel.document import ConversionResult
class ProfilingScope(str, Enum):
PAGE = "page"
DOCUMENT = "document"
class ProfilingItem(BaseModel):
scope: ProfilingScope
count: int = 0
times: List[float] = []
def avg(self) -> float:
return np.average(self.times) # type: ignore
def std(self) -> float:
return np.std(self.times) # type: ignore
def mean(self) -> float:
return np.mean(self.times) # type: ignore
def percentile(self, perc: float) -> float:
return np.percentile(self.times, perc) # type: ignore
class TimeRecorder:
def __init__(
self,
conv_res: "ConversionResult",
key: str,
scope: ProfilingScope = ProfilingScope.PAGE,
):
if settings.debug.profile_pipeline_timings:
if key not in conv_res.timings.keys():
conv_res.timings[key] = ProfilingItem(scope=scope)
self.conv_res = conv_res
self.key = key
def __enter__(self):
if settings.debug.profile_pipeline_timings:
self.start = time.monotonic()
return self
def __exit__(self, *args):
if settings.debug.profile_pipeline_timings:
elapsed = time.monotonic() - self.start
self.conv_res.timings[self.key].times.append(elapsed)
self.conv_res.timings[self.key].count += 1