diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 8f7bc842..be4e9a12 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -3,10 +3,9 @@ import re from enum import Enum from io import BytesIO from pathlib import Path, PurePath -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union import filetype -import numpy as np from docling_core.types.doc import ( DocItem, DocItemLabel, @@ -53,6 +52,7 @@ from docling.datamodel.base_models import ( Page, ) from docling.datamodel.settings import DocumentLimits +from docling.utils.profiling import ProfilingItem from docling.utils.utils import create_file_hash, create_hash if TYPE_CHECKING: @@ -180,29 +180,6 @@ class DocumentFormat(str, Enum): V1 = "v1" -class ProfilingScope(str, Enum): - PAGE = "page" - DOCUMENT = "document" - - -class ProfilingItem(BaseModel): - scope: ProfilingScope - count: int = 0 - times: List[float] = [] - - def avg(self) -> float: - return np.average(self.times) # type: ignore - - def std(self) -> float: - return np.std(self.times) # type: ignore - - def mean(self) -> float: - return np.mean(self.times) # type: ignore - - def percentile(self, perc: float) -> float: - return np.percentile(self.times, perc) # type: ignore - - class ConversionResult(BaseModel): input: InputDocument diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index ce1c5faf..7287a499 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -1,4 +1,5 @@ import sys +from pathlib import Path from pydantic import BaseModel from pydantic_settings import BaseSettings @@ -34,6 +35,9 @@ class DebugSettings(BaseModel): profile_pipeline_timings: bool = False + # Path used to output debug information. + debug_output_path: str = str(Path.cwd()) + class AppSettings(BaseSettings): perf: BatchConcurrencySettings diff --git a/docling/models/base_model.py b/docling/models/base_model.py index e7bcc7f2..1147896c 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -1,12 +1,10 @@ -import time from abc import ABC, abstractmethod -from typing import Any, Callable, Iterable, Type +from typing import Any, Iterable from docling_core.types.doc import DoclingDocument, NodeItem from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope -from docling.datamodel.settings import settings +from docling.datamodel.document import ConversionResult class BasePageModel(ABC): @@ -28,28 +26,3 @@ class BaseEnrichmentModel(ABC): self, doc: DoclingDocument, element_batch: Iterable[NodeItem] ) -> Iterable[Any]: pass - - -class TimeRecorder: - def __init__( - self, - conv_res: ConversionResult, - key: str, - scope: ProfilingScope = ProfilingScope.PAGE, - ): - if settings.debug.profile_pipeline_timings: - if key not in conv_res.timings.keys(): - conv_res.timings[key] = ProfilingItem(scope=scope) - self.conv_res = conv_res - self.key = key - - def __enter__(self): - if settings.debug.profile_pipeline_timings: - self.start = time.monotonic() - return self - - def __exit__(self, *args): - if settings.debug.profile_pipeline_timings: - elapsed = time.monotonic() - self.start - self.conv_res.timings[self.key].times.append(elapsed) - self.conv_res.timings[self.key].count += 1 diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 42b84bd2..dade1870 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -27,7 +27,7 @@ from pydantic import BaseModel, ConfigDict from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement from docling.datamodel.document import ConversionResult, layout_label_to_ds_type -from docling.models.base_model import TimeRecorder +from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.utils import create_hash @@ -227,7 +227,7 @@ class GlmModel: return ds_doc def __call__(self, conv_res: ConversionResult) -> DoclingDocument: - with TimeRecorder(conv_res, "glm"): + with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT): ds_doc = self._to_legacy_document(conv_res) ds_doc_dict = ds_doc.model_dump(by_alias=True) diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index ff4954b6..d57b5747 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -6,11 +6,11 @@ import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page -from docling.datamodel.document import ConversionResult, ProfilingItem +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import EasyOcrOptions from docling.datamodel.settings import settings -from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import ProfilingItem, TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index b885dc50..ca7f0602 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -18,8 +18,9 @@ from docling.datamodel.base_models import ( ) from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel, TimeRecorder +from docling.models.base_model import BasePageModel from docling.utils import layout_utils as lu +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index c3be7126..9b064ead 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -13,8 +13,9 @@ from docling.datamodel.base_models import ( TextElement, ) from docling.datamodel.document import ConversionResult -from docling.models.base_model import BasePageModel, TimeRecorder +from docling.models.base_model import BasePageModel from docling.models.layout_model import LayoutModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index adb20fc3..ac987874 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -5,7 +5,8 @@ from pydantic import BaseModel from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult -from docling.models.base_model import BasePageModel, TimeRecorder +from docling.models.base_model import BasePageModel +from docling.utils.profiling import TimeRecorder class PagePreprocessingOptions(BaseModel): diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 95aa6b5c..fdc56f5b 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -11,7 +11,8 @@ from docling.datamodel.base_models import Page, Table, TableStructurePrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel, TimeRecorder +from docling.models.base_model import BasePageModel +from docling.utils.profiling import TimeRecorder class TableStructureModel(BasePageModel): diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 93c07f38..80ea5324 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -11,8 +11,8 @@ from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings -from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 136b185e..7fab2eb2 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -7,8 +7,8 @@ from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings -from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index a8cde7ea..3880fe3e 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -15,15 +15,11 @@ from docling.datamodel.base_models import ( ErrorItem, Page, ) -from docling.datamodel.document import ( - ConversionResult, - InputDocument, - ProfilingItem, - ProfilingScope, -) +from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import settings -from docling.models.base_model import BaseEnrichmentModel, TimeRecorder +from docling.models.base_model import BaseEnrichmentModel +from docling.utils.profiling import ProfilingItem, ProfilingScope, TimeRecorder from docling.utils.utils import chunkify _log = logging.getLogger(__name__) @@ -40,13 +36,14 @@ class BasePipeline(ABC): _log.info(f"Processing document {in_doc.file.name}") try: - # These steps are building and assembling the structure of the - # output DoclingDocument - conv_res = self._build_document(conv_res) - conv_res = self._assemble_document(conv_res) - # From this stage, all operations should rely only on conv_res.output - conv_res = self._enrich_document(conv_res) - conv_res.status = self._determine_status(conv_res) + with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT): + # These steps are building and assembling the structure of the + # output DoclingDocument + conv_res = self._build_document(conv_res) + conv_res = self._assemble_document(conv_res) + # From this stage, all operations should rely only on conv_res.output + conv_res = self._enrich_document(conv_res) + conv_res.status = self._determine_status(conv_res) except Exception as e: conv_res.status = ConversionStatus.FAILURE if raises_on_error: diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index 90f74475..f0108ab8 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -5,10 +5,10 @@ from docling.backend.abstract_backend import ( DeclarativeDocumentBackend, ) from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope +from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import PipelineOptions -from docling.models.base_model import TimeRecorder from docling.pipeline.base_pipeline import BasePipeline +from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 446371b4..2def3fd5 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -7,14 +7,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import AssembledUnit, Page -from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope +from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import ( EasyOcrOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) -from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel @@ -28,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_pipeline import PaginatedPipeline +from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) @@ -121,7 +121,7 @@ class StandardPdfPipeline(PaginatedPipeline): return None def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: - with TimeRecorder(conv_res, "init_page"): + with TimeRecorder(conv_res, "page_init"): page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore if page._backend is not None and page._backend.is_valid(): page.size = page._backend.get_size() diff --git a/docling/utils/profiling.py b/docling/utils/profiling.py new file mode 100644 index 00000000..fec47964 --- /dev/null +++ b/docling/utils/profiling.py @@ -0,0 +1,59 @@ +import time +from enum import Enum +from typing import TYPE_CHECKING, List + +import numpy as np +from pydantic import BaseModel + +from docling.datamodel.settings import settings + +if TYPE_CHECKING: + from docling.datamodel.document import ConversionResult + + +class ProfilingScope(str, Enum): + PAGE = "page" + DOCUMENT = "document" + + +class ProfilingItem(BaseModel): + scope: ProfilingScope + count: int = 0 + times: List[float] = [] + + def avg(self) -> float: + return np.average(self.times) # type: ignore + + def std(self) -> float: + return np.std(self.times) # type: ignore + + def mean(self) -> float: + return np.mean(self.times) # type: ignore + + def percentile(self, perc: float) -> float: + return np.percentile(self.times, perc) # type: ignore + + +class TimeRecorder: + def __init__( + self, + conv_res: "ConversionResult", + key: str, + scope: ProfilingScope = ProfilingScope.PAGE, + ): + if settings.debug.profile_pipeline_timings: + if key not in conv_res.timings.keys(): + conv_res.timings[key] = ProfilingItem(scope=scope) + self.conv_res = conv_res + self.key = key + + def __enter__(self): + if settings.debug.profile_pipeline_timings: + self.start = time.monotonic() + return self + + def __exit__(self, *args): + if settings.debug.profile_pipeline_timings: + elapsed = time.monotonic() - self.start + self.conv_res.timings[self.key].times.append(elapsed) + self.conv_res.timings[self.key].count += 1