Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-31 14:34:40 +00:00 · 2024-10-28 15:04:09 +01:00 · 2024-10-28 15:04:09 +01:00 · 0814f32ae4
commit 0814f32ae4
parent a00f01cf07
15 changed files with 644 additions and 527 deletions
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -6,6 +6,7 @@ from pathlib import Path, PurePath
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
 import filetype
 import numpy as np
 from docling_core.types.doc import (
    DocItem,
    DocItemLabel,
@ -179,6 +180,29 @@ class DocumentFormat(str, Enum):
    V1 = "v1"
 class ProfilingScope(str, Enum):
    """Granularity at which a profiled timing sample is taken.

    Members are plain strings (the class mixes in ``str``), so a member
    compares equal to its literal value.
    """

    # One sample per processed page.
    PAGE = "page"
    # One sample per whole document.
    DOCUMENT = "document"
 class ProfilingItem(BaseModel):
    """Timing samples collected under one profiling key.

    The statistics helpers below return ``nan`` when no sample has been
    recorded yet, and always return a built-in ``float`` rather than a numpy
    scalar.
    """

    # Granularity (page or document) the samples were taken at.
    scope: ProfilingScope
    # Number of recorded samples; incremented alongside each append to `times`.
    count: int = 0
    # Elapsed durations, one entry per recorded sample.
    # NOTE(review): relies on pydantic copying this default per instance - confirm.
    times: List[float] = []

    def avg(self) -> float:
        """Return the average of the recorded times, or ``nan`` if there are none."""
        if not self.times:
            # Guard explicitly so numpy does not warn about an empty slice.
            return float("nan")
        return float(np.average(self.times))

    def std(self) -> float:
        """Return the standard deviation of the recorded times, or ``nan`` if there are none."""
        if not self.times:
            return float("nan")
        return float(np.std(self.times))

    def mean(self) -> float:
        """Return the arithmetic mean of the recorded times, or ``nan`` if there are none."""
        if not self.times:
            return float("nan")
        return float(np.mean(self.times))

    def percentile(self, perc: float) -> float:
        """Return the ``perc``-th percentile of the recorded times.

        ``perc`` is forwarded unchanged to ``np.percentile``. Returns ``nan``
        if no time has been recorded.
        """
        if not self.times:
            return float("nan")
        return float(np.percentile(self.times, perc))
 class ConversionResult(BaseModel):
    input: InputDocument
@ -187,6 +211,7 @@ class ConversionResult(BaseModel):
    pages: List[Page] = []
    assembled: AssembledUnit = AssembledUnit()
    timings: Dict[str, ProfilingItem] = {}
    document: DoclingDocument = _EMPTY_DOCLING_DOC
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@ -32,6 +32,8 @@ class DebugSettings(BaseModel):
    visualize_layout: bool = False
    visualize_tables: bool = False
    profile_pipeline_timings: bool = False
 class AppSettings(BaseSettings):
    perf: BatchConcurrencySettings
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@ -1,14 +1,19 @@
 import time
 from abc import ABC, abstractmethod
-from typing import Any, Iterable
+from typing import Any, Callable, Iterable, Type
 from docling_core.types.doc import DoclingDocument, NodeItem
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope
 from docling.datamodel.settings import settings
 class BasePageModel(ABC):
    @abstractmethod
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        pass
@ -23,3 +28,28 @@ class BaseEnrichmentModel(ABC):
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        pass
 class TimeRecorder:
    """Context manager that records the elapsed time of a block in ``conv_res.timings``.

    Recording happens only when ``settings.debug.profile_pipeline_timings`` is
    set. The flag is read once, at construction, so a recorder behaves
    consistently even if the setting is changed while the block is running.
    Elapsed time is measured with ``time.monotonic()``.
    """

    def __init__(
        self,
        conv_res: ConversionResult,
        key: str,
        scope: ProfilingScope = ProfilingScope.PAGE,
    ):
        """Bind the recorder to ``conv_res`` and the timing entry ``key``.

        When profiling is enabled and ``key`` has no entry yet, a new
        ``ProfilingItem`` with the given ``scope`` is created. An existing
        entry is left untouched (its scope is not updated).
        """
        # Always set, so the instance has the same attributes whether or not
        # profiling is enabled.
        self.conv_res = conv_res
        self.key = key
        # Snapshot of the global flag; __enter__/__exit__ use this instead of
        # re-reading settings, which could change in between.
        self._enabled = bool(settings.debug.profile_pipeline_timings)
        if self._enabled and key not in conv_res.timings:
            conv_res.timings[key] = ProfilingItem(scope=scope)

    def __enter__(self):
        """Start the clock (if profiling is enabled) and return the recorder."""
        if self._enabled:
            self.start = time.monotonic()
        return self

    def __exit__(self, *args):
        """Append the elapsed time to the entry and increment its sample count.

        The sample is recorded even if the block raised. Nothing is returned,
        so an exception from the block is not suppressed.
        """
        if self._enabled:
            elapsed = time.monotonic() - self.start
            item = self.conv_res.timings[self.key]
            item.times.append(elapsed)
            item.count += 1
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -10,12 +10,14 @@ from rtree import index
 from scipy.ndimage import find_objects, label
 from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.models.base_model import BasePageModel
 _log = logging.getLogger(__name__)
-class BaseOcrModel:
+class BaseOcrModel(BasePageModel):
    def __init__(self, enabled: bool, options: OcrOptions):
        self.enabled = enabled
        self.options = options
@ -133,5 +135,7 @@ class BaseOcrModel:
        image.show()
    @abstractmethod
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        pass
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -27,6 +27,7 @@ from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.models.base_model import TimeRecorder
 from docling.utils.utils import create_hash
@ -226,6 +227,7 @@ class GlmModel:
        return ds_doc
    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
        with TimeRecorder(conv_res, "glm"):
            ds_doc = self._to_legacy_document(conv_res)
            ds_doc_dict = ds_doc.model_dump(by_alias=True)
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@ -1,12 +1,15 @@
 import logging
 import time
 from typing import Iterable
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.document import ConversionResult, ProfilingItem
 from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import TimeRecorder
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
@ -34,17 +37,21 @@ class EasyOcrModel(BaseOcrModel):
                download_enabled=self.options.download_enabled,
            )
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        if not self.enabled:
            yield from page_batch
            return
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):
                    ocr_rects = self.get_ocr_rects(page)
                    all_ocr_cells = []
@ -81,7 +88,9 @@ class EasyOcrModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)
                    ## Remove OCR cells which overlap with programmatic cells.
-                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                    filtered_ocr_cells = self.filter_ocr_cells(
                        all_ocr_cells, page.cells
                    )
                    page.cells.extend(filtered_ocr_cells)
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -16,8 +16,9 @@ from docling.datamodel.base_models import (
    LayoutPrediction,
    Page,
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, TimeRecorder
 from docling.utils import layout_utils as lu
 _log = logging.getLogger(__name__)
@ -272,12 +273,16 @@ class LayoutModel(BasePageModel):
        return clusters_out_new, cells_out_new
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "layout"):
                    assert page.size is not None
                    clusters = []
@ -285,7 +290,10 @@ class LayoutModel(BasePageModel):
                        self.layout_predictor.predict(page.get_image(scale=1.0))
                    ):
                        label = DocItemLabel(
-                        pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                            pred_item["label"]
                            .lower()
                            .replace(" ", "_")
                            .replace("-", "_")
                        )  # Temporary, until docling-ibm-model uses docling-core types
                        cluster = Cluster(
                            id=ix,
@ -330,7 +338,9 @@ class LayoutModel(BasePageModel):
                                )
                                for tc in c.cells:  # [:1]:
                                    x0, y0, x1, y1 = tc.bbox.as_tuple()
-                                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+                                    draw.rectangle(
                                        [(x0, y0), (x1, y1)], outline=cell_color
                                    )
                            if show:
                                image.show()
@ -340,9 +350,9 @@ class LayoutModel(BasePageModel):
                        clusters, page.cells, page.size.height
                    )
                    page.predictions.layout = LayoutPrediction(clusters=clusters)
                if settings.debug.visualize_layout:
                    draw_clusters_and_cells()
                page.predictions.layout = LayoutPrediction(clusters=clusters)
                yield page
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@ -12,7 +12,8 @@ from docling.datamodel.base_models import (
    Table,
    TextElement,
 )
-from docling.models.base_model import BasePageModel
+from docling.datamodel.document import ConversionResult
 from docling.models.base_model import BasePageModel, TimeRecorder
 from docling.models.layout_model import LayoutModel
 _log = logging.getLogger(__name__)
@ -51,12 +52,16 @@ class PageAssembleModel(BasePageModel):
        return sanitized_text.strip()  # Strip any leading or trailing whitespace
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "page_assemble"):
                    assert page.predictions.layout is not None
                    # assembles some JSON output page by page.
@ -112,11 +117,9 @@ class PageAssembleModel(BasePageModel):
                        elif cluster.label == LayoutModel.FIGURE_LABEL:
                            fig = None
                            if page.predictions.figures_classification:
-                            fig = (
+                                fig = page.predictions.figures_classification.figure_map.get(
                                page.predictions.figures_classification.figure_map.get(
                                    cluster.id, None
                                )
                            )
                            if (
                                not fig
                            ):  # fallback: add figure without classification, if it isn't present
@ -133,11 +136,9 @@ class PageAssembleModel(BasePageModel):
                        elif cluster.label == LayoutModel.FORMULA_LABEL:
                            equation = None
                            if page.predictions.equations_prediction:
-                            equation = (
+                                equation = page.predictions.equations_prediction.equation_map.get(
                                page.predictions.equations_prediction.equation_map.get(
                                    cluster.id, None
                                )
                            )
                            if (
                                not equation
                            ):  # fallback: add empty formula, if it isn't present
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@ -4,7 +4,8 @@ from PIL import ImageDraw
 from pydantic import BaseModel
 from docling.datamodel.base_models import Page
-from docling.models.base_model import BasePageModel
+from docling.datamodel.document import ConversionResult
 from docling.models.base_model import BasePageModel, TimeRecorder
 class PagePreprocessingOptions(BaseModel):
@ -15,12 +16,15 @@ class PagePreprocessingModel(BasePageModel):
    def __init__(self, options: PagePreprocessingOptions):
        self.options = options
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "page_parse"):
                    page = self._populate_page_images(page)
                    page = self._parse_page_cells(page)
                yield page
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -8,9 +8,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
 from PIL import ImageDraw
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, TimeRecorder
 class TableStructureModel(BasePageModel):
@ -64,7 +65,9 @@ class TableStructureModel(BasePageModel):
        image.show()
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        if not self.enabled:
            yield from page_batch
@ -75,11 +78,14 @@ class TableStructureModel(BasePageModel):
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "table_structure"):
                    assert page.predictions.layout is not None
                    assert page.size is not None
-                page.predictions.tablestructure = TableStructurePrediction()  # dummy
+                    page.predictions.tablestructure = (
                        TableStructurePrediction()
                    )  # dummy
                    in_tables = [
                        (
@ -121,7 +127,9 @@ class TableStructureModel(BasePageModel):
                        "width": page.size.width * self.scale,
                        "height": page.size.height * self.scale,
                    }
-                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+                    page_input["image"] = numpy.asarray(
                        page.get_image(scale=self.scale)
                    )
                    table_clusters, table_bboxes = zip(*in_tables)
@ -138,7 +146,9 @@ class TableStructureModel(BasePageModel):
                                    the_bbox = BoundingBox.model_validate(
                                        element["bbox"]
                                    ).scaled(1 / self.scale)
-                                text_piece = page._backend.get_text_in_rect(the_bbox)
+                                    text_piece = page._backend.get_text_in_rect(
                                        the_bbox
                                    )
                                    element["bbox"]["token"] = text_piece
                                tc = TableCell.model_validate(element)
@ -149,7 +159,9 @@ class TableStructureModel(BasePageModel):
                            # Retrieving cols/rows, after post processing:
                            num_rows = table_out["predict_details"]["num_rows"]
                            num_cols = table_out["predict_details"]["num_cols"]
-                        otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
+                            otsl_seq = table_out["predict_details"]["prediction"][
                                "rs_seq"
                            ]
                            tbl = Table(
                                otsl_seq=otsl_seq,
@ -162,9 +174,9 @@ class TableStructureModel(BasePageModel):
                                label=DocItemLabel.TABLE,
                            )
-                        page.predictions.tablestructure.table_map[table_cluster.id] = (
+                            page.predictions.tablestructure.table_map[
-                            tbl
+                                table_cluster.id
-                        )
+                            ] = tbl
                    # For debugging purposes:
                    if settings.debug.visualize_tables:
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -8,8 +8,10 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import TimeRecorder
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
@ -103,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
        return df_filtered
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        if not self.enabled:
            yield from page_batch
@ -114,6 +118,8 @@ class TesseractOcrCliModel(BaseOcrModel):
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):
                    ocr_rects = self.get_ocr_rects(page)
                    all_ocr_cells = []
@ -165,7 +171,9 @@ class TesseractOcrCliModel(BaseOcrModel):
                            all_ocr_cells.append(cell)
                    ## Remove OCR cells which overlap with programmatic cells.
-                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                    filtered_ocr_cells = self.filter_ocr_cells(
                        all_ocr_cells, page.cells
                    )
                    page.cells.extend(filtered_ocr_cells)
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -4,8 +4,10 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import TimeRecorder
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
@ -62,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
            # Finalize the tesseractAPI
            self.reader.End()
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        if not self.enabled:
            yield from page_batch
@ -73,6 +77,8 @@ class TesseractOcrModel(BaseOcrModel):
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):
                    assert self.reader is not None
                    ocr_rects = self.get_ocr_rects(page)
@ -95,7 +101,9 @@ class TesseractOcrModel(BaseOcrModel):
                        cells = []
                        for ix, (im, box, _, _) in enumerate(boxes):
                            # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                        self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+                            self.reader.SetRectangle(
                                box["x"], box["y"], box["w"], box["h"]
                            )
                            # Extract text within the bounding box
                            text = self.reader.GetUTF8Text().strip()
@ -121,7 +129,9 @@ class TesseractOcrModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)
                    ## Remove OCR cells which overlap with programmatic cells.
-                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                    filtered_ocr_cells = self.filter_ocr_cells(
                        all_ocr_cells, page.cells
                    )
                    page.cells.extend(filtered_ocr_cells)
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@ -15,10 +15,15 @@ from docling.datamodel.base_models import (
    ErrorItem,
    Page,
 )
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import (
    ConversionResult,
    InputDocument,
    ProfilingItem,
    ProfilingScope,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BaseEnrichmentModel
+from docling.models.base_model import BaseEnrichmentModel, TimeRecorder
 from docling.utils.utils import chunkify
 _log = logging.getLogger(__name__)
@ -37,11 +42,11 @@ class BasePipeline(ABC):
        try:
            # These steps are building and assembling the structure of the
            # output DoclingDocument
-            conv_res = self._build_document(in_doc, conv_res)
+            conv_res = self._build_document(conv_res)
-            conv_res = self._assemble_document(in_doc, conv_res)
+            conv_res = self._assemble_document(conv_res)
            # From this stage, all operations should rely only on conv_res.output
-            conv_res = self._enrich_document(in_doc, conv_res)
+            conv_res = self._enrich_document(conv_res)
-            conv_res.status = self._determine_status(in_doc, conv_res)
+            conv_res.status = self._determine_status(conv_res)
        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            if raises_on_error:
@ -50,19 +55,13 @@ class BasePipeline(ABC):
        return conv_res
    @abstractmethod
-    def _build_document(
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        pass
-    def _assemble_document(
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        return conv_res
-    def _enrich_document(
+    def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        def _filter_elements(
            doc: DoclingDocument, model: BaseEnrichmentModel
@ -71,6 +70,7 @@ class BasePipeline(ABC):
                if model.is_processable(doc=doc, element=element):
                    yield element
        with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
            for model in self.enrichment_pipe:
                for element_batch in chunkify(
                    _filter_elements(conv_res.document, model),
@ -86,9 +86,7 @@ class BasePipeline(ABC):
        return conv_res
    @abstractmethod
-    def _determine_status(
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        pass
    @classmethod
@ -110,40 +108,44 @@ class BasePipeline(ABC):
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
-    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def _apply_on_pages(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for model in self.build_pipe:
-            page_batch = model(page_batch)
+            page_batch = model(conv_res, page_batch)
        yield from page_batch
-    def _build_document(
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
-        if not isinstance(in_doc._backend, PdfDocumentBackend):
+        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
            raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
                f"Can not convert this with a PDF pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )
            # conv_res.status = ConversionStatus.FAILURE
            # return conv_res
-        for i in range(0, in_doc.page_count):
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            for i in range(0, conv_res.input.page_count):
                conv_res.pages.append(Page(page_no=i))
            try:
                # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
+                for page_batch in chunkify(
                    conv_res.pages, settings.perf.page_batch_size
                ):
                    start_pb_time = time.time()
                    # 1. Initialise the page resources
                    init_pages = map(
-                    functools.partial(self.initialize_page, in_doc), page_batch
+                        functools.partial(self.initialize_page, conv_res), page_batch
                    )
                    # 2. Run pipeline stages
-                pipeline_pages = self._apply_on_pages(init_pages)
+                    pipeline_pages = self._apply_on_pages(conv_res, init_pages)
                    for p in pipeline_pages:  # Must exhaust!
                        pass
@ -155,21 +157,19 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                conv_res.status = ConversionStatus.FAILURE
                trace = "\n".join(traceback.format_exception(e))
                _log.warning(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                    f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
                    f"{trace}"
                )
                raise e
            finally:
                # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
+                if conv_res.input._backend:
-                in_doc._backend.unload()
+                    conv_res.input._backend.unload()
        return conv_res
-    def _determine_status(
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        status = ConversionStatus.SUCCESS
        for page in conv_res.pages:
            if page._backend is None or not page._backend.is_valid():
@ -186,5 +186,5 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
    # Initialise and load resources for a page
    @abstractmethod
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        pass
--- a/docling/pipeline/simple_pipeline.py
+++ b/docling/pipeline/simple_pipeline.py
@ -5,8 +5,9 @@ from docling.backend.abstract_backend import (
    DeclarativeDocumentBackend,
 )
 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.models.base_model import TimeRecorder
 from docling.pipeline.base_pipeline import BasePipeline
 _log = logging.getLogger(__name__)
@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
    def __init__(self, pipeline_options: PipelineOptions):
        super().__init__(pipeline_options)
-    def _build_document(
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
-        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+        if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
            raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
                f"Can not convert this with simple pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )
@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
        # Instead of running a page-level pipeline to build up the document structure,
        # the backend is expected to be of type DeclarativeDocumentBackend, which can output
        # a DoclingDocument straight.
-
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-        conv_res.document = in_doc._backend.convert()
+            conv_res.document = conv_res.input._backend.convert()
        return conv_res
-    def _determine_status(
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        # This is called only if the previous steps didn't raise.
        # Since we don't have anything else to evaluate, we can
        # safely return SUCCESS.
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -7,13 +7,14 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import AssembledUnit, Page
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
 from docling.models.base_model import TimeRecorder
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
@ -119,20 +120,20 @@ class StandardPdfPipeline(PaginatedPipeline):
            )
        return None
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)  # type: ignore
+        with TimeRecorder(conv_res, "init_page"):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()
        return page
-    def _assemble_document(
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        all_elements = []
        all_headers = []
        all_body = []
        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled is not None:
                    for el in p.assembled.body:
@ -185,7 +186,9 @@ class StandardPdfPipeline(PaginatedPipeline):
                        )
                        cropped_im = page.image.crop(crop_bbox.as_tuple())
-                    element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
+                        element.image = ImageRef.from_pil(
                            cropped_im, dpi=int(72 * scale)
                        )
        return conv_res