Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Author: Christoph Auer, 2024-10-29 15:46:20 +01:00
parent e1b83ec485
commit 3de3f1371c
3 changed files with 17 additions and 10 deletions

File 1 of 3:

@@ -1,5 +1,6 @@
 import logging
 import re
+import time
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -52,7 +53,7 @@ from docling.datamodel.base_models import (
     Page,
 )
 from docling.datamodel.settings import DocumentLimits
-from docling.utils.profiling import ProfilingItem
+from docling.utils.profiling import ProfilingItem, TimeRecorder
 from docling.utils.utils import create_file_hash, create_hash

 if TYPE_CHECKING:
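Note on the newly imported time module: the timing code added later in this commit uses time.monotonic() rather than time.time(). A monotonic clock cannot jump backwards when the system clock is adjusted, so it is the safer choice for measuring elapsed intervals. A minimal sketch of the pattern (do_work is a hypothetical placeholder, not part of docling):

    import time

    def do_work():
        pass  # hypothetical placeholder for the timed section

    start = time.monotonic()
    do_work()
    elapsed = time.monotonic() - start  # unaffected by system clock changes
    print(f"took {elapsed:.2f} sec")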

File 2 of 3:

@@ -189,24 +189,35 @@ class DocumentConverter:
     ) -> Iterator[ConversionResult]:
         assert self.format_to_options is not None

+        start_time = time.monotonic()
+
         for input_batch in chunkify(
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
+
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
             #    max_workers=settings.perf.doc_batch_concurrency
             # ) as pool:
             #   yield from pool.map(self.process_document, input_batch)
             # Note: PDF backends are not thread-safe, thread pool usage was disabled.

             for item in map(
                 partial(self._process_document, raises_on_error=raises_on_error),
                 input_batch,
             ):
+                elapsed = time.monotonic() - start_time
+                start_time = time.monotonic()
+
                 if item is not None:
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
                     yield item
+                else:
+                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")

     def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
         assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
         assert self.allowed_formats is not None
         assert in_doc.format in self.allowed_formats

-        start_doc_time = time.time()
         conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
-        end_doc_time = time.time() - start_doc_time
-
-        _log.info(
-            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
-        )

         return conv_res

     def _execute_pipeline(
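The hunk above moves the per-document log out of _process_document and replaces it with a rolling stopwatch in the batch loop: elapsed is read and start_time re-armed once per yielded item, so each log line covers exactly one document, including the work done lazily inside map(). A self-contained sketch of that pattern (timed_items and the sample data are illustrative, not part of docling):

    import time
    from typing import Iterable, Iterator, Tuple, TypeVar

    T = TypeVar("T")

    def timed_items(items: Iterable[T]) -> Iterator[Tuple[T, float]]:
        # Re-arm the stopwatch after every item so each measurement
        # covers exactly one unit of work, mirroring the loop above.
        start = time.monotonic()
        for item in items:
            elapsed = time.monotonic() - start
            start = time.monotonic()
            yield item, elapsed

    for item, secs in timed_items(range(3)):
        print(f"item {item} took {secs:.2f} sec")

Because map() in Python 3 is lazy, pulling the next item from the iterator is what performs the conversion, so the measured interval captures the conversion itself rather than just iteration overhead.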

File 3 of 3:

@@ -36,7 +36,9 @@ class BasePipeline(ABC):
         _log.info(f"Processing document {in_doc.file.name}")
         try:
-            with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
+            with TimeRecorder(
+                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+            ):
                 # These steps are building and assembling the structure of the
                 # output DoclingDocument
                 conv_res = self._build_document(conv_res)
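Renaming the pipeline-level key from "total" to "pipeline_total" keeps it distinct from the converter-level timing introduced above. TimeRecorder itself lives in docling.utils.profiling and is not shown in this diff; purely as an assumption about its shape, a context manager in its spirit could look like the following (time_recorder, result.timings, and the omitted scope handling are hypothetical):

    import time
    from contextlib import contextmanager
    from types import SimpleNamespace

    @contextmanager
    def time_recorder(result, key: str):
        # Hypothetical stand-in for docling's TimeRecorder: accumulate
        # elapsed wall time under `key` on the result object.
        start = time.monotonic()
        try:
            yield
        finally:
            elapsed = time.monotonic() - start
            result.timings[key] = result.timings.get(key, 0.0) + elapsed

    res = SimpleNamespace(timings={})
    with time_recorder(res, "pipeline_total"):
        time.sleep(0.1)
    print(res.timings)  # e.g. {'pipeline_total': 0.10}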