Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Author: Christoph Auer, 2024-10-29 15:46:20 +01:00
parent e1b83ec485
commit 3de3f1371c
3 changed files with 17 additions and 10 deletions

File 1 of 3:

@@ -1,5 +1,6 @@
 import logging
 import re
+import time
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -52,7 +53,7 @@ from docling.datamodel.base_models import (
     Page,
 )
 from docling.datamodel.settings import DocumentLimits
-from docling.utils.profiling import ProfilingItem
+from docling.utils.profiling import ProfilingItem, TimeRecorder
 from docling.utils.utils import create_file_hash, create_hash

 if TYPE_CHECKING:
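Note on the newly imported time module: the timing code added later in this commit uses time.monotonic() rather than time.time(). A monotonic clock cannot jump backwards when the system clock is adjusted, so it is the safer choice for measuring elapsed intervals. A minimal sketch of the pattern (do_work is a hypothetical placeholder, not part of docling):

    import time

    def do_work():
        pass  # hypothetical placeholder for the timed section

    start = time.monotonic()
    do_work()
    elapsed = time.monotonic() - start  # unaffected by system clock changes
    print(f"took {elapsed:.2f} sec")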

File 2 of 3:

@@ -189,24 +189,35 @@ class DocumentConverter:
     ) -> Iterator[ConversionResult]:
         assert self.format_to_options is not None

+        start_time = time.monotonic()
+
         for input_batch in chunkify(
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
+
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
             #    max_workers=settings.perf.doc_batch_concurrency
             # ) as pool:
             #   yield from pool.map(self.process_document, input_batch)
             # Note: PDF backends are not thread-safe, thread pool usage was disabled.

             for item in map(
                 partial(self._process_document, raises_on_error=raises_on_error),
                 input_batch,
             ):
+                elapsed = time.monotonic() - start_time
+                start_time = time.monotonic()
+
                 if item is not None:
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
                     yield item
+                else:
+                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")

     def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
         assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
         assert self.allowed_formats is not None
         assert in_doc.format in self.allowed_formats

-        start_doc_time = time.time()
         conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
-        end_doc_time = time.time() - start_doc_time
-
-        _log.info(
-            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
-        )

         return conv_res

     def _execute_pipeline(
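The hunk above moves the per-document log out of _process_document and replaces it with a rolling stopwatch in the batch loop: elapsed is read and start_time re-armed once per yielded item, so each log line covers exactly one document, including the work done lazily inside map(). A self-contained sketch of that pattern (timed_items and the sample data are illustrative, not part of docling):

    import time
    from typing import Iterable, Iterator, Tuple, TypeVar

    T = TypeVar("T")

    def timed_items(items: Iterable[T]) -> Iterator[Tuple[T, float]]:
        # Re-arm the stopwatch after every item so each measurement
        # covers exactly one unit of work, mirroring the loop above.
        start = time.monotonic()
        for item in items:
            elapsed = time.monotonic() - start
            start = time.monotonic()
            yield item, elapsed

    for item, secs in timed_items(range(3)):
        print(f"item {item} took {secs:.2f} sec")

Because map() in Python 3 is lazy, pulling the next item from the iterator is what performs the conversion, so the measured interval captures the conversion itself rather than just iteration overhead.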

File 3 of 3:

@@ -36,7 +36,9 @@ class BasePipeline(ABC):
         _log.info(f"Processing document {in_doc.file.name}")
         try:
-            with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
+            with TimeRecorder(
+                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+            ):
                 # These steps are building and assembling the structure of the
                 # output DoclingDocument
                 conv_res = self._build_document(conv_res)
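Renaming the pipeline-level key from "total" to "pipeline_total" keeps it distinct from the converter-level timing introduced above. TimeRecorder itself lives in docling.utils.profiling and is not shown in this diff; purely as an assumption about its shape, a context manager in its spirit could look like the following (time_recorder, result.timings, and the omitted scope handling are hypothetical):

    import time
    from contextlib import contextmanager
    from types import SimpleNamespace

    @contextmanager
    def time_recorder(result, key: str):
        # Hypothetical stand-in for docling's TimeRecorder: accumulate
        # elapsed wall time under `key` on the result object.
        start = time.monotonic()
        try:
            yield
        finally:
            elapsed = time.monotonic() - start
            result.timings[key] = result.timings.get(key, 0.0) + elapsed

    res = SimpleNamespace(timings={})
    with time_recorder(res, "pipeline_total"):
        time.sleep(0.1)
    print(res.timings)  # e.g. {'pipeline_total': 0.10}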