diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index be4e9a12..ef3ee85a 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -1,5 +1,6 @@ import logging import re +import time from enum import Enum from io import BytesIO from pathlib import Path, PurePath @@ -52,7 +53,7 @@ from docling.datamodel.base_models import ( Page, ) from docling.datamodel.settings import DocumentLimits -from docling.utils.profiling import ProfilingItem +from docling.utils.profiling import ProfilingItem, TimeRecorder from docling.utils.utils import create_file_hash, create_hash if TYPE_CHECKING: diff --git a/docling/document_converter.py b/docling/document_converter.py index 0fae96b0..d6d4a630 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -189,24 +189,35 @@ class DocumentConverter: ) -> Iterator[ConversionResult]: assert self.format_to_options is not None + start_time = time.monotonic() + for input_batch in chunkify( conv_input.docs(self.format_to_options), settings.perf.doc_batch_size, # pass format_options ): _log.info(f"Going to convert document batch...") + # parallel processing only within input_batch # with ThreadPoolExecutor( # max_workers=settings.perf.doc_batch_concurrency # ) as pool: # yield from pool.map(self.process_document, input_batch) - # Note: PDF backends are not thread-safe, thread pool usage was disabled. + for item in map( partial(self._process_document, raises_on_error=raises_on_error), input_batch, ): + elapsed = time.monotonic() - start_time + start_time = time.monotonic() + if item is not None: + _log.info( + f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec." + ) yield item + else: + _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.") def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]: assert self.format_to_options is not None @@ -237,15 +248,8 @@ class DocumentConverter: assert self.allowed_formats is not None assert in_doc.format in self.allowed_formats - start_doc_time = time.time() - conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) - end_doc_time = time.time() - start_doc_time - _log.info( - f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds." - ) - return conv_res def _execute_pipeline( diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 3880fe3e..6bfaea53 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -36,7 +36,9 @@ class BasePipeline(ABC): _log.info(f"Processing document {in_doc.file.name}") try: - with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT): + with TimeRecorder( + conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT + ): # These steps are building and assembling the structure of the # output DoclingDocument conv_res = self._build_document(conv_res)