mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 14:18:30 +00:00
feat: Add pipeline timings and toggle visualization, establish debug settings (#183)
* Add settings to turn visualization on or off
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add profiling code to all models
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Refactor and fix profiling codes
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Visualization codes output PNG to debug dir
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for time logging
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Optimize imports
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update lockfile
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add start_timestamps to ProfilingItem
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -189,24 +189,35 @@ class DocumentConverter:
|
||||
) -> Iterator[ConversionResult]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
start_time = time.monotonic()
|
||||
|
||||
for input_batch in chunkify(
|
||||
conv_input.docs(self.format_to_options),
|
||||
settings.perf.doc_batch_size, # pass format_options
|
||||
):
|
||||
_log.info(f"Going to convert document batch...")
|
||||
|
||||
# parallel processing only within input_batch
|
||||
# with ThreadPoolExecutor(
|
||||
# max_workers=settings.perf.doc_batch_concurrency
|
||||
# ) as pool:
|
||||
# yield from pool.map(self.process_document, input_batch)
|
||||
|
||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||
|
||||
for item in map(
|
||||
partial(self._process_document, raises_on_error=raises_on_error),
|
||||
input_batch,
|
||||
):
|
||||
elapsed = time.monotonic() - start_time
|
||||
start_time = time.monotonic()
|
||||
|
||||
if item is not None:
|
||||
_log.info(
|
||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||
)
|
||||
yield item
|
||||
else:
|
||||
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
||||
|
||||
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
|
||||
assert self.format_to_options is not None
|
||||
@@ -237,15 +248,8 @@ class DocumentConverter:
|
||||
assert self.allowed_formats is not None
|
||||
assert in_doc.format in self.allowed_formats
|
||||
|
||||
start_doc_time = time.time()
|
||||
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
end_doc_time = time.time() - start_doc_time
|
||||
_log.info(
|
||||
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
def _execute_pipeline(
|
||||
|
||||
Reference in New Issue
Block a user