feat: Add pipeline timings and toggle visualization, establish debug settings (#183)

* Add settings to turn visualization on or off

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Refactor and fix profiling code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Visualization code outputs PNGs to the debug dir

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Optimize imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-30 15:04:19 +01:00
committed by GitHub
parent 94a5290789
commit 2a2c65bf4f
23 changed files with 998 additions and 771 deletions

View File

@@ -189,24 +189,35 @@ class DocumentConverter:
) -> Iterator[ConversionResult]:
assert self.format_to_options is not None
start_time = time.monotonic()
for input_batch in chunkify(
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
# with ThreadPoolExecutor(
# max_workers=settings.perf.doc_batch_concurrency
# ) as pool:
# yield from pool.map(self.process_document, input_batch)
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
for item in map(
partial(self._process_document, raises_on_error=raises_on_error),
input_batch,
):
elapsed = time.monotonic() - start_time
start_time = time.monotonic()
if item is not None:
_log.info(
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
)
yield item
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
start_doc_time = time.time()
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
)
return conv_res
def _execute_pipeline(