mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Fixes for time logging
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
e1b83ec485
commit
3de3f1371c
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
@ -52,7 +53,7 @@ from docling.datamodel.base_models import (
|
|||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import DocumentLimits
|
from docling.datamodel.settings import DocumentLimits
|
||||||
from docling.utils.profiling import ProfilingItem
|
from docling.utils.profiling import ProfilingItem, TimeRecorder
|
||||||
from docling.utils.utils import create_file_hash, create_hash
|
from docling.utils.utils import create_file_hash, create_hash
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -189,24 +189,35 @@ class DocumentConverter:
|
|||||||
) -> Iterator[ConversionResult]:
|
) -> Iterator[ConversionResult]:
|
||||||
assert self.format_to_options is not None
|
assert self.format_to_options is not None
|
||||||
|
|
||||||
|
start_time = time.monotonic()
|
||||||
|
|
||||||
for input_batch in chunkify(
|
for input_batch in chunkify(
|
||||||
conv_input.docs(self.format_to_options),
|
conv_input.docs(self.format_to_options),
|
||||||
settings.perf.doc_batch_size, # pass format_options
|
settings.perf.doc_batch_size, # pass format_options
|
||||||
):
|
):
|
||||||
_log.info(f"Going to convert document batch...")
|
_log.info(f"Going to convert document batch...")
|
||||||
|
|
||||||
# parallel processing only within input_batch
|
# parallel processing only within input_batch
|
||||||
# with ThreadPoolExecutor(
|
# with ThreadPoolExecutor(
|
||||||
# max_workers=settings.perf.doc_batch_concurrency
|
# max_workers=settings.perf.doc_batch_concurrency
|
||||||
# ) as pool:
|
# ) as pool:
|
||||||
# yield from pool.map(self.process_document, input_batch)
|
# yield from pool.map(self.process_document, input_batch)
|
||||||
|
|
||||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||||
|
|
||||||
for item in map(
|
for item in map(
|
||||||
partial(self._process_document, raises_on_error=raises_on_error),
|
partial(self._process_document, raises_on_error=raises_on_error),
|
||||||
input_batch,
|
input_batch,
|
||||||
):
|
):
|
||||||
|
elapsed = time.monotonic() - start_time
|
||||||
|
start_time = time.monotonic()
|
||||||
|
|
||||||
if item is not None:
|
if item is not None:
|
||||||
|
_log.info(
|
||||||
|
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||||
|
)
|
||||||
yield item
|
yield item
|
||||||
|
else:
|
||||||
|
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
||||||
|
|
||||||
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
|
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
|
||||||
assert self.format_to_options is not None
|
assert self.format_to_options is not None
|
||||||
@ -237,15 +248,8 @@ class DocumentConverter:
|
|||||||
assert self.allowed_formats is not None
|
assert self.allowed_formats is not None
|
||||||
assert in_doc.format in self.allowed_formats
|
assert in_doc.format in self.allowed_formats
|
||||||
|
|
||||||
start_doc_time = time.time()
|
|
||||||
|
|
||||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||||
|
|
||||||
end_doc_time = time.time() - start_doc_time
|
|
||||||
_log.info(
|
|
||||||
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
|
||||||
)
|
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _execute_pipeline(
|
def _execute_pipeline(
|
||||||
|
@ -36,7 +36,9 @@ class BasePipeline(ABC):
|
|||||||
|
|
||||||
_log.info(f"Processing document {in_doc.file.name}")
|
_log.info(f"Processing document {in_doc.file.name}")
|
||||||
try:
|
try:
|
||||||
with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
|
with TimeRecorder(
|
||||||
|
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
|
||||||
|
):
|
||||||
# These steps are building and assembling the structure of the
|
# These steps are building and assembling the structure of the
|
||||||
# output DoclingDocument
|
# output DoclingDocument
|
||||||
conv_res = self._build_document(conv_res)
|
conv_res = self._build_document(conv_res)
|
||||||
|
Loading…
Reference in New Issue
Block a user