mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Refactor and fix profiling codes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
0814f32ae4
commit
79f90b46dc
@ -3,10 +3,9 @@ import re
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
import numpy as np
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -53,6 +52,7 @@ from docling.datamodel.base_models import (
|
|||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import DocumentLimits
|
from docling.datamodel.settings import DocumentLimits
|
||||||
|
from docling.utils.profiling import ProfilingItem
|
||||||
from docling.utils.utils import create_file_hash, create_hash
|
from docling.utils.utils import create_file_hash, create_hash
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -180,29 +180,6 @@ class DocumentFormat(str, Enum):
|
|||||||
V1 = "v1"
|
V1 = "v1"
|
||||||
|
|
||||||
|
|
||||||
class ProfilingScope(str, Enum):
|
|
||||||
PAGE = "page"
|
|
||||||
DOCUMENT = "document"
|
|
||||||
|
|
||||||
|
|
||||||
class ProfilingItem(BaseModel):
|
|
||||||
scope: ProfilingScope
|
|
||||||
count: int = 0
|
|
||||||
times: List[float] = []
|
|
||||||
|
|
||||||
def avg(self) -> float:
|
|
||||||
return np.average(self.times) # type: ignore
|
|
||||||
|
|
||||||
def std(self) -> float:
|
|
||||||
return np.std(self.times) # type: ignore
|
|
||||||
|
|
||||||
def mean(self) -> float:
|
|
||||||
return np.mean(self.times) # type: ignore
|
|
||||||
|
|
||||||
def percentile(self, perc: float) -> float:
|
|
||||||
return np.percentile(self.times, perc) # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
class ConversionResult(BaseModel):
|
class ConversionResult(BaseModel):
|
||||||
input: InputDocument
|
input: InputDocument
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings
|
||||||
@ -34,6 +35,9 @@ class DebugSettings(BaseModel):
|
|||||||
|
|
||||||
profile_pipeline_timings: bool = False
|
profile_pipeline_timings: bool = False
|
||||||
|
|
||||||
|
# Path used to output debug information.
|
||||||
|
debug_output_path: str = str(Path.cwd())
|
||||||
|
|
||||||
|
|
||||||
class AppSettings(BaseSettings):
|
class AppSettings(BaseSettings):
|
||||||
perf: BatchConcurrencySettings
|
perf: BatchConcurrencySettings
|
||||||
|
@ -1,12 +1,10 @@
|
|||||||
import time
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Callable, Iterable, Type
|
from typing import Any, Iterable
|
||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
|
|
||||||
|
|
||||||
class BasePageModel(ABC):
|
class BasePageModel(ABC):
|
||||||
@ -28,28 +26,3 @@ class BaseEnrichmentModel(ABC):
|
|||||||
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
||||||
) -> Iterable[Any]:
|
) -> Iterable[Any]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class TimeRecorder:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
conv_res: ConversionResult,
|
|
||||||
key: str,
|
|
||||||
scope: ProfilingScope = ProfilingScope.PAGE,
|
|
||||||
):
|
|
||||||
if settings.debug.profile_pipeline_timings:
|
|
||||||
if key not in conv_res.timings.keys():
|
|
||||||
conv_res.timings[key] = ProfilingItem(scope=scope)
|
|
||||||
self.conv_res = conv_res
|
|
||||||
self.key = key
|
|
||||||
|
|
||||||
def __enter__(self):
|
|
||||||
if settings.debug.profile_pipeline_timings:
|
|
||||||
self.start = time.monotonic()
|
|
||||||
return self
|
|
||||||
|
|
||||||
def __exit__(self, *args):
|
|
||||||
if settings.debug.profile_pipeline_timings:
|
|
||||||
elapsed = time.monotonic() - self.start
|
|
||||||
self.conv_res.timings[self.key].times.append(elapsed)
|
|
||||||
self.conv_res.timings[self.key].count += 1
|
|
||||||
|
@ -27,7 +27,7 @@ from pydantic import BaseModel, ConfigDict
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
||||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||||
from docling.models.base_model import TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
from docling.utils.utils import create_hash
|
from docling.utils.utils import create_hash
|
||||||
|
|
||||||
|
|
||||||
@ -227,7 +227,7 @@ class GlmModel:
|
|||||||
return ds_doc
|
return ds_doc
|
||||||
|
|
||||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||||
with TimeRecorder(conv_res, "glm"):
|
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
||||||
ds_doc = self._to_legacy_document(conv_res)
|
ds_doc = self._to_legacy_document(conv_res)
|
||||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||||
|
|
||||||
|
@ -6,11 +6,11 @@ import numpy
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult, ProfilingItem
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import TimeRecorder
|
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
from docling.utils.profiling import ProfilingItem, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -18,8 +18,9 @@ from docling.datamodel.base_models import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.utils import layout_utils as lu
|
from docling.utils import layout_utils as lu
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -13,8 +13,9 @@ from docling.datamodel.base_models import (
|
|||||||
TextElement,
|
TextElement,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -5,7 +5,8 @@ from pydantic import BaseModel
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
from docling.models.base_model import BasePageModel
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
class PagePreprocessingOptions(BaseModel):
|
class PagePreprocessingOptions(BaseModel):
|
||||||
|
@ -11,7 +11,8 @@ from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
from docling.models.base_model import BasePageModel
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
class TableStructureModel(BasePageModel):
|
class TableStructureModel(BasePageModel):
|
||||||
|
@ -11,8 +11,8 @@ from docling.datamodel.base_models import OcrCell, Page
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import TimeRecorder
|
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -7,8 +7,8 @@ from docling.datamodel.base_models import OcrCell, Page
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import TimeRecorder
|
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -15,15 +15,11 @@ from docling.datamodel.base_models import (
|
|||||||
ErrorItem,
|
ErrorItem,
|
||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
ConversionResult,
|
|
||||||
InputDocument,
|
|
||||||
ProfilingItem,
|
|
||||||
ProfilingScope,
|
|
||||||
)
|
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BaseEnrichmentModel, TimeRecorder
|
from docling.models.base_model import BaseEnrichmentModel
|
||||||
|
from docling.utils.profiling import ProfilingItem, ProfilingScope, TimeRecorder
|
||||||
from docling.utils.utils import chunkify
|
from docling.utils.utils import chunkify
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -40,13 +36,14 @@ class BasePipeline(ABC):
|
|||||||
|
|
||||||
_log.info(f"Processing document {in_doc.file.name}")
|
_log.info(f"Processing document {in_doc.file.name}")
|
||||||
try:
|
try:
|
||||||
# These steps are building and assembling the structure of the
|
with TimeRecorder(conv_res, "total", scope=ProfilingScope.DOCUMENT):
|
||||||
# output DoclingDocument
|
# These steps are building and assembling the structure of the
|
||||||
conv_res = self._build_document(conv_res)
|
# output DoclingDocument
|
||||||
conv_res = self._assemble_document(conv_res)
|
conv_res = self._build_document(conv_res)
|
||||||
# From this stage, all operations should rely only on conv_res.output
|
conv_res = self._assemble_document(conv_res)
|
||||||
conv_res = self._enrich_document(conv_res)
|
# From this stage, all operations should rely only on conv_res.output
|
||||||
conv_res.status = self._determine_status(conv_res)
|
conv_res = self._enrich_document(conv_res)
|
||||||
|
conv_res.status = self._determine_status(conv_res)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
|
@ -5,10 +5,10 @@ from docling.backend.abstract_backend import (
|
|||||||
DeclarativeDocumentBackend,
|
DeclarativeDocumentBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.models.base_model import TimeRecorder
|
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import BasePipeline
|
||||||
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -7,14 +7,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.models.base_model import TimeRecorder
|
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
@ -28,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
|
|||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -121,7 +121,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||||
with TimeRecorder(conv_res, "init_page"):
|
with TimeRecorder(conv_res, "page_init"):
|
||||||
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||||
if page._backend is not None and page._backend.is_valid():
|
if page._backend is not None and page._backend.is_valid():
|
||||||
page.size = page._backend.get_size()
|
page.size = page._backend.get_size()
|
||||||
|
59
docling/utils/profiling.py
Normal file
59
docling/utils/profiling.py
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
import time
|
||||||
|
from enum import Enum
|
||||||
|
from typing import TYPE_CHECKING, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.datamodel.settings import settings
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
|
|
||||||
|
|
||||||
|
class ProfilingScope(str, Enum):
|
||||||
|
PAGE = "page"
|
||||||
|
DOCUMENT = "document"
|
||||||
|
|
||||||
|
|
||||||
|
class ProfilingItem(BaseModel):
|
||||||
|
scope: ProfilingScope
|
||||||
|
count: int = 0
|
||||||
|
times: List[float] = []
|
||||||
|
|
||||||
|
def avg(self) -> float:
|
||||||
|
return np.average(self.times) # type: ignore
|
||||||
|
|
||||||
|
def std(self) -> float:
|
||||||
|
return np.std(self.times) # type: ignore
|
||||||
|
|
||||||
|
def mean(self) -> float:
|
||||||
|
return np.mean(self.times) # type: ignore
|
||||||
|
|
||||||
|
def percentile(self, perc: float) -> float:
|
||||||
|
return np.percentile(self.times, perc) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
class TimeRecorder:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
conv_res: "ConversionResult",
|
||||||
|
key: str,
|
||||||
|
scope: ProfilingScope = ProfilingScope.PAGE,
|
||||||
|
):
|
||||||
|
if settings.debug.profile_pipeline_timings:
|
||||||
|
if key not in conv_res.timings.keys():
|
||||||
|
conv_res.timings[key] = ProfilingItem(scope=scope)
|
||||||
|
self.conv_res = conv_res
|
||||||
|
self.key = key
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
if settings.debug.profile_pipeline_timings:
|
||||||
|
self.start = time.monotonic()
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, *args):
|
||||||
|
if settings.debug.profile_pipeline_timings:
|
||||||
|
elapsed = time.monotonic() - self.start
|
||||||
|
self.conv_res.timings[self.key].times.append(elapsed)
|
||||||
|
self.conv_res.timings[self.key].count += 1
|
Loading…
Reference in New Issue
Block a user