mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Add profiling code to all models
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
a00f01cf07
commit
0814f32ae4
@ -6,6 +6,7 @@ from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
import filetype
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
@ -179,6 +180,29 @@ class DocumentFormat(str, Enum):
|
||||
V1 = "v1"
|
||||
|
||||
|
||||
class ProfilingScope(str, Enum):
    """Granularity at which a profiling measurement is recorded."""

    # Measurement taken once per processed page (the default scope used
    # by the page-level models).
    PAGE = "page"

    # Measurement taken once per document (e.g. build / assemble / enrich).
    DOCUMENT = "document"
|
||||
|
||||
|
||||
class ProfilingItem(BaseModel):
    """Timing samples collected for one profiled pipeline step.

    Instances are stored in ``ConversionResult.timings`` under the step's key
    and are filled in by ``TimeRecorder``.

    All statistics helpers return a plain Python ``float``. When no samples
    have been recorded yet they return NaN instead of delegating to NumPy,
    which would emit "empty slice" warnings (or, for percentiles on some
    NumPy versions, raise).
    """

    # Whether the samples were taken per page or per document.
    scope: ProfilingScope
    # Number of samples recorded; incremented together with ``times``.
    count: int = 0
    # Elapsed time of each sample, in seconds.
    times: List[float] = []

    def avg(self) -> float:
        """Return the average of the recorded times (NaN if there are none)."""
        if not self.times:
            return float("nan")
        return float(np.average(self.times))

    def std(self) -> float:
        """Return the standard deviation of the recorded times (NaN if none)."""
        if not self.times:
            return float("nan")
        return float(np.std(self.times))

    def mean(self) -> float:
        """Return the arithmetic mean of the recorded times (NaN if none)."""
        if not self.times:
            return float("nan")
        return float(np.mean(self.times))

    def percentile(self, perc: float) -> float:
        """Return the ``perc``-th percentile of the recorded times.

        Args:
            perc: Percentile to compute, in the range 0-100 as expected by
                ``numpy.percentile``.

        Returns:
            The requested percentile, or NaN if no samples were recorded.
        """
        if not self.times:
            return float("nan")
        return float(np.percentile(self.times, perc))
|
||||
|
||||
|
||||
class ConversionResult(BaseModel):
|
||||
input: InputDocument
|
||||
|
||||
@ -187,6 +211,7 @@ class ConversionResult(BaseModel):
|
||||
|
||||
pages: List[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
|
||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
|
||||
|
@ -32,6 +32,8 @@ class DebugSettings(BaseModel):
|
||||
visualize_layout: bool = False
|
||||
visualize_tables: bool = False
|
||||
|
||||
profile_pipeline_timings: bool = False
|
||||
|
||||
|
||||
class AppSettings(BaseSettings):
|
||||
perf: BatchConcurrencySettings
|
||||
|
@ -1,14 +1,19 @@
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Iterable
|
||||
from typing import Any, Callable, Iterable, Type
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope
|
||||
from docling.datamodel.settings import settings
|
||||
|
||||
|
||||
class BasePageModel(ABC):
|
||||
@abstractmethod
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
pass
|
||||
|
||||
|
||||
@ -23,3 +28,28 @@ class BaseEnrichmentModel(ABC):
|
||||
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
||||
) -> Iterable[Any]:
|
||||
pass
|
||||
|
||||
|
||||
class TimeRecorder:
    """Context manager that records how long its ``with`` body takes.

    When ``settings.debug.profile_pipeline_timings`` is enabled, the elapsed
    time (measured with ``time.monotonic``) is appended to
    ``conv_res.timings[key].times`` and the entry's ``count`` is incremented.
    When profiling is disabled the recorder does nothing.

    The profiling flag is read once, at construction time, so that toggling
    the global setting while a recorder is active cannot leave it in a
    half-initialised state.

    Args:
        conv_res: Conversion result whose ``timings`` mapping receives the
            sample.
        key: Name of the profiled step; used as the key in
            ``conv_res.timings``.
        scope: Scope assigned to the ``ProfilingItem`` if one has to be
            created for ``key``. An already existing entry keeps its scope.
    """

    def __init__(
        self,
        conv_res: ConversionResult,
        key: str,
        scope: ProfilingScope = ProfilingScope.PAGE,
    ):
        # Snapshot the flag: __enter__/__exit__ must agree with __init__.
        self._enabled = bool(settings.debug.profile_pipeline_timings)
        self.conv_res = conv_res
        self.key = key
        self.start = None  # set by __enter__ when profiling is enabled

        if self._enabled and key not in conv_res.timings:
            conv_res.timings[key] = ProfilingItem(scope=scope)

    def __enter__(self) -> "TimeRecorder":
        if self._enabled:
            self.start = time.monotonic()
        return self

    def __exit__(self, *args) -> None:
        # Runs on both normal and exceptional exit, so a failing body is
        # still timed; returning None lets any exception propagate.
        if self._enabled and self.start is not None:
            elapsed = time.monotonic() - self.start
            item = self.conv_res.timings[self.key]
            item.times.append(elapsed)
            item.count += 1
|
||||
|
@ -10,12 +10,14 @@ from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
from docling.models.base_model import BasePageModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseOcrModel:
|
||||
class BaseOcrModel(BasePageModel):
|
||||
def __init__(self, enabled: bool, options: OcrOptions):
|
||||
self.enabled = enabled
|
||||
self.options = options
|
||||
@ -133,5 +135,7 @@ class BaseOcrModel:
|
||||
image.show()
|
||||
|
||||
@abstractmethod
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
pass
|
||||
|
@ -27,6 +27,7 @@ from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.utils.utils import create_hash
|
||||
|
||||
|
||||
@ -226,6 +227,7 @@ class GlmModel:
|
||||
return ds_doc
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||
with TimeRecorder(conv_res, "glm"):
|
||||
ds_doc = self._to_legacy_document(conv_res)
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
|
@ -1,12 +1,15 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult, ProfilingItem
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -34,17 +37,21 @@ class EasyOcrModel(BaseOcrModel):
|
||||
download_enabled=self.options.download_enabled,
|
||||
)
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
@ -81,7 +88,9 @@ class EasyOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
|
@ -16,8 +16,9 @@ from docling.datamodel.base_models import (
|
||||
LayoutPrediction,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
from docling.utils import layout_utils as lu
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -272,12 +273,16 @@ class LayoutModel(BasePageModel):
|
||||
|
||||
return clusters_out_new, cells_out_new
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "layout"):
|
||||
assert page.size is not None
|
||||
|
||||
clusters = []
|
||||
@ -285,7 +290,10 @@ class LayoutModel(BasePageModel):
|
||||
self.layout_predictor.predict(page.get_image(scale=1.0))
|
||||
):
|
||||
label = DocItemLabel(
|
||||
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
||||
pred_item["label"]
|
||||
.lower()
|
||||
.replace(" ", "_")
|
||||
.replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
@ -330,7 +338,9 @@ class LayoutModel(BasePageModel):
|
||||
)
|
||||
for tc in c.cells: # [:1]:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
draw.rectangle(
|
||||
[(x0, y0), (x1, y1)], outline=cell_color
|
||||
)
|
||||
if show:
|
||||
image.show()
|
||||
|
||||
@ -340,9 +350,9 @@ class LayoutModel(BasePageModel):
|
||||
clusters, page.cells, page.size.height
|
||||
)
|
||||
|
||||
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||
|
||||
if settings.debug.visualize_layout:
|
||||
draw_clusters_and_cells()
|
||||
|
||||
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||
|
||||
yield page
|
||||
|
@ -12,7 +12,8 @@ from docling.datamodel.base_models import (
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
from docling.models.layout_model import LayoutModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -51,12 +52,16 @@ class PageAssembleModel(BasePageModel):
|
||||
|
||||
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "page_assemble"):
|
||||
|
||||
assert page.predictions.layout is not None
|
||||
|
||||
# assembles some JSON output page by page.
|
||||
@ -112,11 +117,9 @@ class PageAssembleModel(BasePageModel):
|
||||
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||
fig = None
|
||||
if page.predictions.figures_classification:
|
||||
fig = (
|
||||
page.predictions.figures_classification.figure_map.get(
|
||||
fig = page.predictions.figures_classification.figure_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
)
|
||||
if (
|
||||
not fig
|
||||
): # fallback: add figure without classification, if it isn't present
|
||||
@ -133,11 +136,9 @@ class PageAssembleModel(BasePageModel):
|
||||
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||
equation = None
|
||||
if page.predictions.equations_prediction:
|
||||
equation = (
|
||||
page.predictions.equations_prediction.equation_map.get(
|
||||
equation = page.predictions.equations_prediction.equation_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
)
|
||||
if (
|
||||
not equation
|
||||
): # fallback: add empty formula, if it isn't present
|
||||
|
@ -4,7 +4,8 @@ from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
@ -15,12 +16,15 @@ class PagePreprocessingModel(BasePageModel):
|
||||
def __init__(self, options: PagePreprocessingOptions):
|
||||
self.options = options
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "page_parse"):
|
||||
page = self._populate_page_images(page)
|
||||
page = self._parse_page_cells(page)
|
||||
yield page
|
||||
|
@ -8,9 +8,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
||||
@ -64,7 +65,9 @@ class TableStructureModel(BasePageModel):
|
||||
|
||||
image.show()
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
@ -75,11 +78,14 @@ class TableStructureModel(BasePageModel):
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "table_structure"):
|
||||
|
||||
assert page.predictions.layout is not None
|
||||
assert page.size is not None
|
||||
|
||||
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
||||
page.predictions.tablestructure = (
|
||||
TableStructurePrediction()
|
||||
) # dummy
|
||||
|
||||
in_tables = [
|
||||
(
|
||||
@ -121,7 +127,9 @@ class TableStructureModel(BasePageModel):
|
||||
"width": page.size.width * self.scale,
|
||||
"height": page.size.height * self.scale,
|
||||
}
|
||||
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
||||
page_input["image"] = numpy.asarray(
|
||||
page.get_image(scale=self.scale)
|
||||
)
|
||||
|
||||
table_clusters, table_bboxes = zip(*in_tables)
|
||||
|
||||
@ -138,7 +146,9 @@ class TableStructureModel(BasePageModel):
|
||||
the_bbox = BoundingBox.model_validate(
|
||||
element["bbox"]
|
||||
).scaled(1 / self.scale)
|
||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||
text_piece = page._backend.get_text_in_rect(
|
||||
the_bbox
|
||||
)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
@ -149,7 +159,9 @@ class TableStructureModel(BasePageModel):
|
||||
# Retrieving cols/rows, after post processing:
|
||||
num_rows = table_out["predict_details"]["num_rows"]
|
||||
num_cols = table_out["predict_details"]["num_cols"]
|
||||
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
||||
otsl_seq = table_out["predict_details"]["prediction"][
|
||||
"rs_seq"
|
||||
]
|
||||
|
||||
tbl = Table(
|
||||
otsl_seq=otsl_seq,
|
||||
@ -162,9 +174,9 @@ class TableStructureModel(BasePageModel):
|
||||
label=DocItemLabel.TABLE,
|
||||
)
|
||||
|
||||
page.predictions.tablestructure.table_map[table_cluster.id] = (
|
||||
tbl
|
||||
)
|
||||
page.predictions.tablestructure.table_map[
|
||||
table_cluster.id
|
||||
] = tbl
|
||||
|
||||
# For debugging purposes:
|
||||
if settings.debug.visualize_tables:
|
||||
|
@ -8,8 +8,10 @@ import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -103,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
|
||||
return df_filtered
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
@ -114,6 +118,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
@ -165,7 +171,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
|
@ -4,8 +4,10 @@ from typing import Iterable
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -62,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
# Finalize the tesseractAPI
|
||||
self.reader.End()
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
@ -73,6 +77,8 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
assert self.reader is not None
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
@ -95,7 +101,9 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
cells = []
|
||||
for ix, (im, box, _, _) in enumerate(boxes):
|
||||
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
||||
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
||||
self.reader.SetRectangle(
|
||||
box["x"], box["y"], box["w"], box["h"]
|
||||
)
|
||||
|
||||
# Extract text within the bounding box
|
||||
text = self.reader.GetUTF8Text().strip()
|
||||
@ -121,7 +129,9 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
|
@ -15,10 +15,15 @@ from docling.datamodel.base_models import (
|
||||
ErrorItem,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
ProfilingItem,
|
||||
ProfilingScope,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.models.base_model import BaseEnrichmentModel, TimeRecorder
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -37,11 +42,11 @@ class BasePipeline(ABC):
|
||||
try:
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
conv_res = self._build_document(in_doc, conv_res)
|
||||
conv_res = self._assemble_document(in_doc, conv_res)
|
||||
conv_res = self._build_document(conv_res)
|
||||
conv_res = self._assemble_document(conv_res)
|
||||
# From this stage, all operations should rely only on conv_res.output
|
||||
conv_res = self._enrich_document(in_doc, conv_res)
|
||||
conv_res.status = self._determine_status(in_doc, conv_res)
|
||||
conv_res = self._enrich_document(conv_res)
|
||||
conv_res.status = self._determine_status(conv_res)
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
if raises_on_error:
|
||||
@ -50,19 +55,13 @@ class BasePipeline(ABC):
|
||||
return conv_res
|
||||
|
||||
@abstractmethod
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
pass
|
||||
|
||||
def _assemble_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
return conv_res
|
||||
|
||||
def _enrich_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
def _filter_elements(
|
||||
doc: DoclingDocument, model: BaseEnrichmentModel
|
||||
@ -71,6 +70,7 @@ class BasePipeline(ABC):
|
||||
if model.is_processable(doc=doc, element=element):
|
||||
yield element
|
||||
|
||||
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
||||
for model in self.enrichment_pipe:
|
||||
for element_batch in chunkify(
|
||||
_filter_elements(conv_res.document, model),
|
||||
@ -86,9 +86,7 @@ class BasePipeline(ABC):
|
||||
return conv_res
|
||||
|
||||
@abstractmethod
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@ -110,40 +108,44 @@ class BasePipeline(ABC):
|
||||
|
||||
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def _apply_on_pages(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
for model in self.build_pipe:
|
||||
page_batch = model(page_batch)
|
||||
page_batch = model(conv_res, page_batch)
|
||||
|
||||
yield from page_batch
|
||||
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
||||
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
||||
f"Can not convert this with a PDF pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
for i in range(0, conv_res.input.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
||||
for page_batch in chunkify(
|
||||
conv_res.pages, settings.perf.page_batch_size
|
||||
):
|
||||
start_pb_time = time.time()
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self.initialize_page, in_doc), page_batch
|
||||
functools.partial(self.initialize_page, conv_res), page_batch
|
||||
)
|
||||
|
||||
# 2. Run pipeline stages
|
||||
pipeline_pages = self._apply_on_pages(init_pages)
|
||||
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
||||
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
pass
|
||||
@ -155,21 +157,19 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.warning(
|
||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
||||
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
||||
f"{trace}"
|
||||
)
|
||||
raise e
|
||||
|
||||
finally:
|
||||
# Always unload the PDF backend, even in case of failure
|
||||
if in_doc._backend:
|
||||
in_doc._backend.unload()
|
||||
if conv_res.input._backend:
|
||||
conv_res.input._backend.unload()
|
||||
|
||||
return conv_res
|
||||
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||
status = ConversionStatus.SUCCESS
|
||||
for page in conv_res.pages:
|
||||
if page._backend is None or not page._backend.is_valid():
|
||||
@ -186,5 +186,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
# Initialise and load resources for a page
|
||||
@abstractmethod
|
||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
pass
|
||||
|
@ -5,8 +5,9 @@ from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
|
||||
def __init__(self, pipeline_options: PipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
||||
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
|
||||
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
||||
f"Can not convert this with simple pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
|
||||
# Instead of running a page-level pipeline to build up the document structure,
|
||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||
# a DoclingDocument straight.
|
||||
|
||||
conv_res.document = in_doc._backend.convert()
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
conv_res.document = conv_res.input._backend.convert()
|
||||
return conv_res
|
||||
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||
# This is called only if the previous steps didn't raise.
|
||||
# Since we don't have anything else to evaluate, we can
|
||||
# safely return SUCCESS.
|
||||
|
@ -7,13 +7,14 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.models.base_model import TimeRecorder
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
@ -119,20 +120,20 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
)
|
||||
return None
|
||||
|
||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
page._backend = doc._backend.load_page(page.page_no) # type: ignore
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
with TimeRecorder(conv_res, "init_page"):
|
||||
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||
if page._backend is not None and page._backend.is_valid():
|
||||
page.size = page._backend.get_size()
|
||||
|
||||
return page
|
||||
|
||||
def _assemble_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
all_elements = []
|
||||
all_headers = []
|
||||
all_body = []
|
||||
|
||||
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
||||
for p in conv_res.pages:
|
||||
if p.assembled is not None:
|
||||
for el in p.assembled.body:
|
||||
@ -185,7 +186,9 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
)
|
||||
|
||||
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
||||
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
|
||||
element.image = ImageRef.from_pil(
|
||||
cropped_im, dpi=int(72 * scale)
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user