feat: Add pipeline timings and visualization toggles, establish debug settings (#183)

* Add settings to turn visualization on or off

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Refactor and fix profiling code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Visualization code outputs PNGs to debug dir

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Optimize imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Author: Christoph Auer
Date: 2024-10-30 15:04:19 +01:00
Committed by: GitHub
Parent: 94a5290789
Commit: 2a2c65bf4f
23 changed files with 998 additions and 771 deletions

View File

@@ -1,24 +1,20 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell,
TableData,
)
from pydantic import AnyUrl
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat

View File

@@ -1,6 +1,6 @@
 from enum import Enum, auto
 from io import BytesIO
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union

 from docling_core.types.doc import (
     BoundingBox,

View File

@@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union

 import filetype
 from docling_core.types.doc import (
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
     Page,
 )
 from docling.datamodel.settings import DocumentLimits
+from docling.utils.profiling import ProfilingItem
 from docling.utils.utils import create_file_hash, create_hash

 if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
+    timings: Dict[str, ProfilingItem] = {}

     document: DoclingDocument = _EMPTY_DOCLING_DOC
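
Not part of the diff: the new timings field makes the collected profiles part of the conversion result itself. A minimal sketch of reading them back, assuming profiling was switched on beforehand; "example.pdf" stands in for a real input.

    from docling.datamodel.settings import settings
    from docling.document_converter import DocumentConverter

    settings.debug.profile_pipeline_timings = True  # enable collection first

    conv_res = DocumentConverter().convert("example.pdf")  # illustrative input
    for key, item in conv_res.timings.items():
        # Page-scoped items hold one sample per page; document-scoped hold one.
        print(f"{key}: scope={item.scope.value} count={item.count} mean={item.mean():.3f}s")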

View File

@@ -1,4 +1,5 @@
 import sys
+from pathlib import Path

 from pydantic import BaseModel
 from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
     # To force models into single core: export OMP_NUM_THREADS=1


+class DebugSettings(BaseModel):
+    visualize_cells: bool = False
+    visualize_ocr: bool = False
+    visualize_layout: bool = False
+    visualize_tables: bool = False
+
+    profile_pipeline_timings: bool = False
+
+    # Path used to output debug information.
+    debug_output_path: str = str(Path.cwd() / "debug")
+
+
 class AppSettings(BaseSettings):
     perf: BatchConcurrencySettings
+    debug: DebugSettings


-settings = AppSettings(perf=BatchConcurrencySettings())
+settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
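
Not part of the diff: a sketch of flipping these toggles at runtime before a conversion. Field names are the ones introduced above; the singleton import path appears throughout this commit.

    from docling.datamodel.settings import settings

    settings.debug.profile_pipeline_timings = True  # record per-stage timings
    settings.debug.visualize_layout = True          # dump layout overlays as PNGs
    settings.debug.visualize_ocr = True             # dump OCR overlays as PNGs
    settings.debug.debug_output_path = "./debug"    # root folder for debug output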

View File

@@ -189,24 +189,35 @@ class DocumentConverter:
     ) -> Iterator[ConversionResult]:
         assert self.format_to_options is not None

+        start_time = time.monotonic()
+
         for input_batch in chunkify(
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
+
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
             #    max_workers=settings.perf.doc_batch_concurrency
             # ) as pool:
             #   yield from pool.map(self.process_document, input_batch)
             # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+
             for item in map(
                 partial(self._process_document, raises_on_error=raises_on_error),
                 input_batch,
             ):
+                elapsed = time.monotonic() - start_time
+                start_time = time.monotonic()
+
                 if item is not None:
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
                     yield item
+                else:
+                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")

     def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
         assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
         assert self.allowed_formats is not None
         assert in_doc.format in self.allowed_formats

-        start_doc_time = time.time()
         conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
-        )
-
         return conv_res

     def _execute_pipeline(
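
Not part of the diff: a sketch of how the per-document log lines above surface when converting a batch lazily, assuming the standard convert_all entry point and illustrative input paths.

    import logging

    from docling.document_converter import DocumentConverter

    logging.basicConfig(level=logging.INFO)  # show the "Finished converting..." lines

    converter = DocumentConverter()
    for result in converter.convert_all(["a.pdf", "b.pdf"], raises_on_error=False):
        print(result.input.file.name, result.status)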

View File

@@ -4,11 +4,14 @@ from typing import Any, Iterable
 from docling_core.types.doc import DoclingDocument, NodeItem

 from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult


 class BasePageModel(ABC):
     @abstractmethod
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         pass
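
Not part of the diff: every page model in this commit follows the widened interface. A minimal conforming stage (NoOpModel and the "noop" key are illustrative, not from the diff) that times itself against the conversion result:

    from typing import Iterable

    from docling.datamodel.base_models import Page
    from docling.datamodel.document import ConversionResult
    from docling.models.base_model import BasePageModel
    from docling.utils.profiling import TimeRecorder


    class NoOpModel(BasePageModel):
        def __call__(
            self, conv_res: ConversionResult, page_batch: Iterable[Page]
        ) -> Iterable[Page]:
            for page in page_batch:
                with TimeRecorder(conv_res, "noop"):  # key shows up in conv_res.timings
                    pass  # a real model would mutate the page here
                yield page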

View File

@@ -1,6 +1,7 @@
 import copy
 import logging
 from abc import abstractmethod
+from pathlib import Path
 from typing import Iterable, List

 import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
 from scipy.ndimage import find_objects, label

 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel

 _log = logging.getLogger(__name__)


-class BaseOcrModel:
+class BaseOcrModel(BasePageModel):
     def __init__(self, enabled: bool, options: OcrOptions):
         self.enabled = enabled
         self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
         ]
         return filtered_ocr_cells

-    def draw_ocr_rects_and_cells(self, page, ocr_rects):
+    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
         draw = ImageDraw.Draw(image, "RGBA")
@@ -130,8 +134,21 @@ class BaseOcrModel:
             if isinstance(tc, OcrCell):
                 color = "magenta"
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)

-        image.show()
+        if show:
+            image.show()
+        else:
+            out_path: Path = (
+                Path(settings.debug.debug_output_path)
+                / f"debug_{conv_res.input.file.stem}"
+            )
+            out_path.mkdir(parents=True, exist_ok=True)
+
+            out_file = out_path / f"ocr_page_{page.page_no:05}.png"
+            image.save(str(out_file), format="png")

     @abstractmethod
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
        pass
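
Not part of the diff: for orientation, the PNG naming scheme above (mirrored by the cell, layout, and table visualizers later in this commit) resolves like this; the stem and page number are illustrative values.

    from pathlib import Path

    debug_root = Path("debug")   # settings.debug.debug_output_path
    stem, page_no = "sample", 3  # conv_res.input.file.stem, page.page_no
    out_file = debug_root / f"debug_{stem}" / f"ocr_page_{page_no:05}.png"
    print(out_file)  # debug/debug_sample/ocr_page_00003.png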

View File

@@ -1,5 +1,6 @@
 import copy
 import random
+from pathlib import Path
 from typing import List, Union

 from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
+from docling.datamodel.settings import settings
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
@@ -226,23 +229,24 @@ class GlmModel:
return ds_doc
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
clusters_to_draw = []
image = copy.deepcopy(conv_res.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0]
prov = elem.prov[0] # type: ignore
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index)
index = int(index) # type: ignore
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox,
coord=prov.bbox, # type: ignore
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(conv_res.pages[page_no].size.height),
)
@@ -276,9 +280,21 @@ class GlmModel:
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"doc_page_{page_no:05}.png"
image.save(str(out_file), format="png")
# for item in ds_doc.page_dimensions:
# page_no = item.page
# draw_clusters_and_cells(ds_doc, page_no)
return docling_doc

View File

@@ -5,8 +5,11 @@ import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import EasyOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
download_enabled=self.options.download_enabled,
)
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
ocr_rects = self.get_ocr_rects(page)
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)
del high_res_image
del im
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
del high_res_image
del im
page.cells.extend(filtered_ocr_cells)
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page

View File

@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
     LayoutPrediction,
     Page,
 )
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils import layout_utils as lu
+from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
return clusters_out_new, cells_out_new
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
assert page.size is not None
with TimeRecorder(conv_res, "layout"):
assert page.size is not None
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
label = DocItemLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
# Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
if overlap_frac > 0.5:
cluster.cells.append(cell)
# Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
# DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
label = DocItemLabel(
pred_item["label"]
.lower()
.replace(" ", "_")
.replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
clusters.append(cluster)
# draw_clusters_and_cells()
# Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
clusters, page.cells = self.postprocess(
clusters, page.cells, page.size.height
)
if overlap_frac > 0.5:
cluster.cells.append(cell)
# draw_clusters_and_cells()
# Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
page.predictions.layout = LayoutPrediction(clusters=clusters)
# DEBUG code:
def draw_clusters_and_cells(show: bool = False):
image = copy.deepcopy(page.image)
if image is not None:
draw = ImageDraw.Draw(image)
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle(
[(x0, y0), (x1, y1)], outline=cell_color
)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = (
out_path / f"layout_page_{page.page_no:05}.png"
)
image.save(str(out_file), format="png")
# draw_clusters_and_cells()
clusters, page.cells = self.postprocess(
clusters, page.cells, page.size.height
)
page.predictions.layout = LayoutPrediction(clusters=clusters)
if settings.debug.visualize_layout:
draw_clusters_and_cells()
yield page

View File

@@ -12,8 +12,10 @@ from docling.datamodel.base_models import (
     Table,
     TextElement,
 )
+from docling.datamodel.document import ConversionResult
 from docling.models.base_model import BasePageModel
 from docling.models.layout_model import LayoutModel
+from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -51,122 +53,122 @@ class PageAssembleModel(BasePageModel):
return sanitized_text.strip() # Strip any leading or trailing whitespace
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
assert page.predictions.layout is not None
with TimeRecorder(conv_res, "page_assemble"):
# assembles some JSON output page by page.
assert page.predictions.layout is not None
elements: List[PageElement] = []
headers: List[PageElement] = []
body: List[PageElement] = []
# assembles some JSON output page by page.
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
elements: List[PageElement] = []
headers: List[PageElement] = []
body: List[PageElement] = []
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,
id=cluster.id,
text=text,
page_no=page.page_no,
cluster=cluster,
)
elements.append(text_el)
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
headers.append(text_el)
else:
body.append(text_el)
elif cluster.label == LayoutModel.TABLE_LABEL:
tbl = None
if page.predictions.tablestructure:
tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None
)
if (
not tbl
): # fallback: add table without structure, if it isn't present
tbl = Table(
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,
id=cluster.id,
text="",
otsl_seq=[],
table_cells=[],
cluster=cluster,
page_no=page.page_no,
)
elements.append(tbl)
body.append(tbl)
elif cluster.label == LayoutModel.FIGURE_LABEL:
fig = None
if page.predictions.figures_classification:
fig = (
page.predictions.figures_classification.figure_map.get(
cluster.id, None
)
)
if (
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,
text="",
data=None,
cluster=cluster,
page_no=page.page_no,
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = (
page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
)
if (
not equation
): # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
page_no=page.page_no,
cluster=cluster,
)
elements.append(equation)
body.append(equation)
elements.append(text_el)
page.assembled = AssembledUnit(
elements=elements, headers=headers, body=body
)
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
headers.append(text_el)
else:
body.append(text_el)
elif cluster.label == LayoutModel.TABLE_LABEL:
tbl = None
if page.predictions.tablestructure:
tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None
)
if (
not tbl
): # fallback: add table without structure, if it isn't present
tbl = Table(
label=cluster.label,
id=cluster.id,
text="",
otsl_seq=[],
table_cells=[],
cluster=cluster,
page_no=page.page_no,
)
# Remove page images (can be disabled)
if not self.options.keep_images:
page._image_cache = {}
elements.append(tbl)
body.append(tbl)
elif cluster.label == LayoutModel.FIGURE_LABEL:
fig = None
if page.predictions.figures_classification:
fig = page.predictions.figures_classification.figure_map.get(
cluster.id, None
)
if (
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,
text="",
data=None,
cluster=cluster,
page_no=page.page_no,
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
if (
not equation
): # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
# Unload backend
page._backend.unload()
page.assembled = AssembledUnit(
elements=elements, headers=headers, body=body
)
# Remove page images (can be disabled)
if not self.options.keep_images:
page._image_cache = {}
# Unload backend
page._backend.unload()
yield page

View File

@@ -1,10 +1,14 @@
+from pathlib import Path
 from typing import Iterable, Optional

 from PIL import ImageDraw
 from pydantic import BaseModel

 from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder


 class PagePreprocessingOptions(BaseModel):
@@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options

-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
-                page = self._populate_page_images(page)
-                page = self._parse_page_cells(page)
+                with TimeRecorder(conv_res, "page_parse"):
+                    page = self._populate_page_images(page)
+                    page = self._parse_page_cells(conv_res, page)
                 yield page

     # Generate the page image and store it in the page object
@@ -43,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
         return page

     # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, page: Page) -> Page:
+    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
         assert page._backend is not None
         page.cells = list(page._backend.get_text_cells())

         # DEBUG code:
-        def draw_text_boxes(image, cells):
+        def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
+            if show:
+                image.show()
+            else:
+                out_path: Path = (
+                    Path(settings.debug.debug_output_path)
+                    / f"debug_{conv_res.input.file.stem}"
+                )
+                out_path.mkdir(parents=True, exist_ok=True)
+                out_file = out_path / f"cells_page_{page.page_no:05}.png"
+                image.save(str(out_file), format="png")

-        # draw_text_boxes(page.get_image(scale=1.0), cells)
+        if settings.debug.visualize_cells:
+            draw_text_boxes(page.get_image(scale=1.0), page.cells)

         return page

View File

@@ -1,6 +1,6 @@
 import copy
+from pathlib import Path
-from typing import Iterable, List
+from typing import Iterable

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
 from PIL import ImageDraw

 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder


 class TableStructureModel(BasePageModel):
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
self.tf_predictor = TFPredictor(self.tm_config)
self.scale = 2.0 # Scale up table input images to 144 dpi
def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
def draw_table_and_cells(
self,
conv_res: ConversionResult,
page: Page,
tbl_list: Iterable[Table],
show: bool = False,
):
assert page._backend is not None
image = (
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
fill="black",
)
image.show()
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "table_structure"):
assert page.predictions.layout is not None
assert page.size is not None
assert page.predictions.layout is not None
assert page.size is not None
page.predictions.tablestructure = TableStructurePrediction() # dummy
page.predictions.tablestructure = (
TableStructurePrediction()
) # dummy
in_tables = [
(
cluster,
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
in_tables = [
(
cluster,
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
for cluster in page.predictions.layout.clusters
if cluster.label == DocItemLabel.TABLE
]
if not len(in_tables):
yield page
continue
tokens = []
for c in page.cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox)
/ c.bbox.area()
> 0.2
):
# Only allow non empty stings (spaces) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(
scale=self.scale
)
tokens.append(new_cell.model_dump())
page_input = {
"tokens": tokens,
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
}
page_input["image"] = numpy.asarray(
page.get_image(scale=self.scale)
)
for cluster in page.predictions.layout.clusters
if cluster.label == DocItemLabel.TABLE
]
if not len(in_tables):
yield page
continue
tokens = []
for c in page.cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox)
/ c.bbox.area()
> 0.2
):
# Only allow non empty stings (spaces) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(
scale=self.scale
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
tf_output = self.tf_predictor.multi_table_predict(
page_input, table_bboxes, do_matching=self.do_cell_matching
)
for table_cluster, table_out in zip(table_clusters, tf_output):
table_cells = []
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(
element["bbox"]
).scaled(1 / self.scale)
text_piece = page._backend.get_text_in_rect(
the_bbox
)
element["bbox"]["token"] = text_piece
tokens.append(new_cell.model_dump())
tc = TableCell.model_validate(element)
if self.do_cell_matching and tc.bbox is not None:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
page_input = {
"tokens": tokens,
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
}
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"][
"rs_seq"
]
table_clusters, table_bboxes = zip(*in_tables)
tbl = Table(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
num_cols=num_cols,
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label=DocItemLabel.TABLE,
)
if len(table_bboxes):
tf_output = self.tf_predictor.multi_table_predict(
page_input, table_bboxes, do_matching=self.do_cell_matching
)
for table_cluster, table_out in zip(table_clusters, tf_output):
table_cells = []
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(
element["bbox"]
).scaled(1 / self.scale)
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
if self.do_cell_matching and tc.bbox is not None:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
tbl = Table(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
num_cols=num_cols,
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label=DocItemLabel.TABLE,
)
page.predictions.tablestructure.table_map[table_cluster.id] = (
tbl
)
page.predictions.tablestructure.table_map[
table_cluster.id
] = tbl
# For debugging purposes:
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
if settings.debug.visualize_tables:
self.draw_table_and_cells(
conv_res,
page,
page.predictions.tablestructure.table_map.values(),
)
yield page

View File

@@ -8,8 +8,11 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
return df_filtered
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
if not page._backend.is_valid():
yield page
else:
ocr_rects = self.get_ocr_rects(page)
with TimeRecorder(conv_res, "ocr"):
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
with tempfile.NamedTemporaryFile(
suffix=".png", mode="w"
) as image_file:
fname = image_file.name
high_res_image.save(fname)
df = self._run_tesseract(fname)
# _log.info(df)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
with tempfile.NamedTemporaryFile(
suffix=".png", mode="w"
) as image_file:
fname = image_file.name
high_res_image.save(fname)
df = self._run_tesseract(fname)
# _log.info(df)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
page.cells.extend(filtered_ocr_cells)
page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page

View File

@@ -4,8 +4,11 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -61,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
# Finalize the tesseractAPI
self.reader.End()
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
@@ -72,59 +77,66 @@ class TesseractOcrModel(BaseOcrModel):
if not page._backend.is_valid():
yield page
else:
assert self.reader is not None
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)
assert self.reader is not None
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
ocr_rects = self.get_ocr_rects(page)
# Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True
)
cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
# Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
cells.append(
OcrCell(
id=ix,
text=text,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
)
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
# del high_res_image
all_ocr_cells.extend(cells)
# Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True
)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(
box["x"], box["y"], box["w"], box["h"]
)
page.cells.extend(filtered_ocr_cells)
# Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
cells.append(
OcrCell(
id=ix,
text=text,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
)
)
# del high_res_image
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page

View File

@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
_log.info(f"Processing document {in_doc.file.name}")
try:
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(in_doc, conv_res)
conv_res = self._assemble_document(in_doc, conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(in_doc, conv_res)
conv_res.status = self._determine_status(in_doc, conv_res)
with TimeRecorder(
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
):
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(conv_res)
conv_res = self._assemble_document(conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(conv_res)
conv_res.status = self._determine_status(conv_res)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
return conv_res
@abstractmethod
def _build_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
pass
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
return conv_res
def _enrich_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
def _filter_elements(
doc: DoclingDocument, model: BaseEnrichmentModel
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
if model.is_processable(doc=doc, element=element):
yield element
for model in self.enrichment_pipe:
for element_batch in chunkify(
_filter_elements(conv_res.document, model),
settings.perf.elements_batch_size,
):
# TODO: currently we assume the element itself is modified, because
# we don't have an interface to save the element back to the document
for element in model(
doc=conv_res.document, element_batch=element_batch
): # Must exhaust!
pass
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
for model in self.enrichment_pipe:
for element_batch in chunkify(
_filter_elements(conv_res.document, model),
settings.perf.elements_batch_size,
):
# TODO: currently we assume the element itself is modified, because
# we don't have an interface to save the element back to the document
for element in model(
doc=conv_res.document, element_batch=element_batch
): # Must exhaust!
pass
return conv_res
@abstractmethod
def _determine_status(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionStatus:
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
pass
@classmethod
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def _apply_on_pages(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for model in self.build_pipe:
page_batch = model(page_batch)
page_batch = model(conv_res, page_batch)
yield from page_batch
def _build_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
if not isinstance(in_doc._backend, PdfDocumentBackend):
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
f"Can not convert this with a PDF pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
for i in range(0, conv_res.input.page_count):
conv_res.pages.append(Page(page_no=i))
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(
conv_res.pages, settings.perf.page_batch_size
):
start_pb_time = time.time()
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, conv_res), page_batch
)
# 2. Run pipeline stages
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
for p in pipeline_pages: # Must exhaust!
pass
end_pb_time = time.time() - start_pb_time
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.warning(
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
f"{trace}"
)
raise e
# 2. Run pipeline stages
pipeline_pages = self._apply_on_pages(init_pages)
for p in pipeline_pages: # Must exhaust!
pass
end_pb_time = time.time() - start_pb_time
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.warning(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
raise e
finally:
# Always unload the PDF backend, even in case of failure
if in_doc._backend:
in_doc._backend.unload()
finally:
# Always unload the PDF backend, even in case of failure
if conv_res.input._backend:
conv_res.input._backend.unload()
return conv_res
def _determine_status(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionStatus:
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if page._backend is None or not page._backend.is_valid():
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
# Initialise and load resources for a page
@abstractmethod
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
pass
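
Not part of the diff: taken together, the TimeRecorder calls across this commit introduce the following timing keys, collected here for reference and grouped by scope.

    # Timing keys introduced in this commit, grouped by ProfilingScope.
    PAGE_SCOPED = ["page_init", "page_parse", "ocr", "layout", "table_structure", "page_assemble"]
    DOCUMENT_SCOPED = ["doc_build", "doc_assemble", "doc_enrich", "glm", "pipeline_total"]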

View File

@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
     DeclarativeDocumentBackend,
 )
 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)
@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)

-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:

-        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+        if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
             raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
                 f"Can not convert this with simple pipeline. "
                 f"Please check your format configuration on DocumentConverter."
             )
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
         # Instead of running a page-level pipeline to build up the document structure,
         # the backend is expected to be of type DeclarativeDocumentBackend, which can output
         # a DoclingDocument straight.
-        conv_res.document = in_doc._backend.convert()
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            conv_res.document = conv_res.input._backend.convert()

         return conv_res

-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         # This is called only if the previous steps didn't raise.
         # Since we don't have anything else to evaluate, we can
         # safely return SUCCESS.

View File

@@ -7,7 +7,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import AssembledUnit, Page
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     PdfPipelineOptions,
@@ -27,6 +27,7 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder

 _log = logging.getLogger(__name__)
@@ -119,73 +120,75 @@ class StandardPdfPipeline(PaginatedPipeline):
)
return None
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
return page
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
if p.assembled is not None:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
for p in conv_res.pages:
if p.assembled is not None:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.document = self.glm_model(conv_res)
conv_res.document = self.glm_model(conv_res)
# Generate page images in the output
if self.pipeline_options.generate_page_images:
for page in conv_res.pages:
assert page.image is not None
page_no = page.page_no + 1
conv_res.document.pages[page_no].image = ImageRef.from_pil(
page.image, dpi=int(72 * self.pipeline_options.images_scale)
)
# Generate images of the requested element types
if (
self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
) or (
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
assert page.size is not None
# Generate page images in the output
if self.pipeline_options.generate_page_images:
for page in conv_res.pages:
assert page.image is not None
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
page_no = page.page_no + 1
conv_res.document.pages[page_no].image = ImageRef.from_pil(
page.image, dpi=int(72 * self.pipeline_options.images_scale)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
# Generate images of the requested element types
if (
self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
) or (
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
assert page.size is not None
assert page.image is not None
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
cropped_im, dpi=int(72 * scale)
)
return conv_res
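
Not part of the diff: the image-generation branches above are driven by options on PdfPipelineOptions (imported earlier in this file). A configuration sketch; the 2.0 scale is an illustrative value.

    from docling.datamodel.pipeline_options import PdfPipelineOptions

    opts = PdfPipelineOptions()
    opts.images_scale = 2.0              # rendered at 72 * images_scale dpi
    opts.generate_page_images = True     # attach a full image per page
    opts.generate_picture_images = True  # crop out each PictureItem
    opts.generate_table_images = True    # crop out each TableItem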

View File

@@ -0,0 +1,62 @@
+import time
+from datetime import datetime
+from enum import Enum
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+from pydantic import BaseModel
+
+from docling.datamodel.settings import settings
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import ConversionResult
+
+
+class ProfilingScope(str, Enum):
+    PAGE = "page"
+    DOCUMENT = "document"
+
+
+class ProfilingItem(BaseModel):
+    scope: ProfilingScope
+    count: int = 0
+    times: List[float] = []
+    start_timestamps: List[datetime] = []
+
+    def avg(self) -> float:
+        return np.average(self.times)  # type: ignore
+
+    def std(self) -> float:
+        return np.std(self.times)  # type: ignore
+
+    def mean(self) -> float:
+        return np.mean(self.times)  # type: ignore
+
+    def percentile(self, perc: float) -> float:
+        return np.percentile(self.times, perc)  # type: ignore
+
+
+class TimeRecorder:
+    def __init__(
+        self,
+        conv_res: "ConversionResult",
+        key: str,
+        scope: ProfilingScope = ProfilingScope.PAGE,
+    ):
+        if settings.debug.profile_pipeline_timings:
+            if key not in conv_res.timings.keys():
+                conv_res.timings[key] = ProfilingItem(scope=scope)
+            self.conv_res = conv_res
+            self.key = key
+
+    def __enter__(self):
+        if settings.debug.profile_pipeline_timings:
+            self.start = time.monotonic()
+            self.conv_res.timings[self.key].start_timestamps.append(datetime.utcnow())
+        return self
+
+    def __exit__(self, *args):
+        if settings.debug.profile_pipeline_timings:
+            elapsed = time.monotonic() - self.start
+            self.conv_res.timings[self.key].times.append(elapsed)
+            self.conv_res.timings[self.key].count += 1
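
Not part of the diff: the intended usage pattern, mirroring the stage code earlier in this commit; conv_res is the ConversionResult threaded through each stage, and the recorder is a no-op unless profiling is enabled in the debug settings.

    from docling.utils.profiling import ProfilingScope, TimeRecorder

    def run_stage(conv_res):
        # Page scope (the default): one sample appended per page processed.
        with TimeRecorder(conv_res, "ocr"):
            pass  # per-page work goes here
        # Document scope: a single sample for the whole document.
        with TimeRecorder(conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT):
            pass  # whole-document work goes here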