fix: Add missing features in ThreadedStandardPdfPipeline (#2252)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Authored by Christoph Auer on 2025-09-11 16:26:02 +02:00; committed by GitHub
parent 2c9123419f
commit 0700af212c

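For context: these additions bring the threaded pipeline's output in line for page images, cropped picture/table images, and document-level confidence scores, all controlled by the existing PdfPipelineOptions flags used in the diff below. A minimal usage sketch follows; the DocumentConverter/PdfFormatOption wiring and the import path of ThreadedStandardPdfPipeline are assumptions about docling's public API, and the file path and option values are illustrative. Two further notes on the crop-box math and the confidence aggregation follow the diff.

# Sketch: enabling the options exercised by this commit with the threaded
# pipeline. The pipeline_cls wiring and the threaded pipeline's import path
# are assumptions; check them against the installed docling version.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_standard_pdf_pipeline import (
    ThreadedStandardPdfPipeline,
)

opts = PdfPipelineOptions(
    generate_page_images=True,     # attach a rendered image to every document page
    generate_picture_images=True,  # crop PictureItem images out of the page image
    images_scale=2.0,              # 2x render scale -> ImageRefs stored at 144 dpi
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedStandardPdfPipeline,
            pipeline_options=opts,
        )
    }
)

result = converter.convert("example.pdf")  # illustrative input path
print(result.confidence)  # aggregated layout/parse/table/ocr scores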

@@ -20,10 +20,14 @@ import itertools
 import logging
 import threading
 import time
+import warnings
 from collections import defaultdict, deque
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Sequence, Tuple
+from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
 
+import numpy as np
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -541,6 +545,86 @@ class ThreadedStandardPdfPipeline(ConvertPipeline):
             elements=elements, headers=headers, body=body
         )
         conv_res.document = self.reading_order_model(conv_res)
+
+        # Generate page images in the output
+        if self.pipeline_options.generate_page_images:
+            for page in conv_res.pages:
+                assert page.image is not None
+                page_no = page.page_no + 1
+                conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                    page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                )
+
+        # Generate images of the requested element types
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            if (
+                self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            ):
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        continue
+                    if (
+                        isinstance(element, PictureItem)
+                        and self.pipeline_options.generate_picture_images
+                    ) or (
+                        isinstance(element, TableItem)
+                        and self.pipeline_options.generate_table_images
+                    ):
+                        page_ix = element.prov[0].page_no - 1
+                        page = next(
+                            (p for p in conv_res.pages if p.page_no == page_ix),
+                            cast("Page", None),
+                        )
+                        assert page is not None
+                        assert page.size is not None
+                        assert page.image is not None
+
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(
+                                page_height=page.size.height * scale
+                            )
+                        )
+
+                        cropped_im = page.image.crop(crop_bbox.as_tuple())
+                        element.image = ImageRef.from_pil(
+                            cropped_im, dpi=int(72 * scale)
+                        )
+
+        # Aggregate confidence values for document:
+        if len(conv_res.pages) > 0:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    category=RuntimeWarning,
+                    message="Mean of empty slice|All-NaN slice encountered",
+                )
+                conv_res.confidence.layout_score = float(
+                    np.nanmean(
+                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.parse_score = float(
+                    np.nanquantile(
+                        [c.parse_score for c in conv_res.confidence.pages.values()],
+                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    )
+                )
+                conv_res.confidence.table_score = float(
+                    np.nanmean(
+                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.ocr_score = float(
+                    np.nanmean(
+                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+
         return conv_res
 
     # ---------------------------------------------------------------- misc
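Note on the element-image step above: the crop is taken from the page image that was rendered at images_scale, so the provenance bounding box (PDF points, bottom-left origin) is first scaled by the same factor and then flipped to a top-left origin before PIL's Image.crop(). A small sketch of that conversion using docling_core's BoundingBox and CoordOrigin (the same types behind element.prov[0].bbox; the coordinates are made up):

from docling_core.types.doc import BoundingBox, CoordOrigin

scale = 2.0          # pipeline_options.images_scale
page_height = 792.0  # page height in PDF points (US Letter)

# Provenance boxes use PDF coordinates with the origin at the bottom-left.
prov_bbox = BoundingBox(
    l=100.0, r=300.0, t=700.0, b=500.0, coord_origin=CoordOrigin.BOTTOMLEFT
)

# 1) scale to the resolution of the rendered page image,
# 2) convert to the top-left origin that PIL's Image.crop() expects.
crop_bbox = prov_bbox.scaled(scale=scale).to_top_left_origin(
    page_height=page_height * scale
)

print(crop_bbox.as_tuple())  # (left, upper, right, lower) box in image pixels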
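Note on the confidence aggregation: layout, table, and OCR scores are document-level nanmeans of the per-page values, while the parse score uses the 10% nanquantile so the document score reflects its worst pages; the RuntimeWarning filter covers the case where every per-page value is NaN ("Mean of empty slice" / "All-NaN slice encountered"). A quick illustration with made-up per-page scores:

import warnings

import numpy as np

page_parse_scores = [0.98, 0.97, 0.95, 0.40, np.nan]  # made-up per-page values

with warnings.catch_warnings():
    # Mirrors the guard above: numpy warns (and yields NaN) if every value is NaN.
    warnings.filterwarnings(
        "ignore",
        category=RuntimeWarning,
        message="Mean of empty slice|All-NaN slice encountered",
    )
    mean_score = float(np.nanmean(page_parse_scores))  # NaN pages are skipped
    worst_decile = float(np.nanquantile(page_parse_scores, q=0.1))

print(round(mean_score, 3))    # 0.825 -- the single bad page barely moves the mean
print(round(worst_decile, 3))  # 0.565 -- the 10% quantile tracks the worst page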