Mirror of https://github.com/DS4SD/docling.git, synced 2025-12-08 12:48:28 +00:00
fix: Add missing features in ThreadedStandardPdfPipeline (#2252)
Add missing features in ThreadedStandardPdfPipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
@@ -20,10 +20,14 @@ import itertools
 import logging
 import threading
 import time
+import warnings
 from collections import defaultdict, deque
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Sequence, Tuple
+from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
+
+import numpy as np
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -541,6 +545,86 @@ class ThreadedStandardPdfPipeline(ConvertPipeline):
             elements=elements, headers=headers, body=body
         )
         conv_res.document = self.reading_order_model(conv_res)
+
+        # Generate page images in the output
+        if self.pipeline_options.generate_page_images:
+            for page in conv_res.pages:
+                assert page.image is not None
+                page_no = page.page_no + 1
+                conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                    page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                )
+
+        # Generate images of the requested element types
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            if (
+                self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            ):
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        continue
+                    if (
+                        isinstance(element, PictureItem)
+                        and self.pipeline_options.generate_picture_images
+                    ) or (
+                        isinstance(element, TableItem)
+                        and self.pipeline_options.generate_table_images
+                    ):
+                        page_ix = element.prov[0].page_no - 1
+                        page = next(
+                            (p for p in conv_res.pages if p.page_no == page_ix),
+                            cast("Page", None),
+                        )
+                        assert page is not None
+                        assert page.size is not None
+                        assert page.image is not None
+
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(
+                                page_height=page.size.height * scale
+                            )
+                        )
+
+                        cropped_im = page.image.crop(crop_bbox.as_tuple())
+                        element.image = ImageRef.from_pil(
+                            cropped_im, dpi=int(72 * scale)
+                        )
+
+        # Aggregate confidence values for document:
+        if len(conv_res.pages) > 0:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    category=RuntimeWarning,
+                    message="Mean of empty slice|All-NaN slice encountered",
+                )
+                conv_res.confidence.layout_score = float(
+                    np.nanmean(
+                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.parse_score = float(
+                    np.nanquantile(
+                        [c.parse_score for c in conv_res.confidence.pages.values()],
+                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    )
+                )
+                conv_res.confidence.table_score = float(
+                    np.nanmean(
+                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.ocr_score = float(
+                    np.nanmean(
+                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+
         return conv_res

     # ---------------------------------------------------------------- misc
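A note on the crop arithmetic in the element-image block above: provenance bounding boxes are in unscaled PDF coordinates with a bottom-left origin, while PIL's crop() expects top-left-origin pixel coordinates of the rendered page image, so the box is scaled first and then flipped against the scaled page height. A minimal sketch of the same chain using docling_core's BoundingBox, with invented coordinates:

    from docling_core.types.doc import BoundingBox, CoordOrigin

    # Invented box: 72pt from the left, spanning y=650..700 from the page bottom.
    bbox = BoundingBox(l=72, t=700, r=300, b=650, coord_origin=CoordOrigin.BOTTOMLEFT)
    scale = 2.0        # corresponds to pipeline_options.images_scale
    page_height = 792  # US Letter height in PDF points

    # Same chain the pipeline uses: scale, then convert to top-left origin.
    crop = bbox.scaled(scale=scale).to_top_left_origin(page_height=page_height * scale)
    print(crop.as_tuple())  # (l, t, r, b) in pixels of the 2x-rendered page image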
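The confidence aggregation treats the scores differently on purpose: layout, table, and OCR scores are averaged with np.nanmean, but parse_score uses np.nanquantile with q=0.1 so the document score tracks the worst 10% of pages instead of being washed out by many good ones. A quick illustration with made-up page scores:

    import numpy as np

    page_scores = [0.98, 0.97, np.nan, 0.95, 0.40]   # one badly parsed page; NaN = no signal

    print(float(np.nanmean(page_scores)))             # 0.825 - the bad page barely registers
    print(float(np.nanquantile(page_scores, q=0.1)))  # 0.565 - pulled toward the worst page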
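Taken together, the commit brings the threaded pipeline to parity with StandardPdfPipeline for image generation and confidence reporting. A minimal sketch of exercising the new behavior; the converter wiring follows docling's documented format-options API, but the module path, input file name, and exact option set are assumptions to verify against your docling version:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

    opts = ThreadedPdfPipelineOptions()
    opts.images_scale = 2.0              # render pages at 2x; images stored at 144 dpi
    opts.generate_page_images = True     # full-page images on conv_res.document.pages
    opts.generate_picture_images = True  # cropped images attached to PictureItem elements

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedStandardPdfPipeline,
                pipeline_options=opts,
            )
        }
    )

    conv_res = converter.convert("example.pdf")  # hypothetical input file
    print(conv_res.confidence.layout_score, conv_res.confidence.parse_score)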