From 0814f32ae428912b11fb63ca7181df5a357f8521 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 28 Oct 2024 15:04:09 +0100 Subject: [PATCH] Add profiling code to all models Signed-off-by: Christoph Auer --- docling/datamodel/document.py | 25 +++ docling/datamodel/settings.py | 2 + docling/models/base_model.py | 34 +++- docling/models/base_ocr_model.py | 8 +- docling/models/ds_glm_model.py | 10 +- docling/models/easyocr_model.py | 81 ++++---- docling/models/layout_model.py | 134 +++++++------- docling/models/page_assemble_model.py | 203 +++++++++++---------- docling/models/page_preprocessing_model.py | 12 +- docling/models/table_structure_model.py | 176 +++++++++--------- docling/models/tesseract_ocr_cli_model.py | 112 ++++++------ docling/models/tesseract_ocr_model.py | 100 +++++----- docling/pipeline/base_pipeline.py | 142 +++++++------- docling/pipeline/simple_pipeline.py | 19 +- docling/pipeline/standard_pdf_pipeline.py | 113 ++++++------ 15 files changed, 644 insertions(+), 527 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 41a8af35..8f7bc842 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -6,6 +6,7 @@ from pathlib import Path, PurePath from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union import filetype +import numpy as np from docling_core.types.doc import ( DocItem, DocItemLabel, @@ -179,6 +180,29 @@ class DocumentFormat(str, Enum): V1 = "v1" +class ProfilingScope(str, Enum): + PAGE = "page" + DOCUMENT = "document" + + +class ProfilingItem(BaseModel): + scope: ProfilingScope + count: int = 0 + times: List[float] = [] + + def avg(self) -> float: + return np.average(self.times) # type: ignore + + def std(self) -> float: + return np.std(self.times) # type: ignore + + def mean(self) -> float: + return np.mean(self.times) # type: ignore + + def percentile(self, perc: float) -> float: + return np.percentile(self.times, perc) # type: ignore + + class ConversionResult(BaseModel): input: InputDocument @@ -187,6 +211,7 @@ class ConversionResult(BaseModel): pages: List[Page] = [] assembled: AssembledUnit = AssembledUnit() + timings: Dict[str, ProfilingItem] = {} document: DoclingDocument = _EMPTY_DOCLING_DOC diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index 582d6dc2..ce1c5faf 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -32,6 +32,8 @@ class DebugSettings(BaseModel): visualize_layout: bool = False visualize_tables: bool = False + profile_pipeline_timings: bool = False + class AppSettings(BaseSettings): perf: BatchConcurrencySettings diff --git a/docling/models/base_model.py b/docling/models/base_model.py index dffad502..e7bcc7f2 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -1,14 +1,19 @@ +import time from abc import ABC, abstractmethod -from typing import Any, Iterable +from typing import Any, Callable, Iterable, Type from docling_core.types.doc import DoclingDocument, NodeItem from docling.datamodel.base_models import Page +from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope +from docling.datamodel.settings import settings class BasePageModel(ABC): @abstractmethod - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: pass @@ -23,3 +28,28 @@ class BaseEnrichmentModel(ABC): self, doc: DoclingDocument, element_batch: Iterable[NodeItem] ) -> Iterable[Any]: pass + + +class TimeRecorder: + def __init__( + self, + conv_res: ConversionResult, + key: str, + scope: ProfilingScope = ProfilingScope.PAGE, + ): + if settings.debug.profile_pipeline_timings: + if key not in conv_res.timings.keys(): + conv_res.timings[key] = ProfilingItem(scope=scope) + self.conv_res = conv_res + self.key = key + + def __enter__(self): + if settings.debug.profile_pipeline_timings: + self.start = time.monotonic() + return self + + def __exit__(self, *args): + if settings.debug.profile_pipeline_timings: + elapsed = time.monotonic() - self.start + self.conv_res.timings[self.key].times.append(elapsed) + self.conv_res.timings[self.key].count += 1 diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index da6860a8..5799aa63 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -10,12 +10,14 @@ from rtree import index from scipy.ndimage import find_objects, label from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import OcrOptions +from docling.models.base_model import BasePageModel _log = logging.getLogger(__name__) -class BaseOcrModel: +class BaseOcrModel(BasePageModel): def __init__(self, enabled: bool, options: OcrOptions): self.enabled = enabled self.options = options @@ -133,5 +135,7 @@ class BaseOcrModel: image.show() @abstractmethod - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: pass diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 2f7078d3..42b84bd2 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -27,6 +27,7 @@ from pydantic import BaseModel, ConfigDict from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement from docling.datamodel.document import ConversionResult, layout_label_to_ds_type +from docling.models.base_model import TimeRecorder from docling.utils.utils import create_hash @@ -226,12 +227,13 @@ class GlmModel: return ds_doc def __call__(self, conv_res: ConversionResult) -> DoclingDocument: - ds_doc = self._to_legacy_document(conv_res) - ds_doc_dict = ds_doc.model_dump(by_alias=True) + with TimeRecorder(conv_res, "glm"): + ds_doc = self._to_legacy_document(conv_res) + ds_doc_dict = ds_doc.model_dump(by_alias=True) - glm_doc = self.model.apply_on_doc(ds_doc_dict) + glm_doc = self.model.apply_on_doc(ds_doc_dict) - docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental + docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental # DEBUG code: def draw_clusters_and_cells(ds_document, page_no): diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index dfabcca9..ff4954b6 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -1,12 +1,15 @@ import logging +import time from typing import Iterable import numpy from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult, ProfilingItem from docling.datamodel.pipeline_options import EasyOcrOptions from docling.datamodel.settings import settings +from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) @@ -34,56 +37,62 @@ class EasyOcrModel(BaseOcrModel): download_enabled=self.options.download_enabled, ) - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch return for page in page_batch: + assert page._backend is not None if not page._backend.is_valid(): yield page else: - ocr_rects = self.get_ocr_rects(page) + with TimeRecorder(conv_res, "ocr"): + ocr_rects = self.get_ocr_rects(page) - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - result = self.reader.readtext(im) - - del high_res_image - del im - - cells = [ - OcrCell( - id=ix, - text=line[1], - confidence=line[2], - bbox=BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ), + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) + im = numpy.array(high_res_image) + result = self.reader.readtext(im) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + del high_res_image + del im - page.cells.extend(filtered_ocr_cells) + cells = [ + OcrCell( + id=ix, + text=line[1], + confidence=line[2], + bbox=BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + for ix, line in enumerate(result) + ] + all_ocr_cells.extend(cells) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) + + page.cells.extend(filtered_ocr_cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 81d89e4f..b885dc50 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -16,8 +16,9 @@ from docling.datamodel.base_models import ( LayoutPrediction, Page, ) +from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel +from docling.models.base_model import BasePageModel, TimeRecorder from docling.utils import layout_utils as lu _log = logging.getLogger(__name__) @@ -272,77 +273,86 @@ class LayoutModel(BasePageModel): return clusters_out_new, cells_out_new - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): yield page else: - assert page.size is not None + with TimeRecorder(conv_res, "layout"): + assert page.size is not None - clusters = [] - for ix, pred_item in enumerate( - self.layout_predictor.predict(page.get_image(scale=1.0)) - ): - label = DocItemLabel( - pred_item["label"].lower().replace(" ", "_").replace("-", "_") - ) # Temporary, until docling-ibm-model uses docling-core types - cluster = Cluster( - id=ix, - label=label, - confidence=pred_item["confidence"], - bbox=BoundingBox.model_validate(pred_item), - cells=[], + clusters = [] + for ix, pred_item in enumerate( + self.layout_predictor.predict(page.get_image(scale=1.0)) + ): + label = DocItemLabel( + pred_item["label"] + .lower() + .replace(" ", "_") + .replace("-", "_") + ) # Temporary, until docling-ibm-model uses docling-core types + cluster = Cluster( + id=ix, + label=label, + confidence=pred_item["confidence"], + bbox=BoundingBox.model_validate(pred_item), + cells=[], + ) + clusters.append(cluster) + + # Map cells to clusters + # TODO: Remove, postprocess should take care of it anyway. + for cell in page.cells: + for cluster in clusters: + if not cell.bbox.area() > 0: + overlap_frac = 0.0 + else: + overlap_frac = ( + cell.bbox.intersection_area_with(cluster.bbox) + / cell.bbox.area() + ) + + if overlap_frac > 0.5: + cluster.cells.append(cell) + + # Pre-sort clusters + # clusters = self.sort_clusters_by_cell_order(clusters) + + # DEBUG code: + def draw_clusters_and_cells(show: bool = True): + image = copy.deepcopy(page.image) + if image is not None: + draw = ImageDraw.Draw(image) + for c in clusters: + x0, y0, x1, y1 = c.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="green") + + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + for tc in c.cells: # [:1]: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle( + [(x0, y0), (x1, y1)], outline=cell_color + ) + if show: + image.show() + + # draw_clusters_and_cells() + + clusters, page.cells = self.postprocess( + clusters, page.cells, page.size.height ) - clusters.append(cluster) - # Map cells to clusters - # TODO: Remove, postprocess should take care of it anyway. - for cell in page.cells: - for cluster in clusters: - if not cell.bbox.area() > 0: - overlap_frac = 0.0 - else: - overlap_frac = ( - cell.bbox.intersection_area_with(cluster.bbox) - / cell.bbox.area() - ) - - if overlap_frac > 0.5: - cluster.cells.append(cell) - - # Pre-sort clusters - # clusters = self.sort_clusters_by_cell_order(clusters) - - # DEBUG code: - def draw_clusters_and_cells(show: bool = True): - image = copy.deepcopy(page.image) - if image is not None: - draw = ImageDraw.Draw(image) - for c in clusters: - x0, y0, x1, y1 = c.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline="green") - - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), - ) - for tc in c.cells: # [:1]: - x0, y0, x1, y1 = tc.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - if show: - image.show() - - # draw_clusters_and_cells() - - clusters, page.cells = self.postprocess( - clusters, page.cells, page.size.height - ) + page.predictions.layout = LayoutPrediction(clusters=clusters) if settings.debug.visualize_layout: draw_clusters_and_cells() - page.predictions.layout = LayoutPrediction(clusters=clusters) - yield page diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index caf168cc..c3be7126 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -12,7 +12,8 @@ from docling.datamodel.base_models import ( Table, TextElement, ) -from docling.models.base_model import BasePageModel +from docling.datamodel.document import ConversionResult +from docling.models.base_model import BasePageModel, TimeRecorder from docling.models.layout_model import LayoutModel _log = logging.getLogger(__name__) @@ -51,122 +52,122 @@ class PageAssembleModel(BasePageModel): return sanitized_text.strip() # Strip any leading or trailing whitespace - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): yield page else: - assert page.predictions.layout is not None + with TimeRecorder(conv_res, "page_assemble"): - # assembles some JSON output page by page. + assert page.predictions.layout is not None - elements: List[PageElement] = [] - headers: List[PageElement] = [] - body: List[PageElement] = [] + # assembles some JSON output page by page. - for cluster in page.predictions.layout.clusters: - # _log.info("Cluster label seen:", cluster.label) - if cluster.label in LayoutModel.TEXT_ELEM_LABELS: + elements: List[PageElement] = [] + headers: List[PageElement] = [] + body: List[PageElement] = [] - textlines = [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] - text = self.sanitize_text(textlines) - text_el = TextElement( - label=cluster.label, - id=cluster.id, - text=text, - page_no=page.page_no, - cluster=cluster, - ) - elements.append(text_el) + for cluster in page.predictions.layout.clusters: + # _log.info("Cluster label seen:", cluster.label) + if cluster.label in LayoutModel.TEXT_ELEM_LABELS: - if cluster.label in LayoutModel.PAGE_HEADER_LABELS: - headers.append(text_el) - else: - body.append(text_el) - elif cluster.label == LayoutModel.TABLE_LABEL: - tbl = None - if page.predictions.tablestructure: - tbl = page.predictions.tablestructure.table_map.get( - cluster.id, None - ) - if ( - not tbl - ): # fallback: add table without structure, if it isn't present - tbl = Table( + textlines = [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] + text = self.sanitize_text(textlines) + text_el = TextElement( label=cluster.label, id=cluster.id, - text="", - otsl_seq=[], - table_cells=[], - cluster=cluster, - page_no=page.page_no, - ) - - elements.append(tbl) - body.append(tbl) - elif cluster.label == LayoutModel.FIGURE_LABEL: - fig = None - if page.predictions.figures_classification: - fig = ( - page.predictions.figures_classification.figure_map.get( - cluster.id, None - ) - ) - if ( - not fig - ): # fallback: add figure without classification, if it isn't present - fig = FigureElement( - label=cluster.label, - id=cluster.id, - text="", - data=None, - cluster=cluster, - page_no=page.page_no, - ) - elements.append(fig) - body.append(fig) - elif cluster.label == LayoutModel.FORMULA_LABEL: - equation = None - if page.predictions.equations_prediction: - equation = ( - page.predictions.equations_prediction.equation_map.get( - cluster.id, None - ) - ) - if ( - not equation - ): # fallback: add empty formula, if it isn't present - text = self.sanitize_text( - [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] - ) - equation = TextElement( - label=cluster.label, - id=cluster.id, - cluster=cluster, - page_no=page.page_no, text=text, + page_no=page.page_no, + cluster=cluster, ) - elements.append(equation) - body.append(equation) + elements.append(text_el) - page.assembled = AssembledUnit( - elements=elements, headers=headers, body=body - ) + if cluster.label in LayoutModel.PAGE_HEADER_LABELS: + headers.append(text_el) + else: + body.append(text_el) + elif cluster.label == LayoutModel.TABLE_LABEL: + tbl = None + if page.predictions.tablestructure: + tbl = page.predictions.tablestructure.table_map.get( + cluster.id, None + ) + if ( + not tbl + ): # fallback: add table without structure, if it isn't present + tbl = Table( + label=cluster.label, + id=cluster.id, + text="", + otsl_seq=[], + table_cells=[], + cluster=cluster, + page_no=page.page_no, + ) - # Remove page images (can be disabled) - if not self.options.keep_images: - page._image_cache = {} + elements.append(tbl) + body.append(tbl) + elif cluster.label == LayoutModel.FIGURE_LABEL: + fig = None + if page.predictions.figures_classification: + fig = page.predictions.figures_classification.figure_map.get( + cluster.id, None + ) + if ( + not fig + ): # fallback: add figure without classification, if it isn't present + fig = FigureElement( + label=cluster.label, + id=cluster.id, + text="", + data=None, + cluster=cluster, + page_no=page.page_no, + ) + elements.append(fig) + body.append(fig) + elif cluster.label == LayoutModel.FORMULA_LABEL: + equation = None + if page.predictions.equations_prediction: + equation = page.predictions.equations_prediction.equation_map.get( + cluster.id, None + ) + if ( + not equation + ): # fallback: add empty formula, if it isn't present + text = self.sanitize_text( + [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] + ) + equation = TextElement( + label=cluster.label, + id=cluster.id, + cluster=cluster, + page_no=page.page_no, + text=text, + ) + elements.append(equation) + body.append(equation) - # Unload backend - page._backend.unload() + page.assembled = AssembledUnit( + elements=elements, headers=headers, body=body + ) + + # Remove page images (can be disabled) + if not self.options.keep_images: + page._image_cache = {} + + # Unload backend + page._backend.unload() yield page diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 1e0032c1..adb20fc3 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -4,7 +4,8 @@ from PIL import ImageDraw from pydantic import BaseModel from docling.datamodel.base_models import Page -from docling.models.base_model import BasePageModel +from docling.datamodel.document import ConversionResult +from docling.models.base_model import BasePageModel, TimeRecorder class PagePreprocessingOptions(BaseModel): @@ -15,14 +16,17 @@ class PagePreprocessingModel(BasePageModel): def __init__(self, options: PagePreprocessingOptions): self.options = options - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): yield page else: - page = self._populate_page_images(page) - page = self._parse_page_cells(page) + with TimeRecorder(conv_res, "page_parse"): + page = self._populate_page_images(page) + page = self._parse_page_cells(page) yield page # Generate the page image and store it in the page object diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 6d75aab4..95aa6b5c 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -8,9 +8,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic from PIL import ImageDraw from docling.datamodel.base_models import Page, Table, TableStructurePrediction +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel +from docling.models.base_model import BasePageModel, TimeRecorder class TableStructureModel(BasePageModel): @@ -64,7 +65,9 @@ class TableStructureModel(BasePageModel): image.show() - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch @@ -75,96 +78,105 @@ class TableStructureModel(BasePageModel): if not page._backend.is_valid(): yield page else: + with TimeRecorder(conv_res, "table_structure"): - assert page.predictions.layout is not None - assert page.size is not None + assert page.predictions.layout is not None + assert page.size is not None - page.predictions.tablestructure = TableStructurePrediction() # dummy + page.predictions.tablestructure = ( + TableStructurePrediction() + ) # dummy - in_tables = [ - ( - cluster, - [ - round(cluster.bbox.l) * self.scale, - round(cluster.bbox.t) * self.scale, - round(cluster.bbox.r) * self.scale, - round(cluster.bbox.b) * self.scale, - ], + in_tables = [ + ( + cluster, + [ + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, + ], + ) + for cluster in page.predictions.layout.clusters + if cluster.label == DocItemLabel.TABLE + ] + if not len(in_tables): + yield page + continue + + tokens = [] + for c in page.cells: + for cluster, _ in in_tables: + if c.bbox.area() > 0: + if ( + c.bbox.intersection_area_with(cluster.bbox) + / c.bbox.area() + > 0.2 + ): + # Only allow non empty stings (spaces) into the cells of a table + if len(c.text.strip()) > 0: + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled( + scale=self.scale + ) + + tokens.append(new_cell.model_dump()) + + page_input = { + "tokens": tokens, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, + } + page_input["image"] = numpy.asarray( + page.get_image(scale=self.scale) ) - for cluster in page.predictions.layout.clusters - if cluster.label == DocItemLabel.TABLE - ] - if not len(in_tables): - yield page - continue - tokens = [] - for c in page.cells: - for cluster, _ in in_tables: - if c.bbox.area() > 0: - if ( - c.bbox.intersection_area_with(cluster.bbox) - / c.bbox.area() - > 0.2 - ): - # Only allow non empty stings (spaces) into the cells of a table - if len(c.text.strip()) > 0: - new_cell = copy.deepcopy(c) - new_cell.bbox = new_cell.bbox.scaled( - scale=self.scale + table_clusters, table_bboxes = zip(*in_tables) + + if len(table_bboxes): + tf_output = self.tf_predictor.multi_table_predict( + page_input, table_bboxes, do_matching=self.do_cell_matching + ) + + for table_cluster, table_out in zip(table_clusters, tf_output): + table_cells = [] + for element in table_out["tf_responses"]: + + if not self.do_cell_matching: + the_bbox = BoundingBox.model_validate( + element["bbox"] + ).scaled(1 / self.scale) + text_piece = page._backend.get_text_in_rect( + the_bbox ) + element["bbox"]["token"] = text_piece - tokens.append(new_cell.model_dump()) + tc = TableCell.model_validate(element) + if self.do_cell_matching and tc.bbox is not None: + tc.bbox = tc.bbox.scaled(1 / self.scale) + table_cells.append(tc) - page_input = { - "tokens": tokens, - "width": page.size.width * self.scale, - "height": page.size.height * self.scale, - } - page_input["image"] = numpy.asarray(page.get_image(scale=self.scale)) + # Retrieving cols/rows, after post processing: + num_rows = table_out["predict_details"]["num_rows"] + num_cols = table_out["predict_details"]["num_cols"] + otsl_seq = table_out["predict_details"]["prediction"][ + "rs_seq" + ] - table_clusters, table_bboxes = zip(*in_tables) + tbl = Table( + otsl_seq=otsl_seq, + table_cells=table_cells, + num_rows=num_rows, + num_cols=num_cols, + id=table_cluster.id, + page_no=page.page_no, + cluster=table_cluster, + label=DocItemLabel.TABLE, + ) - if len(table_bboxes): - tf_output = self.tf_predictor.multi_table_predict( - page_input, table_bboxes, do_matching=self.do_cell_matching - ) - - for table_cluster, table_out in zip(table_clusters, tf_output): - table_cells = [] - for element in table_out["tf_responses"]: - - if not self.do_cell_matching: - the_bbox = BoundingBox.model_validate( - element["bbox"] - ).scaled(1 / self.scale) - text_piece = page._backend.get_text_in_rect(the_bbox) - element["bbox"]["token"] = text_piece - - tc = TableCell.model_validate(element) - if self.do_cell_matching and tc.bbox is not None: - tc.bbox = tc.bbox.scaled(1 / self.scale) - table_cells.append(tc) - - # Retrieving cols/rows, after post processing: - num_rows = table_out["predict_details"]["num_rows"] - num_cols = table_out["predict_details"]["num_cols"] - otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"] - - tbl = Table( - otsl_seq=otsl_seq, - table_cells=table_cells, - num_rows=num_rows, - num_cols=num_cols, - id=table_cluster.id, - page_no=page.page_no, - cluster=table_cluster, - label=DocItemLabel.TABLE, - ) - - page.predictions.tablestructure.table_map[table_cluster.id] = ( - tbl - ) + page.predictions.tablestructure.table_map[ + table_cluster.id + ] = tbl # For debugging purposes: if settings.debug.visualize_tables: diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index cb6068cc..93c07f38 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -8,8 +8,10 @@ import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings +from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) @@ -103,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel): return df_filtered - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch @@ -114,60 +118,64 @@ class TesseractOcrCliModel(BaseOcrModel): if not page._backend.is_valid(): yield page else: - ocr_rects = self.get_ocr_rects(page) + with TimeRecorder(conv_res, "ocr"): - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + with tempfile.NamedTemporaryFile( + suffix=".png", mode="w" + ) as image_file: + fname = image_file.name + high_res_image.save(fname) + + df = self._run_tesseract(fname) + + # _log.info(df) + + # Print relevant columns (bounding box and text) + for ix, row in df.iterrows(): + text = row["text"] + conf = row["conf"] + + l = float(row["left"]) + b = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + t = b + h + r = l + w + + cell = OcrCell( + id=ix, + text=text, + confidence=conf / 100.0, + bbox=BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + all_ocr_cells.append(cell) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells ) - with tempfile.NamedTemporaryFile( - suffix=".png", mode="w" - ) as image_file: - fname = image_file.name - high_res_image.save(fname) - - df = self._run_tesseract(fname) - - # _log.info(df) - - # Print relevant columns (bounding box and text) - for ix, row in df.iterrows(): - text = row["text"] - conf = row["conf"] - - l = float(row["left"]) - b = float(row["top"]) - w = float(row["width"]) - h = float(row["height"]) - - t = b + h - r = l + w - - cell = OcrCell( - id=ix, - text=text, - confidence=conf / 100.0, - bbox=BoundingBox.from_tuple( - coord=( - (l / self.scale) + ocr_rect.l, - (b / self.scale) + ocr_rect.t, - (r / self.scale) + ocr_rect.l, - (t / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ), - ) - all_ocr_cells.append(cell) - - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) - - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index e032dd77..136b185e 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -4,8 +4,10 @@ from typing import Iterable from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings +from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) @@ -62,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel): # Finalize the tesseractAPI self.reader.End() - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch @@ -73,57 +77,63 @@ class TesseractOcrModel(BaseOcrModel): if not page._backend.is_valid(): yield page else: - assert self.reader is not None + with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) + assert self.reader is not None - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) + ocr_rects = self.get_ocr_rects(page) - # Retrieve text snippets with their bounding boxes - self.reader.SetImage(high_res_image) - boxes = self.reader.GetComponentImages( - self.reader_RIL.TEXTLINE, True - ) - - cells = [] - for ix, (im, box, _, _) in enumerate(boxes): - # Set the area of interest. Tesseract uses Bottom-Left for the origin - self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) - - # Extract text within the bounding box - text = self.reader.GetUTF8Text().strip() - confidence = self.reader.MeanTextConf() - left = box["x"] / self.scale - bottom = box["y"] / self.scale - right = (box["x"] + box["w"]) / self.scale - top = (box["y"] + box["h"]) / self.scale - - cells.append( - OcrCell( - id=ix, - text=text, - confidence=confidence, - bbox=BoundingBox.from_tuple( - coord=(left, top, right, bottom), - origin=CoordOrigin.TOPLEFT, - ), - ) + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect ) - # del high_res_image - all_ocr_cells.extend(cells) + # Retrieve text snippets with their bounding boxes + self.reader.SetImage(high_res_image) + boxes = self.reader.GetComponentImages( + self.reader_RIL.TEXTLINE, True + ) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + cells = [] + for ix, (im, box, _, _) in enumerate(boxes): + # Set the area of interest. Tesseract uses Bottom-Left for the origin + self.reader.SetRectangle( + box["x"], box["y"], box["w"], box["h"] + ) - page.cells.extend(filtered_ocr_cells) + # Extract text within the bounding box + text = self.reader.GetUTF8Text().strip() + confidence = self.reader.MeanTextConf() + left = box["x"] / self.scale + bottom = box["y"] / self.scale + right = (box["x"] + box["w"]) / self.scale + top = (box["y"] + box["h"]) / self.scale + + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), + ) + ) + + # del high_res_image + all_ocr_cells.extend(cells) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) + + page.cells.extend(filtered_ocr_cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 8dd074cc..a8cde7ea 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -15,10 +15,15 @@ from docling.datamodel.base_models import ( ErrorItem, Page, ) -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ( + ConversionResult, + InputDocument, + ProfilingItem, + ProfilingScope, +) from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import settings -from docling.models.base_model import BaseEnrichmentModel +from docling.models.base_model import BaseEnrichmentModel, TimeRecorder from docling.utils.utils import chunkify _log = logging.getLogger(__name__) @@ -37,11 +42,11 @@ class BasePipeline(ABC): try: # These steps are building and assembling the structure of the # output DoclingDocument - conv_res = self._build_document(in_doc, conv_res) - conv_res = self._assemble_document(in_doc, conv_res) + conv_res = self._build_document(conv_res) + conv_res = self._assemble_document(conv_res) # From this stage, all operations should rely only on conv_res.output - conv_res = self._enrich_document(in_doc, conv_res) - conv_res.status = self._determine_status(in_doc, conv_res) + conv_res = self._enrich_document(conv_res) + conv_res.status = self._determine_status(conv_res) except Exception as e: conv_res.status = ConversionStatus.FAILURE if raises_on_error: @@ -50,19 +55,13 @@ class BasePipeline(ABC): return conv_res @abstractmethod - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult) -> ConversionResult: pass - def _assemble_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: return conv_res - def _enrich_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult: def _filter_elements( doc: DoclingDocument, model: BaseEnrichmentModel @@ -71,24 +70,23 @@ class BasePipeline(ABC): if model.is_processable(doc=doc, element=element): yield element - for model in self.enrichment_pipe: - for element_batch in chunkify( - _filter_elements(conv_res.document, model), - settings.perf.elements_batch_size, - ): - # TODO: currently we assume the element itself is modified, because - # we don't have an interface to save the element back to the document - for element in model( - doc=conv_res.document, element_batch=element_batch - ): # Must exhaust! - pass + with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT): + for model in self.enrichment_pipe: + for element_batch in chunkify( + _filter_elements(conv_res.document, model), + settings.perf.elements_batch_size, + ): + # TODO: currently we assume the element itself is modified, because + # we don't have an interface to save the element back to the document + for element in model( + doc=conv_res.document, element_batch=element_batch + ): # Must exhaust! + pass return conv_res @abstractmethod - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: + def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: pass @classmethod @@ -110,66 +108,68 @@ class BasePipeline(ABC): class PaginatedPipeline(BasePipeline): # TODO this is a bad name. - def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def _apply_on_pages( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: for model in self.build_pipe: - page_batch = model(page_batch) + page_batch = model(conv_res, page_batch) yield from page_batch - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(in_doc._backend, PdfDocumentBackend): + if not isinstance(conv_res.input._backend, PdfDocumentBackend): raise RuntimeError( - f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. " + f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " f"Can not convert this with a PDF pipeline. " f"Please check your format configuration on DocumentConverter." ) # conv_res.status = ConversionStatus.FAILURE # return conv_res - for i in range(0, in_doc.page_count): - conv_res.pages.append(Page(page_no=i)) + with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): - try: - # Iterate batches of pages (page_batch_size) in the doc - for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size): - start_pb_time = time.time() + for i in range(0, conv_res.input.page_count): + conv_res.pages.append(Page(page_no=i)) - # 1. Initialise the page resources - init_pages = map( - functools.partial(self.initialize_page, in_doc), page_batch + try: + # Iterate batches of pages (page_batch_size) in the doc + for page_batch in chunkify( + conv_res.pages, settings.perf.page_batch_size + ): + start_pb_time = time.time() + + # 1. Initialise the page resources + init_pages = map( + functools.partial(self.initialize_page, conv_res), page_batch + ) + + # 2. Run pipeline stages + pipeline_pages = self._apply_on_pages(conv_res, init_pages) + + for p in pipeline_pages: # Must exhaust! + pass + + end_pb_time = time.time() - start_pb_time + _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") + + except Exception as e: + conv_res.status = ConversionStatus.FAILURE + trace = "\n".join(traceback.format_exception(e)) + _log.warning( + f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n" + f"{trace}" ) + raise e - # 2. Run pipeline stages - pipeline_pages = self._apply_on_pages(init_pages) - - for p in pipeline_pages: # Must exhaust! - pass - - end_pb_time = time.time() - start_pb_time - _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") - - except Exception as e: - conv_res.status = ConversionStatus.FAILURE - trace = "\n".join(traceback.format_exception(e)) - _log.warning( - f"Encountered an error during conversion of document {in_doc.document_hash}:\n" - f"{trace}" - ) - raise e - - finally: - # Always unload the PDF backend, even in case of failure - if in_doc._backend: - in_doc._backend.unload() + finally: + # Always unload the PDF backend, even in case of failure + if conv_res.input._backend: + conv_res.input._backend.unload() return conv_res - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: + def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: status = ConversionStatus.SUCCESS for page in conv_res.pages: if page._backend is None or not page._backend.is_valid(): @@ -186,5 +186,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. # Initialise and load resources for a page @abstractmethod - def initialize_page(self, doc: InputDocument, page: Page) -> Page: + def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: pass diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index 0858af0b..90f74475 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -5,8 +5,9 @@ from docling.backend.abstract_backend import ( DeclarativeDocumentBackend, ) from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope from docling.datamodel.pipeline_options import PipelineOptions +from docling.models.base_model import TimeRecorder from docling.pipeline.base_pipeline import BasePipeline _log = logging.getLogger(__name__) @@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline): def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(in_doc._backend, DeclarativeDocumentBackend): + if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend): raise RuntimeError( - f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. " + f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. " f"Can not convert this with simple pipeline. " f"Please check your format configuration on DocumentConverter." ) @@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline): # Instead of running a page-level pipeline to build up the document structure, # the backend is expected to be of type DeclarativeDocumentBackend, which can output # a DoclingDocument straight. - - conv_res.document = in_doc._backend.convert() + with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): + conv_res.document = conv_res.input._backend.convert() return conv_res - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: + def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: # This is called only if the previous steps didn't raise. # Since we don't have anything else to evaluate, we can # safely return SUCCESS. diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 5de2e32f..446371b4 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -7,13 +7,14 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import AssembledUnit, Page -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope from docling.datamodel.pipeline_options import ( EasyOcrOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) +from docling.models.base_model import TimeRecorder from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel @@ -119,73 +120,75 @@ class StandardPdfPipeline(PaginatedPipeline): ) return None - def initialize_page(self, doc: InputDocument, page: Page) -> Page: - page._backend = doc._backend.load_page(page.page_no) # type: ignore - if page._backend is not None and page._backend.is_valid(): - page.size = page._backend.get_size() + def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: + with TimeRecorder(conv_res, "init_page"): + page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore + if page._backend is not None and page._backend.is_valid(): + page.size = page._backend.get_size() return page - def _assemble_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: all_elements = [] all_headers = [] all_body = [] - for p in conv_res.pages: - if p.assembled is not None: - for el in p.assembled.body: - all_body.append(el) - for el in p.assembled.headers: - all_headers.append(el) - for el in p.assembled.elements: - all_elements.append(el) + with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): + for p in conv_res.pages: + if p.assembled is not None: + for el in p.assembled.body: + all_body.append(el) + for el in p.assembled.headers: + all_headers.append(el) + for el in p.assembled.elements: + all_elements.append(el) - conv_res.assembled = AssembledUnit( - elements=all_elements, headers=all_headers, body=all_body - ) + conv_res.assembled = AssembledUnit( + elements=all_elements, headers=all_headers, body=all_body + ) - conv_res.document = self.glm_model(conv_res) + conv_res.document = self.glm_model(conv_res) - # Generate page images in the output - if self.pipeline_options.generate_page_images: - for page in conv_res.pages: - assert page.image is not None - page_no = page.page_no + 1 - conv_res.document.pages[page_no].image = ImageRef.from_pil( - page.image, dpi=int(72 * self.pipeline_options.images_scale) - ) - - # Generate images of the requested element types - if ( - self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ): - scale = self.pipeline_options.images_scale - for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: - continue - if ( - isinstance(element, PictureItem) - and self.pipeline_options.generate_picture_images - ) or ( - isinstance(element, TableItem) - and self.pipeline_options.generate_table_images - ): - page_ix = element.prov[0].page_no - 1 - page = conv_res.pages[page_ix] - assert page.size is not None + # Generate page images in the output + if self.pipeline_options.generate_page_images: + for page in conv_res.pages: assert page.image is not None - - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) + page_no = page.page_no + 1 + conv_res.document.pages[page_no].image = ImageRef.from_pil( + page.image, dpi=int(72 * self.pipeline_options.images_scale) ) - cropped_im = page.image.crop(crop_bbox.as_tuple()) - element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale)) + # Generate images of the requested element types + if ( + self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ): + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, DocItem) or len(element.prov) == 0: + continue + if ( + isinstance(element, PictureItem) + and self.pipeline_options.generate_picture_images + ) or ( + isinstance(element, TableItem) + and self.pipeline_options.generate_table_images + ): + page_ix = element.prov[0].page_no - 1 + page = conv_res.pages[page_ix] + assert page.size is not None + assert page.image is not None + + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin(page_height=page.size.height * scale) + ) + + cropped_im = page.image.crop(crop_bbox.as_tuple()) + element.image = ImageRef.from_pil( + cropped_im, dpi=int(72 * scale) + ) return conv_res