docling/docling/models/page_preprocessing_model.py
Christoph Auer 1fa7cd9855 Fundamental refactoring for multi-format support
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2024-10-01 16:54:09 +02:00

51 lines
1.6 KiB
Python

from typing import Iterable
from PIL import ImageDraw
from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel
class PagePreprocessingModel(AbstractPageModel):
def __init__(self, config):
self.config = config
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
page = self._populate_page_images(page)
page = self._parse_page_cells(page)
yield page
# Generate the page image and store it in the page object
def _populate_page_images(self, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
images_scale = self.config["images_scale"]
# user requested scales
if images_scale is not None:
page._default_image_scale = images_scale
page.get_image(
scale=images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page