new code formula model

Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com>
2025-07-31 14:34:40 +00:00 · 2025-03-10 16:36:18 +01:00 · 2025-03-10 16:36:18 +01:00 · 460e5c5106
commit 460e5c5106
parent 1b8e6c87a0
2 changed files with 114 additions and 3 deletions
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@ -1,4 +1,5 @@
 import re
 from collections import Counter
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
@ -11,7 +12,7 @@ from docling_core.types.doc import (
    TextItem,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from PIL import Image
+from PIL import Image, ImageOps
 from pydantic import BaseModel
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
@ -65,7 +66,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
    _model_repo_folder = "ds4sd--CodeFormula"
    elements_batch_size = 5
    images_scale = 1.66  # = 120 dpi, aligned with training data resolution
-    expansion_factor = 0.03
+    expansion_factor = 0.18
    def __init__(
        self,
@ -206,6 +207,114 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
        except ValueError:
            return CodeLanguageLabel.UNKNOWN
    # def _pad_image_with_edge_color(self, img: Union[Image.Image, np.ndarray], padding: tuple) -> Image.Image:
    #     """
    #     Pads the image with different padding sizes per side, filled with the outermost pixels.
    #     Parameters:
    #     ----------
    #         img : PIL.Image
    #             The original image.
    #         padding : tuple
    #             Padding amounts as (left, top, right, bottom).
    #     Returns:
    #     ---------
    #         PIL.Image: The padded image.
    #     """
    #     if isinstance(img, np.ndarray):
    #         img = Image.fromarray(img)
    #     left, top, right, bottom = padding
    #     img_np = np.array(img)
    #     if img_np.ndim == 2:
    #         # Grayscale
    #         pad_width = ((top, bottom), (left, right))
    #     else:
    #         # RGB or RGBA
    #         pad_width = ((top, bottom), (left, right), (0, 0))
    #     padded_np = np.pad(img_np, pad_width=pad_width, mode='edge')
    #     return Image.fromarray(padded_np, img.mode)
    def _get_most_frequent_edge_color(self, pil_img: Image.Image):
        """
        Compute the most frequent color along the outer edges of a PIL image.
        Parameters
        ----------
            pil_img : Image.Image
                A PIL Image in any mode (L, RGB, RGBA, etc.).
        Returns
        -------
            (int) or (tuple): The most common edge color as a scalar (for grayscale) or
                tuple (for RGB/RGBA).
        """
        # Convert to NumPy array for easy pixel access
        img_np = np.array(pil_img)
        if img_np.ndim == 2:
            # Grayscale-like image: shape (H, W)
            # Extract edges: top row, bottom row, left col, right col
            top = img_np[0, :]  # shape (W,)
            bottom = img_np[-1, :]  # shape (W,)
            left = img_np[:, 0]  # shape (H,)
            right = img_np[:, -1]  # shape (H,)
            # Concatenate all edges
            edges = np.concatenate([top, bottom, left, right])
            # Count frequencies
            freq = Counter(edges.tolist())
            most_common_value, _ = freq.most_common(1)[0]
            return int(most_common_value)  # single channel color
        else:
            # Color image: shape (H, W, C)
            top = img_np[0, :, :]  # shape (W, C)
            bottom = img_np[-1, :, :]  # shape (W, C)
            left = img_np[:, 0, :]  # shape (H, C)
            right = img_np[:, -1, :]  # shape (H, C)
            # Concatenate edges along first axis
            edges = np.concatenate([top, bottom, left, right], axis=0)
            # Convert each color to a tuple for counting
            edges_as_tuples = [tuple(pixel) for pixel in edges]
            freq = Counter(edges_as_tuples)
            most_common_value, _ = freq.most_common(1)[0]
            return most_common_value  # e.g. (R, G, B) or (R, G, B, A)
    def _pad_with_most_frequent_edge_color(
        self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
    ):
        """
        Pads an image (PIL or NumPy array) using the most frequent edge color.
        Parameters
        ----------
            img : Union[Image.Image, np.ndarray]
                The original image.
            padding : tuple
                Padding (left, top, right, bottom) in pixels.
        Returns
        -------
            Image.Image: A new PIL image with the specified padding.
        """
        if isinstance(img, np.ndarray):
            pil_img = Image.fromarray(img)
        else:
            pil_img = img
        most_freq_color = self._get_most_frequent_edge_color(pil_img)
        padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color)
        return padded_img
    def __call__(
        self,
        doc: DoclingDocument,
@ -238,7 +347,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            assert isinstance(el.item, TextItem)
            elements.append(el.item)
            labels.append(el.item.label)
-            images.append(el.image)
+            images.append(
                self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
            )
        outputs = self.code_formula_model.predict(images, labels)
--- a/tests/data/pdf/code_and_formula.pdf
+++ b/tests/data/pdf/code_and_formula.pdf