mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
new code formula model
Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com>
This commit is contained in:
parent
1b8e6c87a0
commit
460e5c5106
@ -1,4 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -11,7 +12,7 @@ from docling_core.types.doc import (
|
|||||||
TextItem,
|
TextItem,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.labels import CodeLanguageLabel
|
from docling_core.types.doc.labels import CodeLanguageLabel
|
||||||
from PIL import Image
|
from PIL import Image, ImageOps
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
||||||
@ -65,7 +66,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|||||||
_model_repo_folder = "ds4sd--CodeFormula"
|
_model_repo_folder = "ds4sd--CodeFormula"
|
||||||
elements_batch_size = 5
|
elements_batch_size = 5
|
||||||
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
||||||
expansion_factor = 0.03
|
expansion_factor = 0.18
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -206,6 +207,114 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return CodeLanguageLabel.UNKNOWN
|
return CodeLanguageLabel.UNKNOWN
|
||||||
|
|
||||||
|
# def _pad_image_with_edge_color(self, img: Union[Image.Image, np.ndarray], padding: tuple) -> Image.Image:
|
||||||
|
# """
|
||||||
|
# Pads the image with different padding sizes per side, filled with the outermost pixels.
|
||||||
|
|
||||||
|
# Parameters:
|
||||||
|
# ----------
|
||||||
|
# img : PIL.Image
|
||||||
|
# The original image.
|
||||||
|
# padding : tuple
|
||||||
|
# Padding amounts as (left, top, right, bottom).
|
||||||
|
|
||||||
|
# Returns:
|
||||||
|
# ---------
|
||||||
|
# PIL.Image: The padded image.
|
||||||
|
# """
|
||||||
|
# if isinstance(img, np.ndarray):
|
||||||
|
# img = Image.fromarray(img)
|
||||||
|
|
||||||
|
# left, top, right, bottom = padding
|
||||||
|
|
||||||
|
# img_np = np.array(img)
|
||||||
|
|
||||||
|
# if img_np.ndim == 2:
|
||||||
|
# # Grayscale
|
||||||
|
# pad_width = ((top, bottom), (left, right))
|
||||||
|
# else:
|
||||||
|
# # RGB or RGBA
|
||||||
|
# pad_width = ((top, bottom), (left, right), (0, 0))
|
||||||
|
|
||||||
|
# padded_np = np.pad(img_np, pad_width=pad_width, mode='edge')
|
||||||
|
|
||||||
|
# return Image.fromarray(padded_np, img.mode)
|
||||||
|
def _get_most_frequent_edge_color(self, pil_img: Image.Image):
|
||||||
|
"""
|
||||||
|
Compute the most frequent color along the outer edges of a PIL image.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
pil_img : Image.Image
|
||||||
|
A PIL Image in any mode (L, RGB, RGBA, etc.).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
(int) or (tuple): The most common edge color as a scalar (for grayscale) or
|
||||||
|
tuple (for RGB/RGBA).
|
||||||
|
"""
|
||||||
|
# Convert to NumPy array for easy pixel access
|
||||||
|
img_np = np.array(pil_img)
|
||||||
|
|
||||||
|
if img_np.ndim == 2:
|
||||||
|
# Grayscale-like image: shape (H, W)
|
||||||
|
# Extract edges: top row, bottom row, left col, right col
|
||||||
|
top = img_np[0, :] # shape (W,)
|
||||||
|
bottom = img_np[-1, :] # shape (W,)
|
||||||
|
left = img_np[:, 0] # shape (H,)
|
||||||
|
right = img_np[:, -1] # shape (H,)
|
||||||
|
|
||||||
|
# Concatenate all edges
|
||||||
|
edges = np.concatenate([top, bottom, left, right])
|
||||||
|
|
||||||
|
# Count frequencies
|
||||||
|
freq = Counter(edges.tolist())
|
||||||
|
most_common_value, _ = freq.most_common(1)[0]
|
||||||
|
return int(most_common_value) # single channel color
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Color image: shape (H, W, C)
|
||||||
|
top = img_np[0, :, :] # shape (W, C)
|
||||||
|
bottom = img_np[-1, :, :] # shape (W, C)
|
||||||
|
left = img_np[:, 0, :] # shape (H, C)
|
||||||
|
right = img_np[:, -1, :] # shape (H, C)
|
||||||
|
|
||||||
|
# Concatenate edges along first axis
|
||||||
|
edges = np.concatenate([top, bottom, left, right], axis=0)
|
||||||
|
|
||||||
|
# Convert each color to a tuple for counting
|
||||||
|
edges_as_tuples = [tuple(pixel) for pixel in edges]
|
||||||
|
freq = Counter(edges_as_tuples)
|
||||||
|
most_common_value, _ = freq.most_common(1)[0]
|
||||||
|
return most_common_value # e.g. (R, G, B) or (R, G, B, A)
|
||||||
|
|
||||||
|
def _pad_with_most_frequent_edge_color(
|
||||||
|
self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Pads an image (PIL or NumPy array) using the most frequent edge color.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
img : Union[Image.Image, np.ndarray]
|
||||||
|
The original image.
|
||||||
|
padding : tuple
|
||||||
|
Padding (left, top, right, bottom) in pixels.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Image.Image: A new PIL image with the specified padding.
|
||||||
|
"""
|
||||||
|
if isinstance(img, np.ndarray):
|
||||||
|
pil_img = Image.fromarray(img)
|
||||||
|
else:
|
||||||
|
pil_img = img
|
||||||
|
|
||||||
|
most_freq_color = self._get_most_frequent_edge_color(pil_img)
|
||||||
|
|
||||||
|
padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color)
|
||||||
|
return padded_img
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
@ -238,7 +347,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|||||||
assert isinstance(el.item, TextItem)
|
assert isinstance(el.item, TextItem)
|
||||||
elements.append(el.item)
|
elements.append(el.item)
|
||||||
labels.append(el.item.label)
|
labels.append(el.item.label)
|
||||||
images.append(el.image)
|
images.append(
|
||||||
|
self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
|
||||||
|
)
|
||||||
|
|
||||||
outputs = self.code_formula_model.predict(images, labels)
|
outputs = self.code_formula_model.predict(images, labels)
|
||||||
|
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user