mirror of https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00

doing some experiments with granite-docling

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
@@ -1,4 +1,5 @@
 import logging
+import re
 import threading
 import time
 from collections.abc import Iterable
@@ -6,6 +7,7 @@ from pathlib import Path
 from typing import Optional, Union

 import numpy as np
+from docling_core.types.doc import BoundingBox, CoordOrigin, DocItem
 from PIL.Image import Image

 from docling.datamodel.accelerator_options import (
@@ -27,6 +29,37 @@ _log = logging.getLogger(__name__)
 _MLX_GLOBAL_LOCK = threading.Lock()


+class DoclingStopping:
+    def __init__(self):
+        self.pattern = re.compile(
+            r"<([a-z\_\-]+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>(<)?$"
+        )
+
+        self.bboxs: list[BoundingBox] = []
+
+    def overlaps(self, text: str) -> bool:
+        match = self.pattern.search(text)
+        if match:
+            tag_name = match.group(1)  # group 1: the DocTags element name
+            loc1 = float(match.group(2))  # groups 2-5: the four <loc_..> values
+            loc2 = float(match.group(3))
+            loc3 = float(match.group(4))
+            loc4 = float(match.group(5))
+
+            bbox = BoundingBox(
+                l=loc1, b=loc2, r=loc3, t=loc4, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+
+            for prev in self.bboxs:
+                if bbox.intersection_over_self(prev) > 1.0e-6:
+                    _log.info(f"{bbox} overlaps with {prev}")
+                    return True
+
+            self.bboxs.append(bbox)
+
+        return False
+
+
 class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
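The DoclingStopping class added above acts as a degeneration guard: overlaps() is called on the growing output string, and the $-anchored pattern only fires when the text currently ends in a complete <tag><loc_..><loc_..><loc_..><loc_..> group. Every new box is compared against all boxes recorded so far, so a re-emitted region is treated as the model looping. A minimal usage sketch, assuming the class above is in scope and docling_core is installed:

    stop = DoclingStopping()

    # First occurrence of a region: recorded, generation may continue.
    assert stop.overlaps("<text><loc_10><loc_20><loc_110><loc_120>") is False

    # Output not currently ending in a loc-quadruple never triggers the check.
    assert stop.overlaps("<text><loc_10><loc_20><loc_110><loc_120>abc") is False

    # The same region emitted again overlaps a recorded box: signal a stop.
    assert stop.overlaps("...<text><loc_10><loc_20><loc_110><loc_120>") is True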
@@ -68,6 +101,26 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
+
+        self._find_doctags_labels()
+
+    def _find_doctags_labels(self):
+        """Scan the tokenizer vocabulary for DocTags special tokens."""
+        tokenizer = (
+            self.processor.tokenizer
+            if hasattr(self.processor, "tokenizer")
+            else self.processor
+        )
+
+        self.special_tokens: dict[str, int] = {}
+        if hasattr(tokenizer, "vocab"):
+            # vocab is usually a dict mapping token_text -> token_id
+            for token_text, token_id in tokenizer.vocab.items():
+                if re.match(r"^<[a-z\_\-\d]+>$", token_text):
+                    print(f"Token ID: {token_id:6d} | Text: '{token_text}'")
+                    self.special_tokens[token_text] = token_id
+        else:
+            print("Tokenizer doesn't have a 'vocab' attribute")

     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
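The new _find_doctags_labels() helper reduces the tokenizer vocabulary to tag-shaped tokens: lowercase names built from letters, digits, underscores, and hyphens, wrapped in angle brackets, which covers both structural tags and the <loc_..> location tokens. A quick illustration of the filter on a few hand-picked strings (the sample tokens are made up for the demo):

    import re

    pattern = re.compile(r"^<[a-z\_\-\d]+>$")
    for tok in ["<text>", "<otsl>", "<loc_42>", "<page_break>", "hello", "<TEXT>"]:
        print(f"{tok!r:16} -> {bool(pattern.match(tok))}")
    # Lowercase tag-like tokens match; plain words and uppercase tags do not.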
@@ -199,6 +252,8 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             tokens: list[VlmPredictionToken] = []
             output = ""

+            stopping_criteria = DoclingStopping()
+
             # Use stream_generate for proper stop string handling
             for token in self.stream_generate(
                 self.vlm_model,
@@ -209,6 +264,10 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
                 verbose=False,
                 temp=self.temperature,
             ):
+                _log.info(
+                    f"logprobs.shape: {token.logprobs.shape} with token: {token}"
+                )
+
                 # Collect token information
                 if len(token.logprobs.shape) == 1:
                     tokens.append(
@@ -218,6 +277,26 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
                             logprob=token.logprobs[token.token],
                         )
                     )
+                    if token.text in self.special_tokens:
+                        # Get logprobs for all special tokens
+                        special_token_logprobs = []
+                        for token_text, token_id in self.special_tokens.items():
+                            logprob = token.logprobs[token_id]
+                            special_token_logprobs.append(
+                                (token_text, token_id, logprob)
+                            )
+
+                        # Sort by logprob (highest first) and take the top 5
+                        top_5_special = sorted(
+                            special_token_logprobs, key=lambda x: x[2], reverse=True
+                        )[:5]
+
+                        print("Top 5 special tokens by logprob:")
+                        for rank, (t, token_id, logprob) in enumerate(
+                            top_5_special, 1
+                        ):
+                            print(f"  {rank}. {t}: {logprob:0.3f}")
+
                 elif (
                     len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1
                 ):
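The top-5 report in the 1-D branch is a plain sort over the special-token logprobs at the current step. The same ranking with toy numbers (ids and probabilities fabricated; numpy is already imported by this module):

    import numpy as np

    logprobs = np.log(np.array([0.05, 0.6, 0.05, 0.2, 0.1]))  # toy 5-token vocab
    special_tokens = {"<text>": 1, "<loc_3>": 3, "<otsl>": 4}  # made-up ids

    ranked = sorted(
        ((t, i, logprobs[i]) for t, i in special_tokens.items()),
        key=lambda x: x[2],
        reverse=True,
    )[:5]
    for rank, (t, token_id, logprob) in enumerate(ranked, 1):
        print(f"  {rank}. {t}: {logprob:0.3f}")
    #   1. <text>: -0.511
    #   2. <loc_3>: -1.609
    #   3. <otsl>: -2.303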
@@ -228,6 +307,11 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
                             logprob=token.logprobs[0, token.token],
                         )
                     )
+
+                    if token.text in self.special_tokens:
+                        for t, i in self.special_tokens.items():
+                            print(f"{t}: {token.logprobs[0, i]:0.3f}")
+
                 else:
                     _log.warning(
                         f"incompatible shape for logprobs: {token.logprobs.shape}"
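The branches above dispatch on the shape of token.logprobs, which evidently can arrive either as (vocab,) or as (1, vocab) from stream_generate; both branches read the same scalar for the sampled token. A small numpy analogue of the dispatch (function name and data are illustrative only):

    import numpy as np

    def logprob_of(logprobs: np.ndarray, token_id: int) -> float:
        # Mirrors the shape handling above: accept (vocab,) or (1, vocab).
        if len(logprobs.shape) == 1:
            return float(logprobs[token_id])
        if len(logprobs.shape) == 2 and logprobs.shape[0] == 1:
            return float(logprobs[0, token_id])
        raise ValueError(f"incompatible shape for logprobs: {logprobs.shape}")

    vec = np.array([-3.0, -0.2, -1.5])
    assert logprob_of(vec, 1) == logprob_of(vec[None, :], 1) == -0.2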
@@ -235,6 +319,10 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):

                 output += token.text

+                if stopping_criteria.overlaps(output):
+                    _log.debug("Stopping generation due to overlapping bbox")
+                    break
+
                 # Check for any configured stop strings
                 if self.vlm_options.stop_strings:
                     if any(
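Taken together, each streamed chunk is appended to output, the overlap check runs first, and only then are the configured stop strings tested. A miniature of that control flow with a fabricated chunk stream (assumes DoclingStopping from the top of the diff):

    stopping_criteria = DoclingStopping()
    output = ""
    for text in [
        "<text><loc_1><loc_2><loc_3><loc_4>",  # first region: recorded
        "hello",                               # no loc-suffix: check stays quiet
        "<text><loc_1><loc_2><loc_3><loc_4>",  # repeated region: overlap
    ]:
        output += text
        if stopping_criteria.overlaps(output):
            print("Stopping generation due to overlapping bbox")
            break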
@@ -246,7 +334,7 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):

             generation_time = time.time() - start_time

-            _log.debug(
+            _log.info(
                 f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time:.1f} tokens/sec)."
             )