Mirror of https://github.com/DS4SD/docling.git, synced 2025-12-08 12:48:28 +00:00
dev: use granite-docling for table structure
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
docling/models/table_structure_model_vlm.py (new file, 214 additions)
@@ -0,0 +1,214 @@
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import List, Optional

from docling_core.types.doc import DocItemLabel
from docling_core.types.doc.utils import parse_otsl_table_content
from PIL import Image

from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import (
    Cluster,
    Page,
    Table,
    TableStructurePrediction,
    VlmPredictionToken,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    TableStructureOptions,
)
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import HuggingFaceModelDownloadMixin
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class TableStructureModelVlmMlx(BasePageModel, HuggingFaceModelDownloadMixin):
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: TableStructureOptions,
        accelerator_options: AcceleratorOptions,
    ):
        self.options = options
        model_repo_id = "ds4sd/granite-docling-258m-2-9-2025-v2-mlx-bf16"

        self.max_tokens = 4096
        self.temperature = 0
        self.stop_strings = ["</doctag>", "<end_of_utterance>"]

        self.enabled = enabled
        if self.enabled:
            try:
                from mlx_vlm import load, stream_generate  # type: ignore
                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
                from mlx_vlm.utils import load_config  # type: ignore
            except ImportError:
                raise ImportError(
                    "mlx-vlm is not installed. Please install it via "
                    "`pip install mlx-vlm` to use MLX VLM models."
                )

            repo_cache_folder = model_repo_id.replace("/", "--")

            self.apply_chat_template = apply_chat_template
            self.stream_generate = stream_generate

            # Resolve the model artifacts: download from the HF Hub if no
            # local path is given, otherwise reuse the cached snapshot.
            if artifacts_path is None:
                artifacts_path = self.download_models(model_repo_id)
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

            # Load the model, processor, and config from the resolved path.
            self.vlm_model, self.processor = load(artifacts_path)
            self.config = load_config(artifacts_path)

        self.scale = 2.0  # Scale up table input images to 144 dpi

    def _predict_images(self, image_batch: Iterable[Image.Image]) -> Iterable[str]:
        user_prompt = "Convert table to OTSL."

        # Format the prompt once with the MLX chat template; it is reused
        # for every table image in the batch.
        formatted_prompt = self.apply_chat_template(
            self.processor, self.config, user_prompt, num_images=1
        )

        for image in image_batch:
            start_time = time.time()
            _log.debug("start generating ...")

            tokens: list[VlmPredictionToken] = []
            output = ""

            # Use stream_generate so generation can be cut off as soon as a
            # stop string appears in the output.
            for token in self.stream_generate(
                self.vlm_model,
                self.processor,
                formatted_prompt,
                [image],  # MLX stream_generate expects a list of images
                max_tokens=self.max_tokens,
                verbose=False,
                temp=self.temperature,
            ):
                # Collect per-token text and log-probability information.
                if len(token.logprobs.shape) == 1:
                    tokens.append(
                        VlmPredictionToken(
                            text=token.text,
                            token=token.token,
                            logprob=token.logprobs[token.token],
                        )
                    )
                elif len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1:
                    tokens.append(
                        VlmPredictionToken(
                            text=token.text,
                            token=token.token,
                            logprob=token.logprobs[0, token.token],
                        )
                    )
                else:
                    _log.warning(
                        f"incompatible shape for logprobs: {token.logprobs.shape}"
                    )

                output += token.text

                # Stop as soon as any configured stop string shows up.
                if self.stop_strings and any(
                    stop_str in output for stop_str in self.stop_strings
                ):
                    _log.debug("Stopping generation due to stop string match")
                    break

            generation_time = time.time() - start_time
            _log.debug(
                f"{generation_time:.2f} seconds for {len(tokens)} tokens "
                f"({len(tokens) / generation_time:.1f} tokens/sec)."
            )

            yield output

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        # Convert to a list to allow multiple iterations.
        pages = list(page_batch)

        # Collect the table crops of all valid pages, remembering which
        # tables belong to which page.
        table_images: List[Image.Image] = []
        table_clusters: List[Cluster] = []
        pages_to_tables: List[List[int]] = []

        tbl_ix = 0
        for page in pages:
            assert page._backend is not None
            if not page._backend.is_valid():
                pages_to_tables.append([])
                continue

            table_indexes = []
            assert page.predictions.layout is not None
            for cluster in page.predictions.layout.clusters:
                if cluster.label not in {
                    DocItemLabel.TABLE,
                    DocItemLabel.DOCUMENT_INDEX,
                }:
                    continue

                table_image = page.get_image(scale=self.scale, cropbox=cluster.bbox)
                assert table_image is not None

                table_clusters.append(cluster)
                table_images.append(table_image)

                table_indexes.append(tbl_ix)
                tbl_ix += 1

            pages_to_tables.append(table_indexes)

        assert len(pages) == len(pages_to_tables)

        # Process all table crops with one batch prediction.
        batch_predictions = []
        if table_images:
            with TimeRecorder(conv_res, "table_structure"):
                batch_predictions = list(self._predict_images(table_images))
            assert len(batch_predictions) == len(table_images)

        for page, page_tables_map in zip(pages, pages_to_tables):
            if not page_tables_map:
                yield page
                continue  # avoid yielding a page without tables twice

            page.predictions.tablestructure = TableStructurePrediction()  # dummy

            for tbl_ix in page_tables_map:
                otsl_seq = batch_predictions[tbl_ix]
                table_cluster = table_clusters[tbl_ix]

                _log.debug(f"{otsl_seq=}")
                table_data = parse_otsl_table_content(otsl_seq)
                _log.debug(f"{table_data.num_rows=}")
                _log.debug(f"{table_data.num_cols=}")

                tbl = Table(
                    otsl_seq=[otsl_seq],
                    table_cells=table_data.table_cells,
                    num_rows=table_data.num_rows,
                    num_cols=table_data.num_cols,
                    id=table_cluster.id,
                    page_no=page.page_no,
                    cluster=table_cluster,
                    label=table_cluster.label,
                )

                page.predictions.tablestructure.table_map[table_cluster.id] = tbl

            yield page
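The model is prompted with "Convert table to OTSL." and its raw output is handed to parse_otsl_table_content. Below is a minimal sketch of that parsing step in isolation; the OTSL sequence is hand-written for illustration (assuming docling's doctags token vocabulary: <fcel> full cell, <ecel> empty cell, <lcel>/<ucel> span continuations, <nl> row break), not actual model output:

from docling_core.types.doc.utils import parse_otsl_table_content

# Hand-written OTSL for a 2x2 table; cell text follows each <fcel> token.
otsl_seq = "<fcel>name<fcel>score<nl><fcel>alpha<fcel>0.9<nl>"

table_data = parse_otsl_table_content(otsl_seq)
print(table_data.num_rows, table_data.num_cols)  # expected: 2 2
print(len(table_data.table_cells))  # expected: 4 parsed cells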
docling/pipeline/standard_pdf_pipeline.py

@@ -29,6 +29,7 @@ from docling.models.page_preprocessing_model import (
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
 from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.table_structure_model_vlm import TableStructureModelVlmMlx
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.model_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -81,7 +82,8 @@ class StandardPdfPipeline(PaginatedPipeline):
                 options=pipeline_options.layout_options,
             ),
             # Table structure model
-            TableStructureModel(
+            # TableStructureModel(
+            TableStructureModelVlmMlx(
                 enabled=pipeline_options.do_table_structure,
                 artifacts_path=artifacts_path,
                 options=pipeline_options.table_structure_options,
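With the hunk above applied, the MLX VLM model takes the place of the classic TableStructureModel whenever do_table_structure is enabled. A minimal sketch of exercising it through docling's public converter API ("scanned_report.pdf" is a hypothetical input file):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Enable the table-structure stage; with this commit it is served by
# TableStructureModelVlmMlx instead of the classic TableStructureModel.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_table_structure = True

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("scanned_report.pdf")  # hypothetical input
print(result.document.export_to_markdown())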