mirror of https://github.com/DS4SD/docling.git (synced 2025-07-24 19:14:23 +00:00)

commit 0418d2887f: Merge f4c1836c96 into 98e2fcff63
@@ -63,6 +63,7 @@ from docling.datamodel.vlm_model_specs import (
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
+    VLM2STAGE,
     VlmModelType,
 )
 from docling.document_converter import (
@@ -627,6 +628,12 @@ def convert( # noqa: C901
                     "To run SmolDocling faster, please install mlx-vlm:\n"
                     "pip install mlx-vlm"
                 )
+        elif vlm_model == VlmModelType.VLM2STAGE:
+            pipeline_options.vlm_options = VLM2STAGE
+        else:
+            raise ValueError(
+                f"{vlm_model} is not of type GRANITE_VISION, GRANITE_VISION_OLLAMA, SMOLDOCLING_TRANSFORMERS or VLM2STAGE"
+            )

         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
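
A minimal programmatic sketch of what this new CLI branch selects (not part of the commit; it mirrors the pipeline_options assignment and the PdfFormatOption wiring shown above, with an assumed input path):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import VLM2STAGE
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = VLM2STAGE  # same assignment the CLI makes for "vlm2stage"

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("document.pdf")  # assumed input path
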
@@ -11,12 +11,13 @@ from docling.datamodel.pipeline_options_asr_model import (
     # ApiAsrOptions,
     InferenceAsrFramework,
     InlineAsrNativeWhisperOptions,
-    TransformersModelType,
+    InlineAsrOptions,
+    # TransformersModelType,
 )

 _log = logging.getLogger(__name__)

-WHISPER_TINY = InlineAsrNativeWhisperOptions(
+WHISPER_TINY: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="tiny",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -27,7 +28,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )

-WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+WHISPER_SMALL: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="small",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -38,7 +39,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )

-WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+WHISPER_MEDIUM: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="medium",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -49,7 +50,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )

-WHISPER_BASE = InlineAsrNativeWhisperOptions(
+WHISPER_BASE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="base",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -60,7 +61,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )

-WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+WHISPER_LARGE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="large",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -71,7 +72,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )

-WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+WHISPER_TURBO: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="turbo",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -26,8 +26,6 @@ class LayoutModelConfig(BaseModel):
         return self.repo_id.replace("/", "--")

-
 # HuggingFace Layout Models
-
 # Default Docling Layout Model
 DOCLING_LAYOUT_V2 = LayoutModelConfig(
     name="docling_layout_v2",
@@ -12,10 +12,16 @@ from pydantic import (
 )
 from typing_extensions import deprecated

-from docling.datamodel import asr_model_specs
-
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+)
 from docling.datamodel.layout_model_specs import (
     DOCLING_LAYOUT_EGRET_LARGE,
     DOCLING_LAYOUT_EGRET_MEDIUM,
@@ -33,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     InferenceFramework,
     InlineVlmOptions,
     ResponseFormat,
+    TwoStageVlmOptions,
 )
 from docling.datamodel.vlm_model_specs import (
     GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
@@ -270,8 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
         smoldocling_vlm_conversion_options
+        # SMOLDOCLING_TRANSFORMERS
     )

@@ -286,7 +294,7 @@ class LayoutOptions(BaseModel):


 class AsrPipelineOptions(PipelineOptions):
-    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    asr_options: Union[InlineAsrOptions] = WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None

@@ -5,10 +5,11 @@ from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated

 from docling.datamodel.accelerator_options import AcceleratorDevice
-from docling.datamodel.pipeline_options_vlm_model import (
-    # InferenceFramework,
-    TransformersModelType,
-)
+
+# from docling.datamodel.pipeline_options_vlm_model import (
+# InferenceFramework,
+# TransformersModelType,
+# )


 class BaseAsrOptions(BaseModel):
@@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated

 from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.layout_model_specs import (
+    LayoutModelConfig,
+)


 class BaseVlmOptions(BaseModel):
@@ -87,3 +90,12 @@ class ApiVlmOptions(BaseVlmOptions):
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat
+
+
+class TwoStageVlmOptions(BaseModel):
+    kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
+
+    response_format: ResponseFormat  # final response of the VLM
+
+    layout_options: LayoutModelConfig  # = DOCLING_LAYOUT_V2
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions]  # = SMOLDOCLING_TRANSFORMERS
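
TwoStageVlmOptions couples a layout-detection spec (stage 1) with an inline or API VLM spec (stage 2); a construction sketch mirroring the VLM2STAGE preset defined later in this commit:

from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options_vlm_model import TwoStageVlmOptions
from docling.datamodel.vlm_model_specs import SMOLDOCLING_MLX

two_stage = TwoStageVlmOptions(
    vlm_options=SMOLDOCLING_MLX,          # stage 2: the VLM that produces the final output
    layout_options=DOCLING_LAYOUT_HERON,  # stage 1: layout model that seeds the prompt
    response_format=SMOLDOCLING_MLX.response_format,  # final format is the stage-2 VLM's
)
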
@@ -6,12 +6,17 @@ from pydantic import (
 )

 from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.layout_model_specs import (
+    DOCLING_LAYOUT_HERON,
+    DOCLING_LAYOUT_V2,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
     InlineVlmOptions,
     ResponseFormat,
     TransformersModelType,
+    TwoStageVlmOptions,
 )

 _log = logging.getLogger(__name__)
@@ -137,8 +142,15 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+VLM2STAGE = TwoStageVlmOptions(
+    vlm_options=SMOLDOCLING_MLX,
+    layout_options=DOCLING_LAYOUT_HERON,
+    response_format=SMOLDOCLING_MLX.response_format,
+)
+

 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
     GRANITE_VISION = "granite_vision"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    VLM2STAGE = "vlm2stage"
@@ -3,9 +3,16 @@ from collections.abc import Iterable
 from typing import Generic, Optional, Protocol, Type

 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL import Image
 from typing_extensions import TypeVar

-from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
+from docling.datamodel.base_models import (
+    Cluster,
+    ItemAndImageEnrichmentElement,
+    Page,
+    TextCell,
+    VlmPredictionToken,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
 from docling.datamodel.settings import settings
@@ -19,6 +26,9 @@ class BaseModelWithOptions(Protocol):


 class BasePageModel(ABC):
+    scale: float  # scale with which the page-image needs to be created (dpi = 72*scale)
+    max_size: int  # max size of width/height of page-image
+
     @abstractmethod
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
@@ -26,6 +36,30 @@ class BasePageModel(ABC):
         pass


+class BaseLayoutModel(BasePageModel):
+    @abstractmethod
+    def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
+        pass
+
+    @abstractmethod
+    def postprocess_on_page_image(
+        self, *, page: Page, clusters: list[Cluster]
+    ) -> tuple[Page, list[Cluster], list[TextCell]]:
+        pass
+
+
+class BaseVlmModel(BasePageModel):
+    @abstractmethod
+    def get_user_prompt(self, page: Optional[Page]) -> str:
+        pass
+
+    @abstractmethod
+    def predict_on_page_image(
+        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
+    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
+        pass
+
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)

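
For illustration, a hypothetical no-op implementation of the new BaseVlmModel contract (not part of the commit), showing what subclasses must provide:

from collections.abc import Iterable
from typing import Optional

from PIL import Image

from docling.datamodel.base_models import Page, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BaseVlmModel


class EchoVlmModel(BaseVlmModel):  # hypothetical example
    scale = 2.0     # render pages at 144 dpi (72 * scale)
    max_size = 512  # cap on page-image width/height

    def get_user_prompt(self, page: Optional[Page]) -> str:
        return "Convert this page to docling."

    def predict_on_page_image(
        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
        return "<doctags></doctags>", []  # a real model would run inference here

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        yield from page_batch  # a real model attaches predictions to each page
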
@@ -7,6 +7,7 @@ from typing import Optional

 import numpy as np
 from docling_core.types.doc import DocItemLabel
+from docling_core.types.doc.page import TextCell
 from PIL import Image

 from docling.datamodel.accelerator_options import AcceleratorOptions
@@ -15,7 +16,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
 from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BaseLayoutModel, BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
@@ -25,7 +26,7 @@ from docling.utils.visualization import draw_clusters
 _log = logging.getLogger(__name__)


-class LayoutModel(BasePageModel):
+class LayoutModel(BaseLayoutModel):
     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
         DocItemLabel.FOOTNOTE,
@@ -158,6 +159,7 @@ class LayoutModel(BasePageModel):
                     page_image = page.get_image(scale=1.0)
                     assert page_image is not None

+                    """
                     clusters = []
                     for ix, pred_item in enumerate(
                         self.layout_predictor.predict(page_image)
@@ -176,14 +178,18 @@ class LayoutModel(BasePageModel):
                             cells=[],
                         )
                         clusters.append(cluster)
+                    """
+                    predicted_clusters = self.predict_on_page_image(
+                        page_image=page_image
+                    )

                     if settings.debug.visualize_raw_layout:
                         self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, clusters, mode_prefix="raw"
+                            conv_res, page, predicted_clusters, mode_prefix="raw"
                         )

                     # Apply postprocessing
-
+                    """
                     processed_clusters, processed_cells = LayoutPostprocessor(
                         page, clusters, self.options
                     ).postprocess()
@@ -210,6 +216,30 @@ class LayoutModel(BasePageModel):
                     page.predictions.layout = LayoutPrediction(
                         clusters=processed_clusters
                     )
+                    """
+                    page, processed_clusters, processed_cells = (
+                        self.postprocess_on_page_image(
+                            page=page, clusters=predicted_clusters
+                        )
+                    )
+
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )
+
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )

                     if settings.debug.visualize_layout:
                         self.draw_clusters_and_cells_side_by_side(
@@ -217,3 +247,34 @@ class LayoutModel(BasePageModel):
                     )

                 yield page
+
+    def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
+        pred_items = self.layout_predictor.predict(page_image)
+
+        clusters = []
+        for ix, pred_item in enumerate(pred_items):
+            label = DocItemLabel(
+                pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+            )  # Temporary, until docling-ibm-model uses docling-core types
+            cluster = Cluster(
+                id=ix,
+                label=label,
+                confidence=pred_item["confidence"],
+                bbox=BoundingBox.model_validate(pred_item),
+                cells=[],
+            )
+            clusters.append(cluster)
+
+        return clusters
+
+    def postprocess_on_page_image(
+        self, *, page: Page, clusters: list[Cluster]
+    ) -> tuple[Page, list[Cluster], list[TextCell]]:
+        processed_clusters, processed_cells = LayoutPostprocessor(
+            page, clusters, self.options
+        ).postprocess()
+        # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+
+        page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
+
+        return page, processed_clusters, processed_cells
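
The split into predict_on_page_image and postprocess_on_page_image lets callers (such as the TwoStageVlmModel added later in this commit) run detection and postprocessing as separate steps; roughly, assuming an initialized LayoutModel and a rendered page:

# layout_model: LayoutModel, page: Page, page_image: PIL image (assumed already set up)
predicted_clusters = layout_model.predict_on_page_image(page_image=page_image)
page, processed_clusters, processed_cells = layout_model.postprocess_on_page_image(
    page=page, clusters=predicted_clusters
)  # also sets page.predictions.layout as a side effect
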
@@ -5,17 +5,19 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Any, Optional

+from PIL import Image
+
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
-from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
     TransformersPromptStyle,
 )
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, BaseVlmModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -25,7 +27,7 @@ from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)


-class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
+class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -37,6 +39,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix

         self.vlm_options = vlm_options

+        self.scale = self.vlm_options.scale
+        self.max_size = 512
+        if isinstance(self.vlm_options.max_size, int):
+            self.max_size = self.vlm_options.max_size
+
         if self.enabled:
             import torch
             from transformers import (
@@ -119,6 +126,43 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             # Load generation config
             self.generation_config = GenerationConfig.from_pretrained(artifacts_path)

+    def get_user_prompt(self, page: Optional[Page]) -> str:
+        # Define prompt structure
+        user_prompt = ""
+        if callable(self.vlm_options.prompt) and page is not None:
+            user_prompt = self.vlm_options.prompt(page.parsed_page)
+        elif isinstance(self.vlm_options.prompt, str):
+            user_prompt = self.vlm_options.prompt
+
+        prompt = self.formulate_prompt(user_prompt)
+        return prompt
+
+    def predict_on_page_image(
+        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
+    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
+        output = ""
+
+        inputs = self.processor(
+            text=prompt, images=[page_image], return_tensors="pt"
+        ).to(self.device)
+
+        # Call model to generate:
+        generated_ids = self.vlm_model.generate(
+            **inputs,
+            max_new_tokens=self.max_new_tokens,
+            use_cache=self.use_cache,
+            temperature=self.temperature,
+            generation_config=self.generation_config,
+            **self.vlm_options.extra_generation_config,
+        )
+
+        output = self.processor.batch_decode(
+            generated_ids[:, inputs["input_ids"].shape[1] :],
+            skip_special_tokens=False,
+        )[0]
+
+        return output, []
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
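
With get_user_prompt and predict_on_page_image factored out, a single-page prediction can be sketched as follows (assuming an initialized HuggingFaceTransformersVlmModel and a parsed page):

# vlm_model: HuggingFaceTransformersVlmModel, page: Page (assumed already set up)
prompt = vlm_model.get_user_prompt(page)
page_image = page.get_image(scale=vlm_model.scale, max_size=vlm_model.max_size)
text, tokens = vlm_model.predict_on_page_image(
    page_image=page_image, prompt=prompt, output_tokens=False
)
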
@@ -130,22 +174,29 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None

-                    hi_res_image = page.get_image(
+                    page_image = page.get_image(
                         scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                     )

+                    assert page_image is not None
+
-                    # Define prompt structure
+                    """
                     if callable(self.vlm_options.prompt):
                         user_prompt = self.vlm_options.prompt(page.parsed_page)
                     else:
                         user_prompt = self.vlm_options.prompt
                     prompt = self.formulate_prompt(user_prompt)

-                    inputs = self.processor(
-                        text=prompt, images=[hi_res_image], return_tensors="pt"
-                    ).to(self.device)
+                    """
+                    prompt = self.get_user_prompt(page=page)
+
                     start_time = time.time()
+
+                    """
+                    inputs = self.processor(
+                        text=prompt, images=[page_image], return_tensors="pt"
+                    ).to(self.device)
@@ -166,9 +217,14 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
+                    """
+                    generated_text = self.predict_on_page_image(
+                        page_image=page_image, prompt=prompt, output_tokens=False
+                    )

                     page.predictions.vlm_response = VlmPrediction(
-                        text=generated_texts,
-                        generation_time=generation_time,
+                        text=generated_text,
+                        generation_time=time.time() - start_time,
                     )

                 yield page
@@ -4,13 +4,15 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional

+from PIL import Image
+
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
 from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BasePageModel, BaseVlmModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -19,7 +21,7 @@ from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)


-class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
+class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -28,10 +30,15 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
-
         self.vlm_options = vlm_options
+
         self.max_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
+        self.scale = self.vlm_options.scale
+
+        self.max_size = 512
+        if isinstance(self.vlm_options.max_size, int):
+            self.max_size = self.vlm_options.max_size

         if self.enabled:
             try:
@@ -60,6 +67,55 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
             self.vlm_model, self.processor = load(artifacts_path)
             self.config = load_config(artifacts_path)

+    def get_user_prompt(self, page: Optional[Page]) -> str:
+        if callable(self.vlm_options.prompt) and page is not None:
+            return self.vlm_options.prompt(page.parsed_page)
+        else:
+            user_prompt = self.vlm_options.prompt
+            prompt = self.apply_chat_template(
+                self.processor, self.config, user_prompt, num_images=1
+            )
+            return prompt
+
+    def predict_on_page_image(
+        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
+    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
+        tokens = []
+        output = ""
+        for token in self.stream_generate(
+            self.vlm_model,
+            self.processor,
+            prompt,
+            [page_image],
+            max_tokens=self.max_tokens,
+            verbose=False,
+            temp=self.temperature,
+        ):
+            if len(token.logprobs.shape) == 1:
+                tokens.append(
+                    VlmPredictionToken(
+                        text=token.text,
+                        token=token.token,
+                        logprob=token.logprobs[token.token],
+                    )
+                )
+            elif len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1:
+                tokens.append(
+                    VlmPredictionToken(
+                        text=token.text,
+                        token=token.token,
+                        logprob=token.logprobs[0, token.token],
+                    )
+                )
+            else:
+                _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
+
+            output += token.text
+            if "</doctag>" in token.text:
+                break
+
+        return output, tokens
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -71,19 +127,23 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                     assert page.size is not None

-                    hi_res_image = page.get_image(
+                    page_image = page.get_image(
                         scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                     )
-                    if hi_res_image is not None:
-                        im_width, im_height = hi_res_image.size
+                    """
+                    if page_image is not None:
+                        im_width, im_height = page_image.size
+                    """
+                    assert page_image is not None

                     # populate page_tags with predicted doc tags
                     page_tags = ""

-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
+                    if page_image:
+                        if page_image.mode != "RGB":
+                            page_image = page_image.convert("RGB")

+                    """
                     if callable(self.vlm_options.prompt):
                         user_prompt = self.vlm_options.prompt(page.parsed_page)
                     else:
@@ -91,11 +151,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         prompt = self.apply_chat_template(
                             self.processor, self.config, user_prompt, num_images=1
                         )
-
-                    start_time = time.time()
-                    _log.debug("start generating ...")
+                    """
+                    prompt = self.get_user_prompt(page)

                     # Call model to generate:
+                    start_time = time.time()
+                    """
                     tokens: list[VlmPredictionToken] = []

                     output = ""
@@ -103,7 +164,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         self.vlm_model,
                         self.processor,
                         prompt,
-                        [hi_res_image],
+                        [page_image],
                         max_tokens=self.max_tokens,
                         verbose=False,
                         temp=self.temperature,
@@ -135,13 +196,20 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         output += token.text
                         if "</doctag>" in token.text:
                             break
+                    """
+                    output, tokens = self.predict_on_page_image(
+                        page_image=page_image, prompt=prompt, output_tokens=True
+                    )

                     generation_time = time.time() - start_time
                     page_tags = output

+                    """
                     _log.debug(
                         f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
                     )
+                    """

                     page.predictions.vlm_response = VlmPrediction(
                         text=page_tags,
                         generation_time=generation_time,
docling/models/vlm_models_inline/two_stage_vlm_model.py (new file, +119 lines)

@@ -0,0 +1,119 @@
+import importlib.metadata
+import logging
+import time
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Optional
+
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import Cluster, Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
+from docling.models.base_model import BaseLayoutModel, BasePageModel, BaseVlmModel
+from docling.models.layout_model import LayoutModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
+    def __init__(
+        self,
+        *,
+        layout_model: BaseLayoutModel,
+        vlm_model: BaseVlmModel,
+    ):
+        self.layout_model = layout_model
+        self.vlm_model = vlm_model
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "two-staged-vlm"):
+                    assert page.size is not None
+
+                    page_image = page.get_image(
+                        scale=self.vlm_model.scale, max_size=self.vlm_model.max_size
+                    )
+                    assert page_image is not None
+
+                    pred_clusters = self.layout_model.predict_on_page_image(
+                        page_image=page_image
+                    )
+
+                    page, processed_clusters, processed_cells = (
+                        self.layout_model.postprocess_on_page_image(
+                            page=page, clusters=pred_clusters
+                        )
+                    )
+
+                    user_prompt = self.vlm_model.get_user_prompt(page=page)
+                    prompt = self.formulate_prompt(
+                        user_prompt=user_prompt,
+                        clusters=processed_clusters,
+                        image_width=page_image.width,
+                        image_height=page_image.height,
+                    )
+
+                    start_time = time.time()
+                    generated_text, generated_tokens = (
+                        self.vlm_model.predict_on_page_image(
+                            page_image=page_image, prompt=prompt
+                        )
+                    )
+                    print("generated-text: \n", generated_text, "\n")
+                    page.predictions.vlm_response = VlmPrediction(
+                        text=generated_text,
+                        generation_time=time.time() - start_time,
+                        generated_tokens=generated_tokens,
+                    )
+                    exit(-1)
+
+                yield page
+
+    def formulate_prompt(
+        self,
+        *,
+        user_prompt: str,
+        clusters: list[Cluster],
+        image_width: int,
+        image_height: int,
+        vlm_width: int = 512,
+        vlm_height: int = 512,
+    ) -> str:
+        """Formulate a prompt for the VLM."""
+
+        known_clusters = ["here is a list of unsorted text-blocks:", "<doctags>"]
+        for cluster in clusters:
+            print(" => ", cluster)
+
+            loc_l = f"<loc_{int(vlm_width * cluster.bbox.l / image_width)}>"
+            loc_b = f"<loc_{int(vlm_height * cluster.bbox.b / image_height)}>"
+            loc_r = f"<loc_{int(vlm_width * cluster.bbox.r / image_width)}>"
+            loc_t = f"<loc_{int(vlm_height * cluster.bbox.t / image_height)}>"
+
+            known_clusters.append(
+                f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}</{cluster.label}>"
+            )
+
+        known_clusters.append("</doctags>")
+
+        user_prompt = "\n".join(known_clusters) + f"\n\n{user_prompt}"
+        print("user-prompt: ", user_prompt, "\n")
+
+        return user_prompt
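
formulate_prompt rescales each cluster's bounding box from page-image pixels into the VLM's default 512x512 location grid before emitting <loc_...> tokens; a worked check of that arithmetic with made-up numbers:

vlm_width = vlm_height = 512        # default grid from formulate_prompt's signature
image_width = image_height = 1024   # assumed page-image size
l, t, r, b = 100.0, 50.0, 300.0, 200.0  # assumed cluster bbox in image pixels

loc_l = f"<loc_{int(vlm_width * l / image_width)}>"    # <loc_50>
loc_t = f"<loc_{int(vlm_height * t / image_height)}>"  # <loc_25>
loc_r = f"<loc_{int(vlm_width * r / image_width)}>"    # <loc_150>
loc_b = f"<loc_{int(vlm_height * b / image_height)}>"  # <loc_100>
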
@@ -26,21 +26,24 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import (
-    VlmPipelineOptions,
-)
+from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
     InlineVlmOptions,
     ResponseFormat,
+    TwoStageVlmOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
+from docling.models.layout_model import LayoutModel
 from docling.models.vlm_models_inline.hf_transformers_model import (
     HuggingFaceTransformersVlmModel,
 )
 from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
+from docling.models.vlm_models_inline.two_stage_vlm_model import (
+    TwoStageVlmModel,
+)
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder

@@ -107,6 +110,53 @@ class VlmPipeline(PaginatedPipeline):
                 raise ValueError(
                     f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
                 )
+        elif isinstance(self.pipeline_options.vlm_options, TwoStageVlmOptions):
+            twostagevlm_options = cast(
+                TwoStageVlmOptions, self.pipeline_options.vlm_options
+            )
+
+            stage_1_options = twostagevlm_options.layout_options
+            stage_2_options = twostagevlm_options.vlm_options
+
+            layout_model = LayoutModel(
+                artifacts_path=artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                options=LayoutOptions(
+                    create_orphan_clusters=False, model_spec=stage_1_options
+                ),
+            )
+
+            if (
+                isinstance(stage_2_options, InlineVlmOptions)
+                and stage_2_options.inference_framework == InferenceFramework.MLX
+            ):
+                vlm_model_mlx = HuggingFaceMlxModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=stage_2_options,
+                )
+                self.build_pipe = [
+                    TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
+                ]
+            elif (
+                isinstance(stage_2_options, InlineVlmOptions)
+                and stage_2_options.inference_framework
+                == InferenceFramework.TRANSFORMERS
+            ):
+                vlm_model_hf = HuggingFaceTransformersVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=stage_2_options,
+                )
+                self.build_pipe = [
+                    TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
+                ]
+            else:
+                raise ValueError(
+                    f"Could not instantiate the right type of VLM pipeline: {stage_2_options}"
+                )

         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
@@ -213,10 +213,10 @@
       "prov": [
         {
           "bbox": [
-            139.66741943359375,
+            139.6674041748047,
             322.5054626464844,
             475.00927734375,
-            454.45458984375
+            454.4546203613281
           ],
           "page": 1,
           "span": [
@@ -2705,7 +2705,7 @@
         "b": 102.78223000000003,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9373534917831421,
+      "confidence": 0.9373533129692078,
       "cells": [
         {
           "index": 0,

@@ -2745,7 +2745,7 @@
         "b": 102.78223000000003,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.8858680725097656,
+      "confidence": 0.8858679533004761,
       "cells": [
         {
           "index": 1,

@@ -2785,7 +2785,7 @@
         "b": 152.90697999999998,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9806433916091919,
+      "confidence": 0.9806435108184814,
       "cells": [
         {
           "index": 2,

@@ -2940,7 +2940,7 @@
         "b": 255.42400999999995,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.98504239320755,
+      "confidence": 0.9850425124168396,
       "cells": [
         {
           "index": 7,

@@ -3155,7 +3155,7 @@
         "b": 327.98218,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9591909050941467,
+      "confidence": 0.9591907262802124,
       "cells": [
         {
           "index": 15,
@@ -3339,8 +3339,8 @@
       "id": 0,
       "label": "table",
       "bbox": {
-        "l": 139.66741943359375,
-        "t": 337.54541015625,
+        "l": 139.6674041748047,
+        "t": 337.5453796386719,
         "r": 475.00927734375,
         "b": 469.4945373535156,
         "coord_origin": "TOPLEFT"
@@ -7846,7 +7846,7 @@
         "b": 518.17419,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9589294195175171,
+      "confidence": 0.9589295387268066,
      "cells": [
         {
           "index": 91,

@@ -7911,7 +7911,7 @@
         "b": 618.3,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9849975109100342,
+      "confidence": 0.9849976301193237,
       "cells": [
         {
           "index": 93,
@@ -8243,8 +8243,8 @@
       "id": 0,
      "label": "table",
       "bbox": {
-        "l": 139.66741943359375,
-        "t": 337.54541015625,
+        "l": 139.6674041748047,
+        "t": 337.5453796386719,
         "r": 475.00927734375,
         "b": 469.4945373535156,
         "coord_origin": "TOPLEFT"
@@ -13641,7 +13641,7 @@
         "b": 102.78223000000003,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9373534917831421,
+      "confidence": 0.9373533129692078,
       "cells": [
         {
           "index": 0,

@@ -13687,7 +13687,7 @@
         "b": 102.78223000000003,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.8858680725097656,
+      "confidence": 0.8858679533004761,
       "cells": [
         {
           "index": 1,

@@ -13733,7 +13733,7 @@
         "b": 152.90697999999998,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9806433916091919,
+      "confidence": 0.9806435108184814,
       "cells": [
         {
           "index": 2,

@@ -13900,7 +13900,7 @@
         "b": 255.42400999999995,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.98504239320755,
+      "confidence": 0.9850425124168396,
       "cells": [
         {
           "index": 7,

@@ -14121,7 +14121,7 @@
         "b": 327.98218,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9591909050941467,
+      "confidence": 0.9591907262802124,
       "cells": [
         {
           "index": 15,
@@ -14311,8 +14311,8 @@
       "id": 0,
       "label": "table",
       "bbox": {
-        "l": 139.66741943359375,
-        "t": 337.54541015625,
+        "l": 139.6674041748047,
+        "t": 337.5453796386719,
         "r": 475.00927734375,
         "b": 469.4945373535156,
         "coord_origin": "TOPLEFT"
@@ -19701,7 +19701,7 @@
         "b": 518.17419,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9589294195175171,
+      "confidence": 0.9589295387268066,
       "cells": [
         {
           "index": 91,

@@ -19772,7 +19772,7 @@
         "b": 618.3,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9849975109100342,
+      "confidence": 0.9849976301193237,
       "cells": [
         {
           "index": 93,
@@ -20116,7 +20116,7 @@
         "b": 152.90697999999998,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9806433916091919,
+      "confidence": 0.9806435108184814,
       "cells": [
         {
           "index": 2,

@@ -20283,7 +20283,7 @@
         "b": 255.42400999999995,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.98504239320755,
+      "confidence": 0.9850425124168396,
       "cells": [
         {
           "index": 7,

@@ -20504,7 +20504,7 @@
         "b": 327.98218,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9591909050941467,
+      "confidence": 0.9591907262802124,
       "cells": [
         {
           "index": 15,
@@ -20694,8 +20694,8 @@
       "id": 0,
       "label": "table",
       "bbox": {
-        "l": 139.66741943359375,
-        "t": 337.54541015625,
+        "l": 139.6674041748047,
+        "t": 337.5453796386719,
         "r": 475.00927734375,
         "b": 469.4945373535156,
         "coord_origin": "TOPLEFT"
@@ -26084,7 +26084,7 @@
         "b": 518.17419,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9589294195175171,
+      "confidence": 0.9589295387268066,
       "cells": [
         {
           "index": 91,

@@ -26155,7 +26155,7 @@
         "b": 618.3,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9849975109100342,
+      "confidence": 0.9849976301193237,
       "cells": [
         {
           "index": 93,
@@ -26499,7 +26499,7 @@
         "b": 102.78223000000003,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.9373534917831421,
+      "confidence": 0.9373533129692078,
       "cells": [
         {
           "index": 0,

@@ -26545,7 +26545,7 @@
         "b": 102.78223000000003,
         "coord_origin": "TOPLEFT"
       },
-      "confidence": 0.8858680725097656,
+      "confidence": 0.8858679533004761,
       "cells": [
         {
           "index": 1,