mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
need to get Phi4 working again ...
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
15a8f328c2
commit
e2c95d09bc
@ -73,21 +73,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
|
|||||||
artifacts_path,
|
artifacts_path,
|
||||||
trust_remote_code=self.trust_remote_code,
|
trust_remote_code=self.trust_remote_code,
|
||||||
)
|
)
|
||||||
if not self.param_quantized:
|
if self.param_quantized:
|
||||||
self.vlm_model = AutoModelForCausalLM.from_pretrained(
|
print("using quantized")
|
||||||
artifacts_path,
|
|
||||||
device_map=self.device,
|
|
||||||
torch_dtype=torch.bfloat16,
|
|
||||||
_attn_implementation=(
|
|
||||||
"flash_attention_2"
|
|
||||||
if self.device.startswith("cuda")
|
|
||||||
and accelerator_options.cuda_use_flash_attention2
|
|
||||||
else "eager"
|
|
||||||
),
|
|
||||||
trust_remote_code=self.trust_remote_code,
|
|
||||||
).to(self.device)
|
|
||||||
|
|
||||||
else:
|
|
||||||
self.vlm_model = AutoModelForCausalLM.from_pretrained(
|
self.vlm_model = AutoModelForCausalLM.from_pretrained(
|
||||||
artifacts_path,
|
artifacts_path,
|
||||||
device_map=self.device,
|
device_map=self.device,
|
||||||
@ -100,7 +87,21 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
|
|||||||
else "eager"
|
else "eager"
|
||||||
),
|
),
|
||||||
trust_remote_code=self.trust_remote_code,
|
trust_remote_code=self.trust_remote_code,
|
||||||
).to(self.device)
|
) # .to(self.device)
|
||||||
|
else:
|
||||||
|
print("using original")
|
||||||
|
self.vlm_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
artifacts_path,
|
||||||
|
device_map=self.device,
|
||||||
|
torch_dtype="auto", # torch.bfloat16,
|
||||||
|
_attn_implementation=(
|
||||||
|
"flash_attention_2"
|
||||||
|
if self.device.startswith("cuda")
|
||||||
|
and accelerator_options.cuda_use_flash_attention2
|
||||||
|
else "eager"
|
||||||
|
),
|
||||||
|
trust_remote_code=self.trust_remote_code,
|
||||||
|
) # .to(self.device)
|
||||||
|
|
||||||
model_path = artifacts_path
|
model_path = artifacts_path
|
||||||
|
|
||||||
@ -118,7 +119,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
|
|||||||
with TimeRecorder(conv_res, "vlm"):
|
with TimeRecorder(conv_res, "vlm"):
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
|
|
||||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
hi_res_image = page.get_image(scale=2) # self.vlm_options.scale)
|
||||||
|
# hi_res_image.show()
|
||||||
|
|
||||||
if hi_res_image is not None:
|
if hi_res_image is not None:
|
||||||
im_width, im_height = hi_res_image.size
|
im_width, im_height = hi_res_image.size
|
||||||
@ -129,6 +131,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
|
|||||||
|
|
||||||
# Define prompt structure
|
# Define prompt structure
|
||||||
prompt = self.formulate_prompt()
|
prompt = self.formulate_prompt()
|
||||||
|
print(f"prompt: '{prompt}', size: {im_width}, {im_height}")
|
||||||
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=prompt, images=hi_res_image, return_tensors="pt"
|
text=prompt, images=hi_res_image, return_tensors="pt"
|
||||||
@ -138,8 +141,8 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
generate_ids = self.vlm_model.generate(
|
generate_ids = self.vlm_model.generate(
|
||||||
**inputs,
|
**inputs,
|
||||||
max_new_tokens=self.max_new_tokens,
|
max_new_tokens=4096, # self.max_new_tokens,
|
||||||
use_cache=self.use_cache, # Enables KV caching which can improve performance
|
# use_cache=self.use_cache, # Enables KV caching which can improve performance
|
||||||
generation_config=self.generation_config,
|
generation_config=self.generation_config,
|
||||||
num_logits_to_keep=1,
|
num_logits_to_keep=1,
|
||||||
)
|
)
|
||||||
|
@ -6,13 +6,11 @@ from docling_core.types.doc import DocItemLabel, ImageRefMode
|
|||||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_model_specializations import (
|
||||||
HuggingFaceVlmOptions,
|
HuggingFaceVlmOptions,
|
||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
VlmPipelineOptions,
|
|
||||||
granite_vision_vlm_conversion_options,
|
granite_vision_vlm_conversion_options,
|
||||||
granite_vision_vlm_mlx_conversion_options,
|
|
||||||
granite_vision_vlm_ollama_conversion_options,
|
granite_vision_vlm_ollama_conversion_options,
|
||||||
phi_vlm_conversion_options,
|
phi_vlm_conversion_options,
|
||||||
pixtral_12b_vlm_conversion_options,
|
pixtral_12b_vlm_conversion_options,
|
||||||
@ -21,6 +19,9 @@ from docling.datamodel.pipeline_options import (
|
|||||||
smoldocling_vlm_conversion_options,
|
smoldocling_vlm_conversion_options,
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
smoldocling_vlm_mlx_conversion_options,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
VlmPipelineOptions,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
@ -49,6 +50,7 @@ pipeline_options.generate_page_images = True
|
|||||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||||
|
|
||||||
pipeline_options.vlm_options = phi_vlm_conversion_options
|
pipeline_options.vlm_options = phi_vlm_conversion_options
|
||||||
|
# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
|
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
|
||||||
|
Loading…
Reference in New Issue
Block a user