Mirror of https://github.com/DS4SD/docling.git
finalising last points for vlms support
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in: parent fc61258273 · commit d41b856961
@@ -64,6 +64,7 @@ class ApiVlmOptions(BaseVlmOptions):
     params: Dict[str, Any] = {}
     scale: float = 2.0
     timeout: float = 60
+    concurrency: int = 1
     response_format: ResponseFormat
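The hunk counts (6 -> 7 lines) show a single added field; from context this reads as the new `concurrency` option. A minimal sketch of wiring these options to a local OpenAI-compatible endpoint; the `url`, `prompt`, and model name below are illustrative assumptions, not part of this diff:

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

# Sketch only: endpoint URL, params, and prompt are assumed values.
vlm_options = ApiVlmOptions(
    url="http://localhost:11434/v1/chat/completions",  # assumed local endpoint
    params={"model": "granite-vision"},  # hypothetical model identifier
    prompt="Convert this page to markdown.",
    scale=2.0,
    timeout=60,
    concurrency=1,
    response_format=ResponseFormat.MARKDOWN,
)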
@@ -186,6 +186,11 @@ class DocumentConverter:
         Tuple[Type[BasePipeline], str], BasePipeline
     ] = {}

+    def _get_initialized_pipelines(self) -> dict[
+        tuple[Type[BasePipeline], str], BasePipeline
+    ]:
+        return self.initialized_pipelines
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
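The hunk is truncated right after `options_str`. A plausible completion (a sketch, not necessarily the committed code) digests the serialized options so that the `(pipeline class, options hash)` tuple can key the pipeline cache:

import hashlib

def _get_pipeline_options_hash(pipeline_options) -> str:
    """Generate a hash of pipeline options to use as part of the cache key."""
    options_str = str(pipeline_options.model_dump())
    # Stable digest of the serialized options; the md5 choice is an assumption.
    return hashlib.md5(options_str.encode("utf-8")).hexdigest()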
@@ -71,7 +71,7 @@ class HuggingFaceMlxModel(BasePageModel):
         if not page._backend.is_valid():
             yield page
         else:
-            with TimeRecorder(conv_res, "vlm"):
+            with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                 assert page.size is not None

                 hi_res_image = page.get_image(scale=self.vlm_options.scale)
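With the timer scope now named per model, per-stage timings can be read back from the conversion result when profiling is enabled. A sketch (the source path is reused from this commit's example; the key name is whatever `TimeRecorder` registered, e.g. a `vlm-mlx-<repo_id>` scope):

from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

settings.debug.profile_pipeline_timings = True  # enable timing collection

converter = DocumentConverter()
res = converter.convert("tests/data/pdf/2305.03393v1-pg9.pdf")
for name, item in res.timings.items():
    # one accumulated entry per page for scoped recorders
    print(name, sum(item.times))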
@@ -124,7 +124,9 @@ class HuggingFaceMlxModel(BasePageModel):
                                     logprob=token.logprobs[0, token.token],
                                 )
                             )
+                        else:
+                            _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")

                         output += token.text
                         if "</doctag>" in token.text:
                             break
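For context on the new shape guard: each streamed token is expected to carry a 1 x vocab logprobs array, from which the chosen token's logprob is indexed; any other shape now logs a warning instead of indexing blindly. Illustrative only, with a numpy stand-in for the mlx-vlm token:

import numpy as np

token_id = 42
logprobs = np.zeros((1, 50000))  # expected [1, vocab] shape (assumed)

if len(logprobs.shape) == 2 and logprobs.shape[0] == 1:
    logprob = float(logprobs[0, token_id])
else:
    print(f"incompatible shape for logprobs: {logprobs.shape}")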
@@ -141,7 +141,10 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
                 _log.debug(
                     f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                 )
-                page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                page.predictions.vlm_response = VlmPrediction(
+                    text=page_tags,
+                    generation_time=generation_time,
+                )

             yield page
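For this constructor call to validate, `VlmPrediction` presumably gains a `generation_time` field in this commit. A minimal sketch of such a model; the default value is an assumption:

from pydantic import BaseModel

class VlmPrediction(BaseModel):
    text: str = ""
    generation_time: float = -1.0  # seconds; default assumed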
@@ -1,3 +1,4 @@
+import re
 import logging
 from io import BytesIO
 from pathlib import Path
@@ -19,6 +20,14 @@ from docling.datamodel.pipeline_model_specializations import (
     InferenceFramework,
     ResponseFormat,
 )
+from docling_core.types.doc.base import (
+    Size,
+    BoundingBox,
+)
+from docling_core.types.doc import (
+    ProvenanceItem,
+    DoclingDocument
+)
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
@@ -237,6 +246,48 @@ class VlmPipeline(PaginatedPipeline):

         return conv_res

+    def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
+        doctags_list = []
+        image_list = []
+        for page in conv_res.pages:
+            predicted_doctags = ""
+            img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
+            if page.predictions.vlm_response:
+                predicted_doctags = page.predictions.vlm_response.text
+            if page.image:
+                img = page.image
+            image_list.append(img)
+            doctags_list.append(predicted_doctags)
+
+        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+            doctags_list_c, image_list_c
+        )
+        conv_res.document.load_from_doctags(doctags_doc)
+
+        # If forced backend text, replace model predicted text with backend one
+        if page.size:
+            if self.force_backend_text:
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if (not isinstance(element, TextItem)
+                        or len(element.prov) == 0
+                    ):
+                        continue
+                    crop_bbox = (
+                        element.prov[0]
+                        .bbox.scaled(scale=scale)
+                        .to_top_left_origin(
+                            page_height=page.size.height * scale
+                        )
+                    )
+                    txt = self.extract_text_from_backend(page, crop_bbox)
+                    element.text = txt
+                    element.orig = txt
+
+
+    """
     def _turn_md_into_doc(self, conv_res):
         predicted_text = ""
         for pg_idx, page in enumerate(conv_res.pages):
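The doctags path above pairs each page's predicted doctags string with its page image and loads the pairs into the document. Note also that the `force_backend_text` branch reads `page` after the loop, i.e. the last page's size. A standalone sketch of the doctags round trip, mirroring the instance-style `load_from_doctags` call used in the diff (the doctags snippet and image are stand-ins):

from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image as PILImage

doctags = "<doctag><text><loc_10><loc_10><loc_90><loc_20>Hello world</text></doctag>"
img = PILImage.new("RGB", (640, 480), "white")

doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [img])
doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)
print(doc.export_to_markdown())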
@@ -254,7 +305,85 @@ class VlmPipeline(PaginatedPipeline):
             path_or_stream=response_bytes,
         )
         return backend.convert()
+    """

+    def _turn_md_into_doc(self, conv_res):
+
+        def _extract_markdown_code(text):
+            """
+            Extracts text from markdown code blocks (enclosed in triple backticks).
+            If no code blocks are found, returns the original text.
+
+            Args:
+                text (str): Input text that may contain markdown code blocks
+
+            Returns:
+                str: Extracted code if code blocks exist, otherwise original text
+            """
+            # Regex pattern to match content between triple backticks
+            # This handles multiline content and optional language specifier
+            pattern = r'^```(?:\w*\n)?(.*?)```(\n)*$'
+
+            # Search for matches with DOTALL flag to match across multiple lines
+            matches = re.findall(pattern, text, re.DOTALL)
+
+            # Search with DOTALL flag to match across multiple lines
+            mtch = re.search(pattern, text, re.DOTALL)
+
+            if mtch:
+                # Return only the content of the first capturing group
+                return mtch.group(1)
+            else:
+                # No code blocks found, return original text
+                return text
+
+        for pg_idx, page in enumerate(conv_res.pages):
+
+            page_no = pg_idx + 1  # FIXME: might be incorrect
+
+            predicted_text = ""
+            if page.predictions.vlm_response:
+                predicted_text = page.predictions.vlm_response.text + "\n\n"
+
+            predicted_text = _extract_markdown_code(text=predicted_text)
+
+            response_bytes = BytesIO(predicted_text.encode("utf8"))
+            out_doc = InputDocument(
+                path_or_stream=response_bytes,
+                filename=conv_res.input.file.name,
+                format=InputFormat.MD,
+                backend=MarkdownDocumentBackend,
+            )
+            backend = MarkdownDocumentBackend(
+                in_doc=out_doc,
+                path_or_stream=response_bytes,
+            )
+            page_doc = backend.convert()
+
+            if page.image is not None:
+                pg_width = page.image.width
+                pg_height = page.image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+
+            conv_res.document.add_page(
+                page_no=page_no,
+                size=Size(width=pg_width, height=pg_height),
+                image=ImageRef.from_pil(image=page.image, dpi=72) if page.image else None,
+            )
+
+            for item, level in page_doc.iterate_items():
+                item.prov = [
+                    ProvenanceItem(page_no=pg_idx + 1,
+                                   bbox=BoundingBox(t=0.0, b=0.0, l=0.0, r=0.0),
+                                   charspan=[0, 0])
+                ]
+                conv_res.document.append_child_item(child=item)
+                print(item)
+
+        return conv_res.document

     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()
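What `_extract_markdown_code` does, by example; the behavior follows from the regex above, and the inputs are illustrative. (As committed, the `matches = re.findall(...)` result is never used; only the `re.search` path matters.)

import re

def _extract_markdown_code(text):
    pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
    mtch = re.search(pattern, text, re.DOTALL)
    return mtch.group(1) if mtch else text

fenced = "```markdown\n# Title\n\nSome text.\n```\n"
print(_extract_markdown_code(fenced))    # -> "# Title\n\nSome text.\n"
print(_extract_markdown_code("plain"))   # no fences -> returned unchanged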
@@ -25,10 +25,7 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

-sources = [
-    # "tests/data/2305.03393v1-pg9-img.png",
-    "tests/data/pdf/2305.03393v1-pg9.pdf",
-]
+from tabulate import tabulate

 ## Use experimental VlmPipeline
 pipeline_options = VlmPipelineOptions()
@@ -104,75 +101,120 @@ qwen_vlm_conversion_options = HuggingFaceVlmOptions(
 pipeline_options.vlm_options = qwen_vlm_conversion_options
 """

-## Set up pipeline for PDF or image inputs
-converter = DocumentConverter(
-    format_options={
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_cls=VlmPipeline,
-            pipeline_options=pipeline_options,
-        ),
-        InputFormat.IMAGE: PdfFormatOption(
-            pipeline_cls=VlmPipeline,
-            pipeline_options=pipeline_options,
-        ),
-    },
-)
-
-for source in sources:
-    start_time = time.time()
-    print("================================================")
-    print(f"Processing... {source}")
-    print("================================================")
-    print("")
-
-    res = converter.convert(source)
-
-    print("")
-    # print(res.document.export_to_markdown())
-
-    for i, page in enumerate(res.pages):
-        print("")
-        print(
-            f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format}:"
-        )
-        print(page.predictions.vlm_response.text)
-        print(" ---------- ")
-
-    print("===== Final output of the converted document =======")
-
-    with (out_path / f"{fname}.json").open("w") as fp:
-        fp.write(json.dumps(res.document.export_to_dict()))
-
-    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-    fname = f"{model_id}-{res.input.file.stem}"
-
-    res.document.save_as_json(
-        out_path / f"{fname}.json",
-        image_mode=ImageRefMode.PLACEHOLDER,
-    )
-    print(f" => produced {out_path / fname}.json")
-
-    res.document.save_as_markdown(
-        out_path / f"{fname}.md",
-        image_mode=ImageRefMode.PLACEHOLDER,
-    )
-    print(f" => produced {out_path / fname}.md")
-
-    res.document.save_as_html(
-        out_path / f"{fname}.html",
-        image_mode=ImageRefMode.EMBEDDED,
-        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
-        split_page_view=True,
-    )
-    print(f" => produced {out_path / fname}.html")
-
-    pg_num = res.document.num_pages()
-    print("")
-    inference_time = time.time() - start_time
-    print(
-        f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
-    )
-    print("====================================================")
+def convert(sources: list[Path], converter):
+    for source in sources:
+        #start_time = time.time()
+        print("================================================")
+        print(f"Processing... {source}")
+        print("================================================")
+        print("")
+
+        out_path = Path("scratch")
+        out_path.mkdir(parents=True, exist_ok=True)
+        res = converter.convert(source)
+
+        print("")
+        # print(res.document.export_to_markdown())
+
+        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
+        framework = pipeline_options.vlm_options.inference_framework
+        fname = f"{res.input.file.stem}-{model_id}-{framework}"
+
+        inference_time = 0.0
+        for i, page in enumerate(res.pages):
+            inference_time += page.predictions.vlm_response.generation_time
+            print("")
+            print(
+                f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:"
+            )
+            print(page.predictions.vlm_response.text)
+            print(" ---------- ")
+
+        print("===== Final output of the converted document =======")
+
+        with (out_path / f"{fname}.json").open("w") as fp:
+            fp.write(json.dumps(res.document.export_to_dict()))
+
+        res.document.save_as_json(
+            out_path / f"{fname}.json",
+            image_mode=ImageRefMode.PLACEHOLDER,
+        )
+        print(f" => produced {out_path / fname}.json")
+
+        res.document.save_as_markdown(
+            out_path / f"{fname}.md",
+            image_mode=ImageRefMode.PLACEHOLDER,
+        )
+        print(f" => produced {out_path / fname}.md")
+
+        res.document.save_as_html(
+            out_path / f"{fname}.html",
+            image_mode=ImageRefMode.EMBEDDED,
+            labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
+            split_page_view=True,
+        )
+        print(f" => produced {out_path / fname}.html")
+
+        pg_num = res.document.num_pages()
+        print("")
+        print(
+            f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
+        )
+        print("====================================================")
+
+        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
+        return [source, model_id, framework, pg_num, inference_time, ]
+
+
+if __name__ == "__main__":
+
+    sources = [
+        # "tests/data/2305.03393v1-pg9-img.png",
+        "tests/data/pdf/2305.03393v1-pg9.pdf",
+    ]
+
+    out_path = Path("scratch")
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    ## Use VlmPipeline
+    pipeline_options = VlmPipelineOptions()
+
+    # If force_backend_text = True, text from backend will be used instead of generated text
+    pipeline_options.force_backend_text = False
+    pipeline_options.generate_page_images = True
+
+    ## On GPU systems, enable flash_attention_2 with CUDA:
+    # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
+    # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
+
+    rows = []
+    for vlm_options in [
+        # smoldocling_vlm_conversion_options, \
+        smoldocling_vlm_mlx_conversion_options, \
+        granite_vision_vlm_conversion_options, \
+        # phi_vlm_conversion_options, \
+        qwen25_vl_3b_vlm_mlx_conversion_options, \
+        pixtral_12b_vlm_mlx_conversion_options,
+    ]:
+        pipeline_options.vlm_options = vlm_options
+
+        ## Set up pipeline for PDF or image inputs
+        converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_cls=VlmPipeline,
+                    pipeline_options=pipeline_options,
+                ),
+                InputFormat.IMAGE: PdfFormatOption(
+                    pipeline_cls=VlmPipeline,
+                    pipeline_options=pipeline_options,
+                ),
+            },
+        )
+
+        row = convert(sources=sources, converter=converter)
+        print("pipelines: \n", converter._get_initialized_pipelines())
+
+        rows.append(row)
+
+    print(tabulate(rows))
+
+    print("see if memory gets released ...")
+    time.sleep(10)
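Each `convert()` call returns one summary row, so the final `tabulate(rows)` prints a small comparison table across the selected models. Illustrative shape only; the values and header labels below are made up:

from tabulate import tabulate

rows = [
    ["tests/data/pdf/2305.03393v1-pg9.pdf", "ds4sd_SmolDocling-256M-preview-mlx-bf16",
     "InferenceFramework.MLX", 1, 12.3],
]
print(tabulate(rows, headers=["source", "model_id", "framework", "pages", "inference_time [s]"]))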