chore: clean up code and comments

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Author: Christoph Auer
Date: 2025-02-26 12:46:41 +01:00
parent f994654918
commit c5873f2496
5 changed files with 37 additions and 29 deletions

docling/datamodel/pipeline_options.py

@@ -333,6 +333,8 @@ class PaginatedPipelineOptions(PipelineOptions):
 class VlmPipelineOptions(PaginatedPipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
+    generate_page_images: bool = True
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
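The new default means VLM conversions keep rendered page images around, which the pipeline later needs for cropping picture elements. A minimal sketch of the option surface after this change (attribute names taken from the diff, defaults as shown above):

from docling.datamodel.pipeline_options import VlmPipelineOptions

opts = VlmPipelineOptions()
assert opts.generate_page_images is True   # new default from this commit
assert opts.force_backend_text is False    # unchanged default
opts.artifacts_path = None                 # optional local model folder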

docling/models/hf_vlm_model.py

@@ -116,7 +116,7 @@ class HuggingFaceVlmModel(BasePageModel):
             if not page._backend.is_valid():
                 yield page
             else:
-                with TimeRecorder(conv_res, "smolvlm"):
+                with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
                     hi_res_image = page.get_image(scale=2.0)  # 144dpi
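Renaming the timing scope from "smolvlm" to "vlm" keeps the profiling key model-agnostic now that this model class serves more VLMs than SmolDocling. With pipeline profiling switched on (as the example file below does), the entry surfaces on the conversion result; a hedged sketch, assuming a converter wired to the VLM pipeline:

from docling.datamodel.settings import settings

settings.debug.profile_pipeline_timings = True
# After running a conversion (converter/source as in the example file below):
# res = converter.convert(source)
# res.timings["vlm"]  # per-page time recorded by TimeRecorder(conv_res, "vlm")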

docling/pipeline/vlm_pipeline.py

@@ -51,7 +51,7 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_backend = True
         warnings.warn(
-            "This API is currently experimental and may change in upcoming versions without notice.",
+            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
             category=UserWarning,
             stacklevel=2,
         )
@@ -70,18 +70,18 @@ class VlmPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
-        # force_backend_text = False - use text that is coming from SmolDocling
-        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss
-        self.force_backend_text = pipeline_options.force_backend_text
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
-        )
+        # force_backend_text = False - use text that is coming from VLM response
+        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
+        self.force_backend_text = (
+            pipeline_options.force_backend_text
+            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
+        )
+        self.keep_images = self.pipeline_options.generate_page_images
         self.build_pipe = [
             HuggingFaceVlmModel(
-                enabled=True,
+                enabled=True,  # must be always enabled for this pipeline to make sense.
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
                 vlm_options=self.pipeline_options.vlm_options,
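The rewritten assignment gates force_backend_text on the model's response format: looking up backend text by predicted bounding boxes only works when the VLM emits doctags, so the flag is silently dropped for plain-text or markdown models. A sketch of the effective logic, reusing the expression from the diff (that the SmolDocling default emits DOCTAGS is an assumption):

from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions

opts = VlmPipelineOptions()
opts.force_backend_text = True

# Same boolean guard as in the diff above:
effective = (
    opts.force_backend_text
    and opts.vlm_options.response_format == ResponseFormat.DOCTAGS
)
# True for the doctags-emitting SmolDocling default, False for a
# markdown-only VLM such as granite_vision:
print(effective)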
@@ -397,6 +397,7 @@ class VlmPipeline(PaginatedPipeline):
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text
             image = page.image

             page_no = pg_idx + 1
             bounding_boxes = []
@@ -448,12 +449,13 @@
                     text_caption_content = extract_inner_text(full_chunk)
                     if image:
                         if bbox:
-                            width, height = image.size
+                            im_width, im_height = image.size
                             crop_box = (
-                                int(bbox.l * width),
-                                int(bbox.t * height),
-                                int(bbox.r * width),
-                                int(bbox.b * height),
+                                int(bbox.l * im_width),
+                                int(bbox.t * im_height),
+                                int(bbox.r * im_width),
+                                int(bbox.b * im_height),
                             )
                             cropped_image = image.crop(crop_box)
                             pic = doc.add_picture(
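The im_ prefix disambiguates image pixels from the page-coordinate width/height used a few lines later: the predicted bbox is normalized to 0..1, so it must be multiplied by the image size before PIL can crop. A self-contained sketch of the same arithmetic (image size and bbox values are made up):

from PIL import Image

image = Image.new("RGB", (1024, 1448))      # stand-in for page.image
l, t, r, b = 0.1, 0.2, 0.6, 0.5             # normalized bbox from the VLM
im_width, im_height = image.size
crop_box = (
    int(l * im_width),                      # 102
    int(t * im_height),                     # 289
    int(r * im_width),                      # 614
    int(b * im_height),                     # 724
)
cropped_image = image.crop(crop_box)        # pixel-space crop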
@@ -461,7 +463,9 @@
                                 image=ImageRef.from_pil(image=cropped_image, dpi=72),
                                 prov=(
                                     ProvenanceItem(
-                                        bbox=bbox, charspan=(0, 0), page_no=page_no
+                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                        charspan=(0, 0),
+                                        page_no=page_no,
                                     )
                                 ),
                             )
@@ -501,7 +505,7 @@
                             text=text_content,
                             prov=(
                                 ProvenanceItem(
-                                    bbox=bbox,
+                                    bbox=bbox.resize_by_scale(pg_width, pg_height),
                                     charspan=(0, len(text_content)),
                                     page_no=page_no,
                                 )
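Both provenance fixes scale the normalized bbox into page coordinates before storing it, instead of recording raw 0..1 values. resize_by_scale is the docling-core BoundingBox helper the diff relies on; a plain-tuple sketch of the equivalent arithmetic:

def resize_by_scale(bbox, x_scale, y_scale):
    # (l, t, r, b) in 0..1, scaled up to page width/height
    l, t, r, b = bbox
    return (l * x_scale, t * y_scale, r * x_scale, b * y_scale)

pg_width, pg_height = 612.0, 792.0   # US Letter in PDF points, for illustration
print(resize_by_scale((0.1, 0.2, 0.6, 0.5), pg_width, pg_height))
# (61.2, 158.4, 367.2, 396.0)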

docs/examples/minimal_vlm_pipeline.py

@@ -11,32 +11,34 @@ from docling.datamodel.pipeline_options import (
     granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
 )
+from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

 sources = [
-    # "https://arxiv.org/pdf/2408.09869",
     "tests/data/2305.03393v1-pg9-img.png",
-    # "tests/data/2305.03393v1-pg9.pdf",
 ]

-pipeline_options = VlmPipelineOptions()  # artifacts_path="~/local_model_artifacts/"
-pipeline_options.generate_page_images = True
+settings.debug.profile_pipeline_timings = True
+
+## Use experimental VlmPipeline
+pipeline_options = VlmPipelineOptions()
 # If force_backend_text = True, text from backend will be used instead of generated text
 pipeline_options.force_backend_text = False

-## Enable flash_attention_2 with CUDA:
+## On GPU systems, enable flash_attention_2 with CUDA:
 # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
 # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True

+## Pick a VLM model. We choose SmolDocling-256M by default
 pipeline_options.vlm_options = smoldocling_vlm_conversion_options

-## Choose alternative VLM models:
+## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

+## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={
         InputFormat.PDF: PdfFormatOption(
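The hunk breaks off inside the converter setup. For orientation, wiring the experimental pipeline in typically looks like the sketch below; pipeline_cls and pipeline_options are real PdfFormatOption parameters, while the exact mapping here (including routing image inputs through the same PDF pipeline) is an assumption based on the imports and sources above:

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,  # the VlmPipelineOptions built above
        ),
        InputFormat.IMAGE: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)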
@@ -68,6 +70,12 @@ for source in sources:
     print("")
     print(res.document.export_to_markdown())

+    print("------------------------------------------------")
+    print("Timings:")
+    print("------------------------------------------------")
+    print("")
+    print(res.timings)
+
     for page in res.pages:
         print("")
         print("Predicted page in DOCTAGS:")
@@ -82,9 +90,6 @@ for source in sources:
     with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
         fp.write(json.dumps(res.document.export_to_dict()))

-    with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
-        fp.write(yaml.safe_dump(res.document.export_to_dict()))
-
     pg_num = res.document.num_pages()
     print("")

pyproject.toml

@@ -69,9 +69,6 @@ accelerate = [
 pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"

-# transformers = "^4.47.1"
-# accelerate = "^1.2.1"
-
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
 pytest = "^7.2.2"