From c5873f2496d1241a92d76b01ee34343429085b4c Mon Sep 17 00:00:00 2001
From: Christoph Auer
Date: Wed, 26 Feb 2025 12:46:41 +0100
Subject: [PATCH] chore: clean up code and comments

Signed-off-by: Christoph Auer
---
 docling/datamodel/pipeline_options.py |  2 ++
 docling/models/hf_vlm_model.py        |  2 +-
 docling/pipeline/vlm_pipeline.py      | 36 +++++++++++++++------------
 docs/examples/minimal_vlm_pipeline.py | 23 ++++++++++-------
 pyproject.toml                        |  3 ---
 5 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index e537894f..3a55ecfc 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -333,6 +333,8 @@ class PaginatedPipelineOptions(PipelineOptions):
 
 class VlmPipelineOptions(PaginatedPipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
+
+    generate_page_images: bool = True
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
index d6021187..f0280cc6 100644
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -116,7 +116,7 @@ class HuggingFaceVlmModel(BasePageModel):
             if not page._backend.is_valid():
                 yield page
             else:
-                with TimeRecorder(conv_res, "smolvlm"):
+                with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=2.0)  # 144dpi
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index e7cd2d59..cdf91795 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -51,7 +51,7 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_backend = True
 
         warnings.warn(
-            "This API is currently experimental and may change in upcoming versions without notice.",
+            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
             category=UserWarning,
             stacklevel=2,
         )
@@ -70,18 +70,18 @@ class VlmPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
 
-        # force_backend_text = False - use text that is coming from SmolDocling
-        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss
-        self.force_backend_text = pipeline_options.force_backend_text
-
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
+        # force_backend_text = False - use text that is coming from VLM response
+        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
+        self.force_backend_text = (
+            pipeline_options.force_backend_text
+            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
         )
 
+        self.keep_images = self.pipeline_options.generate_page_images
+
         self.build_pipe = [
             HuggingFaceVlmModel(
-                enabled=True,
+                enabled=True,  # must be always enabled for this pipeline to make sense.
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
                 vlm_options=self.pipeline_options.vlm_options,
@@ -397,6 +397,7 @@ class VlmPipeline(PaginatedPipeline):
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text
                 image = page.image
+
                 page_no = pg_idx + 1
 
                 bounding_boxes = []
@@ -448,12 +449,13 @@
                     text_caption_content = extract_inner_text(full_chunk)
                     if image:
                         if bbox:
-                            width, height = image.size
+                            im_width, im_height = image.size
+
                             crop_box = (
-                                int(bbox.l * width),
-                                int(bbox.t * height),
-                                int(bbox.r * width),
-                                int(bbox.b * height),
+                                int(bbox.l * im_width),
+                                int(bbox.t * im_height),
+                                int(bbox.r * im_width),
+                                int(bbox.b * im_height),
                             )
                             cropped_image = image.crop(crop_box)
                             pic = doc.add_picture(
@@ -461,7 +463,9 @@
                                 image=ImageRef.from_pil(image=cropped_image, dpi=72),
                                 prov=(
                                     ProvenanceItem(
-                                        bbox=bbox, charspan=(0, 0), page_no=page_no
+                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                        charspan=(0, 0),
+                                        page_no=page_no,
                                     )
                                 ),
                             )
@@ -501,7 +505,7 @@
                             text=text_content,
                             prov=(
                                 ProvenanceItem(
-                                    bbox=bbox,
+                                    bbox=bbox.resize_by_scale(pg_width, pg_height),
                                     charspan=(0, len(text_content)),
                                     page_no=page_no,
                                 )
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 7c9913e9..424df9cb 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -11,32 +11,34 @@ from docling.datamodel.pipeline_options import (
     granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
 )
+from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 sources = [
-    # "https://arxiv.org/pdf/2408.09869",
     "tests/data/2305.03393v1-pg9-img.png",
-    # "tests/data/2305.03393v1-pg9.pdf",
 ]
 
-pipeline_options = VlmPipelineOptions()  # artifacts_path="~/local_model_artifacts/"
-pipeline_options.generate_page_images = True
+settings.debug.profile_pipeline_timings = True
+
+## Use experimental VlmPipeline
+pipeline_options = VlmPipelineOptions()
 
 # If force_backend_text = True, text from backend will be used instead of generated text
 pipeline_options.force_backend_text = False
 
-## Enable flash_attention_2 with CUDA:
+## On GPU systems, enable flash_attention_2 with CUDA:
 # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
 # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
 
+## Pick a VLM model. We choose SmolDocling-256M by default
 pipeline_options.vlm_options = smoldocling_vlm_conversion_options
 
-## Choose alternative VLM models:
+## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 
 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 
+## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={
         InputFormat.PDF: PdfFormatOption(
@@ -68,6 +70,12 @@
     print("")
     print(res.document.export_to_markdown())
 
+    print("------------------------------------------------")
+    print("Timings:")
+    print("------------------------------------------------")
+    print("")
+    print(res.timings)
+
     for page in res.pages:
         print("")
         print("Predicted page in DOCTAGS:")
@@ -82,9 +90,6 @@
     with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
         fp.write(json.dumps(res.document.export_to_dict()))
 
-    with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
-        fp.write(yaml.safe_dump(res.document.export_to_dict()))
-
     pg_num = res.document.num_pages()
 
     print("")
diff --git a/pyproject.toml b/pyproject.toml
index a22ee024..647d586f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,9 +69,6 @@ accelerate = [
 pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"
 
-# transformers = "^4.47.1"
-# accelerate = "^1.2.1"
-
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
 pytest = "^7.2.2"