diff --git a/docling/models/smol_docling_model.py b/docling/models/smol_docling_model.py index ca8bb381..72113a96 100644 --- a/docling/models/smol_docling_model.py +++ b/docling/models/smol_docling_model.py @@ -1,23 +1,9 @@ -import argparse -import itertools import logging -import os -import re -import time -from io import BytesIO - -# import copy -# import random -# import time from pathlib import Path from typing import Iterable, List, Optional import torch from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS - -# from docling_core.types.doc import CoordOrigin, DocItemLabel -from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor -from PIL import Image, ImageDraw, ImageFont from transformers import ( # type: ignore AutoProcessor, BitsAndBytesConfig, @@ -129,6 +115,8 @@ class SmolDoclingModel(BasePageModel): )[0] generated_texts = generated_texts.replace("Assistant: ", "") page_tags = generated_texts + print("Page predictions:") + print(page_tags) page.predictions.doctags = DocTagsPrediction(tag_string=page_tags) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 7df8f15b..1278c9d1 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -56,6 +56,7 @@ class StandardPdfPipeline(PaginatedPipeline): def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) + print("------> Init Standard PDF Pipeline!") self.pipeline_options: PdfPipelineOptions artifacts_path: Optional[Path] = None diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 30ec8265..6de5385d 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -1,3 +1,4 @@ +import itertools import logging import re from io import BytesIO @@ -19,7 +20,7 @@ from docling_core.types.doc import ( TableData, TableItem, ) -from PIL import Image, ImageDraw +from PIL.Image import Image from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend @@ -38,6 +39,7 @@ class VlmPipeline(PaginatedPipeline): def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) + print("------> Init VLM Pipeline!") self.pipeline_options: PdfPipelineOptions if pipeline_options.artifacts_path is None: @@ -98,13 +100,15 @@ class VlmPipeline(PaginatedPipeline): if page.predictions.doctags is not None: document_tags += page.predictions.doctags.tag_string - image_bytes = BytesIO() - if page.image: - page.image.save(image_bytes, format="PNG") - # TODO implement this function - conv_res.document = self._turn_tags_into_doc( - document_tags, image_bytes.getvalue() - ) + conv_res.document = self._turn_tags_into_doc(document_tags, None) + """ + image_bytes = BytesIO() + if page.image: + page.image.save(image_bytes, format="PNG") + # TODO implement this function + conv_res.document = self._turn_tags_into_doc( + document_tags, image_bytes.getvalue() + ) # Generate page images in the output if self.pipeline_options.generate_page_images: @@ -114,7 +118,7 @@ class VlmPipeline(PaginatedPipeline): conv_res.document.pages[page_no].image = ImageRef.from_pil( page.image, dpi=int(72 * self.pipeline_options.images_scale) ) - + """ # Generate images of the requested element types if ( self.pipeline_options.generate_picture_images @@ -151,7 +155,7 @@ class VlmPipeline(PaginatedPipeline): # def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list): def _turn_tags_into_doc( - self, xml_content: str, image_bytes: bytes + self, xml_content: str, input_image: Optional[Image] = None ) -> DoclingDocument: def extract_text(tag_content: str) -> str: return re.sub(r"<.*?>", "", tag_content).strip() @@ -332,7 +336,7 @@ class VlmPipeline(PaginatedPipeline): doc = DoclingDocument(name="Example Document") current_group = None lines = xml_content.split("\n") - pil_image = Image.open(BytesIO(image_bytes)) + # pil_image = input_image #Image.open(BytesIO(image_bytes)) bounding_boxes = [] for line in lines: @@ -454,6 +458,7 @@ class VlmPipeline(PaginatedPipeline): if bbox: bounding_boxes.append((bbox, "yellow")) # Convert bounding box normalized to 0-100 into pixel coordinates for cropping + """ width, height = pil_image.size crop_box = ( int(bbox.l * width), @@ -461,13 +466,14 @@ class VlmPipeline(PaginatedPipeline): int(bbox.r * width), int(bbox.b * height), ) + cropped_image = pil_image.crop(crop_box) doc.add_picture( parent=current_group, image=ImageRef.from_pil(image=cropped_image, dpi=300), - # prov=[ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1)], prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1), ) + """ elif line.startswith(""): content = extract_text(line) prov_item_inst = None diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py index 308193c5..e8ad8b02 100644 --- a/docs/examples/minimal_smol_docling.py +++ b/docs/examples/minimal_smol_docling.py @@ -1,10 +1,12 @@ +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline # source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL -source = "tests/data/2305.03393v1-pg9-img.png" +# source = "tests/data/2305.03393v1-pg9-img.png" +source = "tests/data/2305.03393v1-pg9.pdf" pipeline_options = PdfPipelineOptions() pipeline_options.artifacts_path = "model_artifacts" @@ -12,13 +14,40 @@ pipeline_options.artifacts_path = "model_artifacts" converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_cls=VlmPipeline, pipeline_options=pipeline_options - ) + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + backend=DoclingParseDocumentBackend, + ), + InputFormat.IMAGE: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + backend=DoclingParseDocumentBackend, + ), } ) + +print("============") +print("starting...") +print("============") +print("") + result = converter.convert(source) + +print("------------") +print("result:") +print("------------") +print("") +print(result) + +print("------------") +print("MD:") +print("------------") +print("") print(result.document.export_to_markdown()) +print("") +print("============") print("done!") +print("============") # output: ## Docling Technical Report [...]"