WIP: first working code for SmolDocling inference and VLM pipeline assembly; an example script is included.

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-01-09 18:41:00 +01:00
parent 03c8d45790
commit 3c4c647615
4 changed files with 53 additions and 29 deletions

View File

@ -1,23 +1,9 @@
import argparse
import itertools
import logging
import os
import re
import time
from io import BytesIO
# import copy
# import random
# import time
from pathlib import Path
from typing import Iterable, List, Optional
import torch
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
# from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image, ImageDraw, ImageFont
from transformers import ( # type: ignore
AutoProcessor,
BitsAndBytesConfig,
@ -129,6 +115,8 @@ class SmolDoclingModel(BasePageModel):
)[0]
generated_texts = generated_texts.replace("Assistant: ", "")
page_tags = generated_texts
print("Page predictions:")
print(page_tags)
page.predictions.doctags = DocTagsPrediction(tag_string=page_tags)

View File

@ -56,6 +56,7 @@ class StandardPdfPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
print("------> Init Standard PDF Pipeline!")
self.pipeline_options: PdfPipelineOptions
artifacts_path: Optional[Path] = None

View File

@ -1,3 +1,4 @@
import itertools
import logging
import re
from io import BytesIO
@ -19,7 +20,7 @@ from docling_core.types.doc import (
TableData,
TableItem,
)
from PIL import Image, ImageDraw
from PIL.Image import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
@ -38,6 +39,7 @@ class VlmPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
print("------> Init VLM Pipeline!")
self.pipeline_options: PdfPipelineOptions
if pipeline_options.artifacts_path is None:
@ -98,6 +100,8 @@ class VlmPipeline(PaginatedPipeline):
if page.predictions.doctags is not None:
document_tags += page.predictions.doctags.tag_string
conv_res.document = self._turn_tags_into_doc(document_tags, None)
"""
image_bytes = BytesIO()
if page.image:
page.image.save(image_bytes, format="PNG")
@ -114,7 +118,7 @@ class VlmPipeline(PaginatedPipeline):
conv_res.document.pages[page_no].image = ImageRef.from_pil(
page.image, dpi=int(72 * self.pipeline_options.images_scale)
)
"""
# Generate images of the requested element types
if (
self.pipeline_options.generate_picture_images
@ -151,7 +155,7 @@ class VlmPipeline(PaginatedPipeline):
# def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list):
def _turn_tags_into_doc(
self, xml_content: str, image_bytes: bytes
self, xml_content: str, input_image: Optional[Image] = None
) -> DoclingDocument:
def extract_text(tag_content: str) -> str:
return re.sub(r"<.*?>", "", tag_content).strip()
@ -332,7 +336,7 @@ class VlmPipeline(PaginatedPipeline):
doc = DoclingDocument(name="Example Document")
current_group = None
lines = xml_content.split("\n")
pil_image = Image.open(BytesIO(image_bytes))
# pil_image = input_image #Image.open(BytesIO(image_bytes))
bounding_boxes = []
for line in lines:
@ -454,6 +458,7 @@ class VlmPipeline(PaginatedPipeline):
if bbox:
bounding_boxes.append((bbox, "yellow"))
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping
"""
width, height = pil_image.size
crop_box = (
int(bbox.l * width),
@ -461,13 +466,14 @@ class VlmPipeline(PaginatedPipeline):
int(bbox.r * width),
int(bbox.b * height),
)
cropped_image = pil_image.crop(crop_box)
doc.add_picture(
parent=current_group,
image=ImageRef.from_pil(image=cropped_image, dpi=300),
# prov=[ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1)],
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1),
)
"""
elif line.startswith("<list>"):
content = extract_text(line)
prov_item_inst = None

View File

@ -1,10 +1,12 @@
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
source = "tests/data/2305.03393v1-pg9-img.png"
# source = "tests/data/2305.03393v1-pg9-img.png"
source = "tests/data/2305.03393v1-pg9.pdf"
pipeline_options = PdfPipelineOptions()
pipeline_options.artifacts_path = "model_artifacts"
@ -12,13 +14,40 @@ pipeline_options.artifacts_path = "model_artifacts"
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend,
),
InputFormat.IMAGE: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend,
),
}
)
print("============")
print("starting...")
print("============")
print("")
result = converter.convert(source)
print("------------")
print("result:")
print("------------")
print("")
print(result)
print("------------")
print("MD:")
print("------------")
print("")
print(result.document.export_to_markdown())
print("")
print("============")
print("done!")
print("============")
# output: ## Docling Technical Report [...]"