mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
WIP, first working code for inference of SmolDocling, and vlm pipeline assembly code, example included.
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
03c8d45790
commit
3c4c647615
@ -1,23 +1,9 @@
|
|||||||
import argparse
|
|
||||||
import itertools
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
# import copy
|
|
||||||
# import random
|
|
||||||
# import time
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||||
|
|
||||||
# from docling_core.types.doc import CoordOrigin, DocItemLabel
|
|
||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
|
||||||
from transformers import ( # type: ignore
|
from transformers import ( # type: ignore
|
||||||
AutoProcessor,
|
AutoProcessor,
|
||||||
BitsAndBytesConfig,
|
BitsAndBytesConfig,
|
||||||
@ -129,6 +115,8 @@ class SmolDoclingModel(BasePageModel):
|
|||||||
)[0]
|
)[0]
|
||||||
generated_texts = generated_texts.replace("Assistant: ", "")
|
generated_texts = generated_texts.replace("Assistant: ", "")
|
||||||
page_tags = generated_texts
|
page_tags = generated_texts
|
||||||
|
print("Page predictions:")
|
||||||
|
print(page_tags)
|
||||||
|
|
||||||
page.predictions.doctags = DocTagsPrediction(tag_string=page_tags)
|
page.predictions.doctags = DocTagsPrediction(tag_string=page_tags)
|
||||||
|
|
||||||
|
@ -56,6 +56,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
|
print("------> Init Standard PDF Pipeline!")
|
||||||
self.pipeline_options: PdfPipelineOptions
|
self.pipeline_options: PdfPipelineOptions
|
||||||
|
|
||||||
artifacts_path: Optional[Path] = None
|
artifacts_path: Optional[Path] = None
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -19,7 +20,7 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
TableItem,
|
TableItem,
|
||||||
)
|
)
|
||||||
from PIL import Image, ImageDraw
|
from PIL.Image import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
@ -38,6 +39,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
|
print("------> Init VLM Pipeline!")
|
||||||
self.pipeline_options: PdfPipelineOptions
|
self.pipeline_options: PdfPipelineOptions
|
||||||
|
|
||||||
if pipeline_options.artifacts_path is None:
|
if pipeline_options.artifacts_path is None:
|
||||||
@ -98,13 +100,15 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
if page.predictions.doctags is not None:
|
if page.predictions.doctags is not None:
|
||||||
document_tags += page.predictions.doctags.tag_string
|
document_tags += page.predictions.doctags.tag_string
|
||||||
|
|
||||||
image_bytes = BytesIO()
|
conv_res.document = self._turn_tags_into_doc(document_tags, None)
|
||||||
if page.image:
|
"""
|
||||||
page.image.save(image_bytes, format="PNG")
|
image_bytes = BytesIO()
|
||||||
# TODO implement this function
|
if page.image:
|
||||||
conv_res.document = self._turn_tags_into_doc(
|
page.image.save(image_bytes, format="PNG")
|
||||||
document_tags, image_bytes.getvalue()
|
# TODO implement this function
|
||||||
)
|
conv_res.document = self._turn_tags_into_doc(
|
||||||
|
document_tags, image_bytes.getvalue()
|
||||||
|
)
|
||||||
|
|
||||||
# Generate page images in the output
|
# Generate page images in the output
|
||||||
if self.pipeline_options.generate_page_images:
|
if self.pipeline_options.generate_page_images:
|
||||||
@ -114,7 +118,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
||||||
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
||||||
)
|
)
|
||||||
|
"""
|
||||||
# Generate images of the requested element types
|
# Generate images of the requested element types
|
||||||
if (
|
if (
|
||||||
self.pipeline_options.generate_picture_images
|
self.pipeline_options.generate_picture_images
|
||||||
@ -151,7 +155,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
# def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list):
|
# def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list):
|
||||||
def _turn_tags_into_doc(
|
def _turn_tags_into_doc(
|
||||||
self, xml_content: str, image_bytes: bytes
|
self, xml_content: str, input_image: Optional[Image] = None
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
def extract_text(tag_content: str) -> str:
|
def extract_text(tag_content: str) -> str:
|
||||||
return re.sub(r"<.*?>", "", tag_content).strip()
|
return re.sub(r"<.*?>", "", tag_content).strip()
|
||||||
@ -332,7 +336,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
doc = DoclingDocument(name="Example Document")
|
doc = DoclingDocument(name="Example Document")
|
||||||
current_group = None
|
current_group = None
|
||||||
lines = xml_content.split("\n")
|
lines = xml_content.split("\n")
|
||||||
pil_image = Image.open(BytesIO(image_bytes))
|
# pil_image = input_image #Image.open(BytesIO(image_bytes))
|
||||||
bounding_boxes = []
|
bounding_boxes = []
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
@ -454,6 +458,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
if bbox:
|
if bbox:
|
||||||
bounding_boxes.append((bbox, "yellow"))
|
bounding_boxes.append((bbox, "yellow"))
|
||||||
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping
|
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping
|
||||||
|
"""
|
||||||
width, height = pil_image.size
|
width, height = pil_image.size
|
||||||
crop_box = (
|
crop_box = (
|
||||||
int(bbox.l * width),
|
int(bbox.l * width),
|
||||||
@ -461,13 +466,14 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
int(bbox.r * width),
|
int(bbox.r * width),
|
||||||
int(bbox.b * height),
|
int(bbox.b * height),
|
||||||
)
|
)
|
||||||
|
|
||||||
cropped_image = pil_image.crop(crop_box)
|
cropped_image = pil_image.crop(crop_box)
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=current_group,
|
parent=current_group,
|
||||||
image=ImageRef.from_pil(image=cropped_image, dpi=300),
|
image=ImageRef.from_pil(image=cropped_image, dpi=300),
|
||||||
# prov=[ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1)],
|
|
||||||
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1),
|
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1),
|
||||||
)
|
)
|
||||||
|
"""
|
||||||
elif line.startswith("<list>"):
|
elif line.startswith("<list>"):
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
prov_item_inst = None
|
prov_item_inst = None
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||||
source = "tests/data/2305.03393v1-pg9-img.png"
|
# source = "tests/data/2305.03393v1-pg9-img.png"
|
||||||
|
source = "tests/data/2305.03393v1-pg9.pdf"
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.artifacts_path = "model_artifacts"
|
pipeline_options.artifacts_path = "model_artifacts"
|
||||||
@ -12,13 +14,40 @@ pipeline_options.artifacts_path = "model_artifacts"
|
|||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
pipeline_cls=VlmPipeline,
|
||||||
)
|
pipeline_options=pipeline_options,
|
||||||
|
backend=DoclingParseDocumentBackend,
|
||||||
|
),
|
||||||
|
InputFormat.IMAGE: PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline,
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
backend=DoclingParseDocumentBackend,
|
||||||
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print("============")
|
||||||
|
print("starting...")
|
||||||
|
print("============")
|
||||||
|
print("")
|
||||||
|
|
||||||
result = converter.convert(source)
|
result = converter.convert(source)
|
||||||
|
|
||||||
|
print("------------")
|
||||||
|
print("result:")
|
||||||
|
print("------------")
|
||||||
|
print("")
|
||||||
|
print(result)
|
||||||
|
|
||||||
|
print("------------")
|
||||||
|
print("MD:")
|
||||||
|
print("------------")
|
||||||
|
print("")
|
||||||
print(result.document.export_to_markdown())
|
print(result.document.export_to_markdown())
|
||||||
|
|
||||||
|
print("")
|
||||||
|
print("============")
|
||||||
print("done!")
|
print("done!")
|
||||||
|
print("============")
|
||||||
|
|
||||||
# output: ## Docling Technical Report [...]"
|
# output: ## Docling Technical Report [...]"
|
||||||
|
Loading…
Reference in New Issue
Block a user