mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
wip smolDocling inference and vlm pipeline
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
dc3a388aa2
commit
03c8d45790
@ -1,13 +1,28 @@
|
||||
import copy
|
||||
import argparse
|
||||
import itertools
|
||||
import logging
|
||||
import random
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
from io import BytesIO
|
||||
|
||||
from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||
# import copy
|
||||
# import random
|
||||
# import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import torch
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
||||
# from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from transformers import ( # type: ignore
|
||||
AutoProcessor,
|
||||
BitsAndBytesConfig,
|
||||
Idefics3ForConditionalGeneration,
|
||||
)
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
@ -31,14 +46,41 @@ _log = logging.getLogger(__name__)
|
||||
class SmolDoclingModel(BasePageModel):
|
||||
|
||||
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
    """Load the SmolDocling processor and vision-language model.

    Args:
        artifacts_path: Location handed to ``from_pretrained`` for both the
            processor and the model weights.
        accelerator_options: Accelerator settings; its ``device`` field is
            passed to ``decide_device`` and the result is stored on
            ``self.device``.
    """
    # Progress messages go through the module logger (not print) so that
    # library users control their visibility.
    _log.debug("SmolDocling, init...")
    device = decide_device(accelerator_options.device)
    self.device = device
    _log.info("Available device for SmolDocling: %s", device)

    # PARAMETERS:
    self.param_question = "Perform Layout Analysis."
    self.param_quantization_config = BitsAndBytesConfig(
        load_in_8bit=True, llm_int8_threshold=6.0
    )
    self.param_quantized = False

    self.processor = AutoProcessor.from_pretrained(artifacts_path)
    if self.param_quantized:
        # 8-bit path: dtype is left to the checkpoint ("auto") and the
        # quantization config above is applied while loading.
        self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained(
            artifacts_path,
            device_map=device,
            torch_dtype="auto",
            quantization_config=self.param_quantization_config,
        )
    else:
        self.vlm_model = Idefics3ForConditionalGeneration.from_pretrained(
            artifacts_path,
            device_map=device,
            torch_dtype=torch.bfloat16,
            # _attn_implementation="flash_attention_2",
        )
        self.vlm_model = self.vlm_model.to(device)
    _log.debug("SmolDocling, init... done!")
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
print("SmolDocling, processing...")
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
@ -48,11 +90,46 @@ class SmolDoclingModel(BasePageModel):
|
||||
assert page.size is not None
|
||||
|
||||
hi_res_image = page.get_image(scale=2.0) # 144dpi
|
||||
|
||||
# Call your self.your_vlm_predictor with the page image as input (hi_res_image)
|
||||
# populate page_tags
|
||||
# populate page_tags with predicted doc tags
|
||||
page_tags = ""
|
||||
|
||||
if hi_res_image:
|
||||
if hi_res_image.mode != "RGB":
|
||||
hi_res_image = hi_res_image.convert("RGB")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "This is a page from a document.",
|
||||
},
|
||||
{"type": "image"},
|
||||
{"type": "text", "text": self.param_question},
|
||||
],
|
||||
}
|
||||
]
|
||||
prompt = self.processor.apply_chat_template(
|
||||
messages, add_generation_prompt=False
|
||||
)
|
||||
inputs = self.processor(
|
||||
text=prompt, images=[hi_res_image], return_tensors="pt"
|
||||
)
|
||||
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
||||
prompt = prompt.replace("<end_of_utterance>", "")
|
||||
|
||||
# Call model to generate:
|
||||
generated_ids = self.vlm_model.generate(
|
||||
**inputs, max_new_tokens=4096
|
||||
)
|
||||
|
||||
generated_texts = self.processor.batch_decode(
|
||||
generated_ids, skip_special_tokens=True
|
||||
)[0]
|
||||
generated_texts = generated_texts.replace("Assistant: ", "")
|
||||
page_tags = generated_texts
|
||||
|
||||
page.predictions.doctags = DocTagsPrediction(tag_string=page_tags)
|
||||
|
||||
yield page
|
||||
|
@ -1,9 +1,25 @@
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from docling_core.types import DoclingDocument
|
||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
ImageRefMode,
|
||||
PictureItem,
|
||||
ProvenanceItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TableItem,
|
||||
)
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
@ -18,7 +34,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VlmPipeline(PaginatedPipeline):
|
||||
_smol_vlm_path = "model_artifacts/smol_vlm" # TODO or whatever is needed.
|
||||
_smol_vlm_path = "SmolDocling-0.0.2"
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
@ -73,6 +89,7 @@ class VlmPipeline(PaginatedPipeline):
|
||||
return page
|
||||
|
||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
print("VLM, assembling document...")
|
||||
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
# Read and concatenate the page doctags:
|
||||
@ -81,8 +98,13 @@ class VlmPipeline(PaginatedPipeline):
|
||||
if page.predictions.doctags is not None:
|
||||
document_tags += page.predictions.doctags.tag_string
|
||||
|
||||
# TODO implement this function
|
||||
conv_res.document = self._turn_tags_into_doc(document_tags)
|
||||
image_bytes = BytesIO()
|
||||
if page.image:
|
||||
page.image.save(image_bytes, format="PNG")
|
||||
# TODO implement this function
|
||||
conv_res.document = self._turn_tags_into_doc(
|
||||
document_tags, image_bytes.getvalue()
|
||||
)
|
||||
|
||||
# Generate page images in the output
|
||||
if self.pipeline_options.generate_page_images:
|
||||
@ -127,6 +149,390 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
return conv_res
|
||||
|
||||
# def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list):
|
||||
def _turn_tags_into_doc(
    self, xml_content: str, image_bytes: bytes
) -> DoclingDocument:
    """Build a ``DoclingDocument`` from newline-separated doctags.

    Each line of ``xml_content`` is dispatched on its leading tag:
    text-like tags become text items, ``<title>`` / ``<section-header>``
    additionally open a new section group that parents the following items,
    ``<otsl>`` lines become tables and ``<figure>`` lines become pictures
    cropped out of the page image. Lines with any other leading tag are
    ignored.

    Args:
        xml_content: Doctags text, one element per line.
        image_bytes: Encoded page image used to crop figures. May be empty,
            in which case pictures are added without image data.

    Returns:
        The assembled document. All provenance is recorded with
        ``page_no=1`` and ``charspan=(0, 0)``.
    """
    # Local import: itertools is not among the imports visible at the top of
    # this module, and this method is its only user here.
    import itertools

    row_break = "<nl>"
    # Tokens that open a real cell vs. tokens that continue a span.
    cell_tokens = ("<fcel>", "<ecel>", "<ched>", "<rhed>", "<srow>")
    span_tokens = ("<lcel>", "<ucel>", "<xcel>")
    structure_tokens = cell_tokens + span_tokens + (row_break,)

    # Leading tag -> label for plain text items.
    text_labels = {
        "<paragraph>": DocItemLabel.PARAGRAPH,
        "<footnote>": DocItemLabel.FOOTNOTE,
        "<page-header>": DocItemLabel.PAGE_HEADER,
        "<page-footer>": DocItemLabel.PAGE_FOOTER,
        "<list>": DocItemLabel.LIST_ITEM,
        "<caption>": DocItemLabel.PARAGRAPH,
        "<checkbox-unselected>": DocItemLabel.CHECKBOX_UNSELECTED,
        "<checkbox-selected>": DocItemLabel.CHECKBOX_SELECTED,
    }
    # Leading tag -> label for items that also start a new section group.
    heading_labels = {
        "<title>": DocItemLabel.TITLE,
        "<section-header>": DocItemLabel.SECTION_HEADER,
    }

    def extract_text(tag_content: str) -> str:
        """Return ``tag_content`` with all ``<...>`` tags removed, stripped."""
        return re.sub(r"<.*?>", "", tag_content).strip()

    def extract_bounding_box(tag_content: str) -> Optional[BoundingBox]:
        """Return the box given by exactly four ``<loc_N>`` tags, else None."""
        locs = re.findall(r"<loc_(\d+)>", tag_content)
        if len(locs) != 4:
            return None
        # Location values are divided by 500; presumably the model emits
        # coordinates on a 0-500 grid, giving 0-1 fractions - TODO confirm.
        left, top, right, bottom = (float(value) / 500.0 for value in locs)
        return BoundingBox(l=left, t=top, r=right, b=bottom)

    def make_prov(bbox: Optional[BoundingBox]) -> Optional[ProvenanceItem]:
        """Wrap ``bbox`` in a page-1 provenance item, or pass None through."""
        if bbox is None:
            return None
        return ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1)

    def count_right(tokens, c_idx, r_idx, which_tokens):
        """Count consecutive ``which_tokens`` in row ``r_idx`` from ``c_idx``."""
        if r_idx >= len(tokens):
            return 0
        span = 0
        for token in tokens[r_idx][c_idx:]:
            if token not in which_tokens:
                break
            span += 1
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        """Count consecutive ``which_tokens`` in column ``c_idx`` from ``r_idx``."""
        span = 0
        for row in tokens[r_idx:]:
            # A shorter (ragged) row ends the span instead of raising.
            if c_idx >= len(row) or row[c_idx] not in which_tokens:
                break
            span += 1
        return span

    def parse_texts(texts, tokens):
        """Turn the mixed tag/text stream of one table into table cells.

        Args:
            texts: Tags and cell texts in document order.
            tokens: Tags only, in document order.

        Returns:
            ``(table_cells, split_row_tokens)`` where ``split_row_tokens``
            is ``tokens`` split into rows at ``<nl>`` (empty rows dropped).
        """
        split_row_tokens = [
            list(group)
            for is_break, group in itertools.groupby(
                tokens, lambda token: token == row_break
            )
            if not is_break
        ]
        table_cells = []
        r_idx = 0
        c_idx = 0

        for i, text in enumerate(texts):
            if text in cell_tokens:
                cell_text = ""
                right_offset = 1
                # A cell only has text when the next entry exists and is not
                # itself a structure tag (whitespace-only text was filtered
                # out upstream, so an empty <fcel> is followed by a tag).
                if (
                    text != "<ecel>"
                    and i + 1 < len(texts)
                    and texts[i + 1] not in structure_tokens
                ):
                    cell_text = texts[i + 1]
                    right_offset = 2

                # Look-ahead neighbours; "" when the table ends here or the
                # next row is shorter than the current column.
                next_right_cell = ""
                if i + right_offset < len(texts):
                    next_right_cell = texts[i + right_offset]
                next_bottom_cell = ""
                if r_idx + 1 < len(split_row_tokens) and c_idx < len(
                    split_row_tokens[r_idx + 1]
                ):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

                row_span = 1
                col_span = 1
                if next_right_cell in ("<lcel>", "<xcel>"):
                    # Horizontal spanning cell or 2d spanning cell.
                    col_span += count_right(
                        split_row_tokens, c_idx + 1, r_idx, ("<lcel>", "<xcel>")
                    )
                if next_bottom_cell in ("<ucel>", "<xcel>"):
                    # Vertical spanning cell or 2d spanning cell.
                    row_span += count_down(
                        split_row_tokens, c_idx, r_idx + 1, ("<ucel>", "<xcel>")
                    )

                table_cells.append(
                    TableCell(
                        text=cell_text.strip(),
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=r_idx,
                        end_row_offset_idx=r_idx + row_span,
                        start_col_offset_idx=c_idx,
                        end_col_offset_idx=c_idx + col_span,
                    )
                )

            # Every cell or span tag occupies one grid column; <nl> starts
            # the next row.
            if text in cell_tokens or text in span_tokens:
                c_idx += 1
            elif text == row_break:
                r_idx += 1
                c_idx = 0
        return table_cells, split_row_tokens

    def extract_tokens_and_text(s: str):
        """Split ``s`` into its tags and its tag/text stream.

        ``<loc_*>`` tags and the ``<otsl>`` / ``</otsl>`` wrappers are dropped
        from both results; whitespace-only text parts are dropped too.

        Returns:
            ``(tokens, text_parts)``: tags only, and tags interleaved with
            the text between them.
        """
        # Anything enclosed by < > (including the angle brackets themselves).
        pattern = r"(<[^>]+>)"

        def is_ignored(part: str) -> bool:
            return part.startswith("<loc_") or part in ("<otsl>", "</otsl>")

        tokens = [token for token in re.findall(pattern, s) if not is_ignored(token)]
        text_parts = [
            part
            for part in re.split(pattern, s)
            if part.strip() and not is_ignored(part)
        ]
        return tokens, text_parts

    def parse_table_content(otsl_content: str) -> TableData:
        """Parse one ``<otsl>`` line into ``TableData``."""
        tokens, mixed_texts = extract_tokens_and_text(otsl_content)
        table_cells, split_row_tokens = parse_texts(mixed_texts, tokens)
        return TableData(
            num_rows=len(split_row_tokens),
            num_cols=max((len(row) for row in split_row_tokens), default=0),
            table_cells=table_cells,
        )

    doc = DoclingDocument(name="Example Document")
    current_group = None
    # The caller passes empty bytes when the page has no image; only decode
    # when there is something to decode.
    pil_image = Image.open(BytesIO(image_bytes)) if image_bytes else None

    for line in xml_content.split("\n"):
        line = line.strip().replace("<doc_tag>", "")
        leading = re.match(r"<[^>]+>", line)
        if leading is None:
            continue
        tag = leading.group(0)

        if tag in heading_labels:
            content = extract_text(line)
            # Headings open a new section; later items attach to it.
            current_group = doc.add_group(label=GroupLabel.SECTION, name=content)
            doc.add_text(
                label=heading_labels[tag],
                text=content,
                parent=current_group,
                prov=make_prov(extract_bounding_box(line)),
            )
        elif tag in text_labels:
            doc.add_text(
                label=text_labels[tag],
                text=extract_text(line),
                parent=current_group,
                prov=make_prov(extract_bounding_box(line)),
            )
        elif tag == "<otsl>":
            doc.add_table(
                data=parse_table_content(line),
                parent=current_group,
                prov=make_prov(extract_bounding_box(line)),
            )
        elif tag == "<figure>":
            bbox = extract_bounding_box(line)
            if bbox is None:
                # Without a location there is nothing to crop or anchor.
                continue
            image_ref = None
            if pil_image is not None:
                # Box coordinates are loc / 500, used here as fractions of
                # the image size to obtain pixel coordinates for cropping.
                width, height = pil_image.size
                crop_box = (
                    int(bbox.l * width),
                    int(bbox.t * height),
                    int(bbox.r * width),
                    int(bbox.b * height),
                )
                image_ref = ImageRef.from_pil(
                    image=pil_image.crop(crop_box), dpi=300
                )
            doc.add_picture(
                parent=current_group,
                image=image_ref,
                prov=make_prov(bbox),
            )

    return doc
|
||||
|
||||
@classmethod
def get_default_options(cls) -> PdfPipelineOptions:
    """Return a newly constructed ``PdfPipelineOptions`` built with no arguments."""
    default_options = PdfPipelineOptions()
    return default_options
|
||||
@ -135,5 +541,5 @@ class VlmPipeline(PaginatedPipeline):
|
||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||
return isinstance(backend, PdfDocumentBackend)
|
||||
|
||||
def _turn_tags_into_doc(self, document_tags):
|
||||
return DoclingDocument()
|
||||
# def _turn_tags_into_doc(self, document_tags):
|
||||
# return DoclingDocument()
|
||||
|
@ -1,13 +1,24 @@
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||
# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||
source = "tests/data/2305.03393v1-pg9-img.png"
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.artifacts_path = "model_artifacts"
|
||||
|
||||
converter = DocumentConverter(
|
||||
doc_converter=DocumentConverter(
|
||||
format_options={InputFormat.PDF: PdfFormatOption(pipeline_cls=VlmPipeline)}
|
||||
)
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
||||
)
|
||||
}
|
||||
)
|
||||
result = converter.convert(source)
|
||||
print(result.document.export_to_markdown())
|
||||
|
||||
print("done!")
|
||||
|
||||
# output: ## Docling Technical Report [...]"
|
||||
|
32
poetry.lock
generated
32
poetry.lock
generated
@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
@ -187,8 +187,8 @@ files = [
|
||||
lazy-object-proxy = ">=1.4.0"
|
||||
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
|
||||
wrapt = [
|
||||
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
||||
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2791,8 +2791,8 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
multiprocess = [
|
||||
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
|
||||
{version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
|
||||
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
|
||||
]
|
||||
pygments = ">=2.0"
|
||||
pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
|
||||
@ -3790,21 +3790,15 @@ description = "Wrapper package for OpenCV python bindings."
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4"},
|
||||
{file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a"},
|
||||
{file = "opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66"},
|
||||
{file = "opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202"},
|
||||
{file = "opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d"},
|
||||
{file = "opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b"},
|
||||
{file = "opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||
]
|
||||
@ -3816,21 +3810,15 @@ description = "Wrapper package for OpenCV python bindings."
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798"},
|
||||
{file = "opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca"},
|
||||
{file = "opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81"},
|
||||
{file = "opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb"},
|
||||
{file = "opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b"},
|
||||
{file = "opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b"},
|
||||
{file = "opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||
]
|
||||
@ -4016,9 +4004,9 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
@ -4766,8 +4754,8 @@ files = [
|
||||
astroid = ">=2.15.8,<=2.17.0-dev0"
|
||||
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
||||
dill = [
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||
]
|
||||
isort = ">=4.2.5,<6"
|
||||
mccabe = ">=0.6,<0.8"
|
||||
|
Loading…
Reference in New Issue
Block a user