Mirror of https://github.com/DS4SD/docling.git, synced 2025-08-01 15:02:21 +00:00
chore: clean up code and comments
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in: commit c5873f2496 (parent f994654918)
@@ -333,6 +333,8 @@ class PaginatedPipelineOptions(PipelineOptions):
 class VlmPipelineOptions(PaginatedPipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
 
+    generate_page_images: bool = True
     force_backend_text: bool = (
         False # (To be used with vlms, or other generative models)
     )
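For orientation, the new `generate_page_images: bool = True` default means the VLM pipeline keeps rendered page images without the caller opting in (the example script further below drops its explicit `pipeline_options.generate_page_images = True` line for exactly this reason). A minimal sketch of configuring these options after the change; the `artifacts_path` value is only illustrative, taken from the old example comment:

from docling.datamodel.pipeline_options import VlmPipelineOptions

pipeline_options = VlmPipelineOptions()
# Defaults after this change: page images are generated, backend text is off.
assert pipeline_options.generate_page_images is True
assert pipeline_options.force_backend_text is False

# Optional: point to a folder with pre-downloaded model artifacts (path is illustrative).
pipeline_options.artifacts_path = "~/local_model_artifacts/"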
@@ -116,7 +116,7 @@ class HuggingFaceVlmModel(BasePageModel):
         if not page._backend.is_valid():
             yield page
         else:
-            with TimeRecorder(conv_res, "smolvlm"):
+            with TimeRecorder(conv_res, "vlm"):
                 assert page.size is not None
 
                 hi_res_image = page.get_image(scale=2.0)  # 144dpi
@@ -51,7 +51,7 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_backend = True
 
         warnings.warn(
-            "This API is currently experimental and may change in upcoming versions without notice.",
+            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
            category=UserWarning,
            stacklevel=2,
        )
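Side note: with the warning text now naming the pipeline, users who knowingly opt into the experimental API can silence exactly this notice. A small standard-library sketch; the message pattern is copied from the new warning above:

import warnings

# Ignore only the VlmPipeline experimental notice; other UserWarnings stay visible.
warnings.filterwarnings(
    "ignore",
    message="The VlmPipeline is currently experimental",
    category=UserWarning,
)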
@@ -70,18 +70,18 @@ class VlmPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
 
-        # force_backend_text = False - use text that is coming from SmolDocling
-        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss
-        self.force_backend_text = pipeline_options.force_backend_text
-
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
+        # force_backend_text = False - use text that is coming from VLM response
+        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
+        self.force_backend_text = (
+            pipeline_options.force_backend_text
+            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
         )
 
+        self.keep_images = self.pipeline_options.generate_page_images
+
         self.build_pipe = [
             HuggingFaceVlmModel(
-                enabled=True,
+                enabled=True, # must be always enabled for this pipeline to make sense.
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
                 vlm_options=self.pipeline_options.vlm_options,
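The practical consequence of the constructor change above: `force_backend_text` now only takes effect when the selected VLM actually returns DOCTAGS, since only that response format carries the predicted bounding boxes needed to look text up in the backend. A minimal sketch of the gating, assuming `ResponseFormat` and the SmolDocling preset are importable from `docling.datamodel.pipeline_options` as in the example script below:

from docling.datamodel.pipeline_options import (
    ResponseFormat,
    VlmPipelineOptions,
    smoldocling_vlm_conversion_options,
)

opts = VlmPipelineOptions()
opts.force_backend_text = True
opts.vlm_options = smoldocling_vlm_conversion_options  # emits DOCTAGS

# Mirrors the expression used in VlmPipeline.__init__ above.
effective_force_backend_text = (
    opts.force_backend_text
    and opts.vlm_options.response_format == ResponseFormat.DOCTAGS
)
print(effective_force_backend_text)  # True here; False for a markdown-only model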
@@ -397,6 +397,7 @@ class VlmPipeline(PaginatedPipeline):
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text
                 image = page.image
+
                 page_no = pg_idx + 1
                 bounding_boxes = []
 
@@ -448,12 +449,13 @@ class VlmPipeline(PaginatedPipeline):
                 text_caption_content = extract_inner_text(full_chunk)
                 if image:
                     if bbox:
-                        width, height = image.size
+                        im_width, im_height = image.size
+
                         crop_box = (
-                            int(bbox.l * width),
-                            int(bbox.t * height),
-                            int(bbox.r * width),
-                            int(bbox.b * height),
+                            int(bbox.l * im_width),
+                            int(bbox.t * im_height),
+                            int(bbox.r * im_width),
+                            int(bbox.b * im_height),
                         )
                         cropped_image = image.crop(crop_box)
                         pic = doc.add_picture(
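The rename to `im_width`/`im_height` disambiguates image pixels from the page dimensions used for provenance a few lines further down; the bbox values appear to be normalized to the 0-1 range, which is why they are multiplied by the image size to obtain a pixel crop window. A dependency-light sketch of the same arithmetic, using plain floats instead of docling's bounding-box type:

from PIL import Image

def crop_normalized(image: Image.Image, l: float, t: float, r: float, b: float) -> Image.Image:
    """Crop a PIL image given a bounding box in normalized (0-1) coordinates."""
    im_width, im_height = image.size
    crop_box = (
        int(l * im_width),
        int(t * im_height),
        int(r * im_width),
        int(b * im_height),
    )
    return image.crop(crop_box)

page_image = Image.new("RGB", (1224, 1584), "white")  # e.g. a US-Letter page rendered at 144dpi
top_left_quadrant = crop_normalized(page_image, 0.0, 0.0, 0.5, 0.5)
print(top_left_quadrant.size)  # (612, 792)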
@@ -461,7 +463,9 @@ class VlmPipeline(PaginatedPipeline):
                             image=ImageRef.from_pil(image=cropped_image, dpi=72),
                             prov=(
                                 ProvenanceItem(
-                                    bbox=bbox, charspan=(0, 0), page_no=page_no
+                                    bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                    charspan=(0, 0),
+                                    page_no=page_no,
                                 )
                             ),
                         )
@@ -501,7 +505,7 @@ class VlmPipeline(PaginatedPipeline):
                             text=text_content,
                             prov=(
                                 ProvenanceItem(
-                                    bbox=bbox,
+                                    bbox=bbox.resize_by_scale(pg_width, pg_height),
                                     charspan=(0, len(text_content)),
                                     page_no=page_no,
                                 )
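Both ProvenanceItem fixes follow the same pattern: the normalized bbox is scaled up to page coordinates before being stored as provenance. A rough sketch of what that call amounts to, assuming docling_core's `BoundingBox` model with fields `l`, `t`, `r`, `b`; `resize_by_scale` is the method invoked in the diff and its exact semantics are taken on trust here:

from docling_core.types.doc import BoundingBox

pg_width, pg_height = 612.0, 792.0  # illustrative page size in points

bbox = BoundingBox(l=0.1, t=0.2, r=0.4, b=0.5)  # normalized 0-1 coordinates
page_bbox = bbox.resize_by_scale(pg_width, pg_height)
print(page_bbox)  # expected: l/r scaled by pg_width, t/b scaled by pg_height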
@@ -11,32 +11,34 @@ from docling.datamodel.pipeline_options import (
     granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
 )
+from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 sources = [
-    # "https://arxiv.org/pdf/2408.09869",
     "tests/data/2305.03393v1-pg9-img.png",
-    # "tests/data/2305.03393v1-pg9.pdf",
 ]
 
-pipeline_options = VlmPipelineOptions() # artifacts_path="~/local_model_artifacts/"
-pipeline_options.generate_page_images = True
+settings.debug.profile_pipeline_timings = True
+
+## Use experimental VlmPipeline
+pipeline_options = VlmPipelineOptions()
 # If force_backend_text = True, text from backend will be used instead of generated text
 pipeline_options.force_backend_text = False
 
-## Enable flash_attention_2 with CUDA:
+## On GPU systems, enable flash_attention_2 with CUDA:
 # pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
 # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
 
+## Pick a VLM model. We choose SmolDocling-256M by default
 pipeline_options.vlm_options = smoldocling_vlm_conversion_options
 
-## Choose alternative VLM models:
+## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
 
 from docling_core.types.doc import DocItemLabel, ImageRefMode
 from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 
+## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={
         InputFormat.PDF: PdfFormatOption(
@@ -68,6 +70,12 @@ for source in sources:
     print("")
     print(res.document.export_to_markdown())
 
+    print("------------------------------------------------")
+    print("Timings:")
+    print("------------------------------------------------")
+    print("")
+    print(res.timings)
+
     for page in res.pages:
         print("")
         print("Predicted page in DOCTAGS:")
@@ -82,9 +90,6 @@ for source in sources:
     with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
         fp.write(json.dumps(res.document.export_to_dict()))
 
-    with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
-        fp.write(yaml.safe_dump(res.document.export_to_dict()))
-
     pg_num = res.document.num_pages()
 
     print("")
@@ -69,9 +69,6 @@ accelerate = [
 pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"
 
-# transformers = "^4.47.1"
-# accelerate = "^1.2.1"
-
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
 pytest = "^7.2.2"