# Mirror of https://github.com/DS4SD/docling.git
# Synced 2025-07-27 12:34:22 +00:00
# 46 lines, 1.1 KiB, Python
"""Minimal example: convert a PDF to Markdown with Docling's VLM pipeline.

The script converts the same document twice and prints the Markdown export
after each run:

1. with ``VlmPipeline`` and no explicit pipeline options;
2. with ``VlmPipeline`` plus ``VlmPipelineOptions`` that select the
   ``SMOLDOCLING_MLX`` model spec.

Everything runs at import time; there are no functions to call.
"""

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
)

# NOTE(review): confirm this module path against the installed docling
# version -- released versions may name the module `vlm_model_specs`.
from docling.datamodel.vlm_model_spec import SMOLDOCLING_MLX
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Document to convert; handed to `DocumentConverter.convert` as a URL string.
source = "https://arxiv.org/pdf/2501.17887"

###### USING SIMPLE DEFAULT VALUES
# - SmolDocling model
# - Using the transformers framework

# Only the pipeline class is overridden for PDF input; no `pipeline_options`
# are passed, so whatever defaults `VlmPipeline` ships with are used.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
        ),
    }
)

# `convert` returns a result object; the parsed document is its `.document`.
doc = converter.convert(source=source).document

print(doc.export_to_markdown())


###### USING MACOS MPS ACCELERATOR

# NOTE(review): the spec name suggests the MLX backend; confirm the platform
# and extra-package requirements in the docling docs before running this part.
pipeline_options = VlmPipelineOptions(
    vlm_options=SMOLDOCLING_MLX,
)

# Same converter setup as above, now with the explicit pipeline options.
# `converter` and `doc` are deliberately rebound for the second run.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

doc = converter.convert(source=source).document

print(doc.export_to_markdown())