mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Add CLI choices for VLM pipeline and model
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
0cd9b48372
commit
8e2b0b39c1
@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
|
|||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
|
PaginatedPipelineOptions,
|
||||||
PdfBackend,
|
PdfBackend,
|
||||||
|
PdfPipeline,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
|
VlmModelType,
|
||||||
|
VlmPipelineOptions,
|
||||||
|
granite_vision_vlm_conversion_options,
|
||||||
|
smoldocling_vlm_conversion_options,
|
||||||
|
smoldocling_vlm_mlx_conversion_options,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.models.factories import get_ocr_factory
|
from docling.models.factories import get_ocr_factory
|
||||||
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||||
@ -200,6 +208,14 @@ def convert(
|
|||||||
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
||||||
),
|
),
|
||||||
] = ImageRefMode.EMBEDDED,
|
] = ImageRefMode.EMBEDDED,
|
||||||
|
pipeline: Annotated[
|
||||||
|
PdfPipeline,
|
||||||
|
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
||||||
|
] = PdfPipeline.STANDARD,
|
||||||
|
vlm_model: Annotated[
|
||||||
|
VlmModelType,
|
||||||
|
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
||||||
|
] = VlmModelType.SMOLDOCLING,
|
||||||
ocr: Annotated[
|
ocr: Annotated[
|
||||||
bool,
|
bool,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -420,50 +436,77 @@ def convert(
|
|||||||
ocr_options.lang = ocr_lang_list
|
ocr_options.lang = ocr_lang_list
|
||||||
|
|
||||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options: PaginatedPipelineOptions
|
||||||
allow_external_plugins=allow_external_plugins,
|
|
||||||
enable_remote_services=enable_remote_services,
|
|
||||||
accelerator_options=accelerator_options,
|
|
||||||
do_ocr=ocr,
|
|
||||||
ocr_options=ocr_options,
|
|
||||||
do_table_structure=True,
|
|
||||||
do_code_enrichment=enrich_code,
|
|
||||||
do_formula_enrichment=enrich_formula,
|
|
||||||
do_picture_description=enrich_picture_description,
|
|
||||||
do_picture_classification=enrich_picture_classes,
|
|
||||||
document_timeout=document_timeout,
|
|
||||||
)
|
|
||||||
pipeline_options.table_structure_options.do_cell_matching = (
|
|
||||||
True # do_cell_matching
|
|
||||||
)
|
|
||||||
pipeline_options.table_structure_options.mode = table_mode
|
|
||||||
|
|
||||||
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
if pipeline == PdfPipeline.STANDARD:
|
||||||
pipeline_options.generate_page_images = True
|
pipeline_options = PdfPipelineOptions(
|
||||||
pipeline_options.generate_picture_images = (
|
allow_external_plugins=allow_external_plugins,
|
||||||
True # FIXME: to be deprecated in version 3
|
enable_remote_services=enable_remote_services,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
do_ocr=ocr,
|
||||||
|
ocr_options=ocr_options,
|
||||||
|
do_table_structure=True,
|
||||||
|
do_code_enrichment=enrich_code,
|
||||||
|
do_formula_enrichment=enrich_formula,
|
||||||
|
do_picture_description=enrich_picture_description,
|
||||||
|
do_picture_classification=enrich_picture_classes,
|
||||||
|
document_timeout=document_timeout,
|
||||||
|
)
|
||||||
|
pipeline_options.table_structure_options.do_cell_matching = (
|
||||||
|
True # do_cell_matching
|
||||||
|
)
|
||||||
|
pipeline_options.table_structure_options.mode = table_mode
|
||||||
|
|
||||||
|
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
||||||
|
pipeline_options.generate_page_images = True
|
||||||
|
pipeline_options.generate_picture_images = (
|
||||||
|
True # FIXME: to be deprecated in version 3
|
||||||
|
)
|
||||||
|
pipeline_options.images_scale = 2
|
||||||
|
|
||||||
|
backend: Type[PdfDocumentBackend]
|
||||||
|
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||||
|
backend = DoclingParseDocumentBackend
|
||||||
|
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||||
|
backend = DoclingParseV2DocumentBackend
|
||||||
|
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
||||||
|
backend = DoclingParseV4DocumentBackend # type: ignore
|
||||||
|
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||||
|
backend = PyPdfiumDocumentBackend # type: ignore
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||||
|
|
||||||
|
pdf_format_option = PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
backend=backend, # pdf_backend
|
||||||
|
)
|
||||||
|
elif pipeline == PdfPipeline.VLM:
|
||||||
|
pipeline_options = VlmPipelineOptions()
|
||||||
|
|
||||||
|
if vlm_model == VlmModelType.GRANITE_VISION:
|
||||||
|
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||||
|
elif vlm_model == VlmModelType.SMOLDOCLING:
|
||||||
|
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
||||||
|
if sys.platform == "darwin":
|
||||||
|
try:
|
||||||
|
import mlx_vlm
|
||||||
|
|
||||||
|
pipeline_options.vlm_options = (
|
||||||
|
smoldocling_vlm_mlx_conversion_options
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
_log.warning(
|
||||||
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
||||||
|
"pip install mlx-vlm"
|
||||||
|
)
|
||||||
|
|
||||||
|
pdf_format_option = PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
||||||
)
|
)
|
||||||
pipeline_options.images_scale = 2
|
|
||||||
|
|
||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
backend: Type[PdfDocumentBackend]
|
|
||||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
|
||||||
backend = DoclingParseDocumentBackend
|
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
|
||||||
backend = DoclingParseV2DocumentBackend
|
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
|
||||||
backend = DoclingParseV4DocumentBackend # type: ignore
|
|
||||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
|
||||||
backend = PyPdfiumDocumentBackend # type: ignore
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
|
||||||
|
|
||||||
pdf_format_option = PdfFormatOption(
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
backend=backend, # pdf_backend
|
|
||||||
)
|
|
||||||
format_options: Dict[InputFormat, FormatOption] = {
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
InputFormat.PDF: pdf_format_option,
|
InputFormat.PDF: pdf_format_option,
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
InputFormat.IMAGE: pdf_format_option,
|
||||||
|
@ -308,6 +308,11 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class VlmModelType(str, Enum):
|
||||||
|
SMOLDOCLING = "smoldocling"
|
||||||
|
GRANITE_VISION = "granite_vision"
|
||||||
|
|
||||||
|
|
||||||
# Define an enum for the backend options
|
# Define an enum for the backend options
|
||||||
class PdfBackend(str, Enum):
|
class PdfBackend(str, Enum):
|
||||||
"""Enum of valid PDF backends."""
|
"""Enum of valid PDF backends."""
|
||||||
@ -343,13 +348,14 @@ class PipelineOptions(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class PaginatedPipelineOptions(PipelineOptions):
|
class PaginatedPipelineOptions(PipelineOptions):
|
||||||
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
|
|
||||||
images_scale: float = 1.0
|
images_scale: float = 1.0
|
||||||
generate_page_images: bool = False
|
generate_page_images: bool = False
|
||||||
generate_picture_images: bool = False
|
generate_picture_images: bool = False
|
||||||
|
|
||||||
|
|
||||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
|
||||||
|
|
||||||
generate_page_images: bool = True
|
generate_page_images: bool = True
|
||||||
force_backend_text: bool = (
|
force_backend_text: bool = (
|
||||||
@ -362,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|||||||
class PdfPipelineOptions(PaginatedPipelineOptions):
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||||
"""Options for the PDF pipeline."""
|
"""Options for the PDF pipeline."""
|
||||||
|
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
|
||||||
do_table_structure: bool = True # True: perform table structure extraction
|
do_table_structure: bool = True # True: perform table structure extraction
|
||||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||||
do_code_enrichment: bool = False # True: perform code OCR
|
do_code_enrichment: bool = False # True: perform code OCR
|
||||||
@ -393,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
)
|
)
|
||||||
|
|
||||||
generate_parsed_pages: bool = False
|
generate_parsed_pages: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class PdfPipeline(str, Enum):
|
||||||
|
STANDARD = "standard"
|
||||||
|
VLM = "vlm"
|
||||||
|
@ -32,9 +32,15 @@ class HuggingFaceMlxModel(BasePageModel):
|
|||||||
self.vlm_options = vlm_options
|
self.vlm_options = vlm_options
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
from mlx_vlm import generate, load # type: ignore
|
|
||||||
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
try:
|
||||||
from mlx_vlm.utils import load_config, stream_generate # type: ignore
|
from mlx_vlm import generate, load # type: ignore
|
||||||
|
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
||||||
|
from mlx_vlm.utils import load_config, stream_generate # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
|
||||||
|
)
|
||||||
|
|
||||||
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
||||||
self.apply_chat_template = apply_chat_template
|
self.apply_chat_template = apply_chat_template
|
||||||
@ -113,7 +119,6 @@ class HuggingFaceMlxModel(BasePageModel):
|
|||||||
verbose=False,
|
verbose=False,
|
||||||
):
|
):
|
||||||
output += token.text
|
output += token.text
|
||||||
print(token.text, end="")
|
|
||||||
if "</doctag>" in token.text:
|
if "</doctag>" in token.text:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -34,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
self.keep_backend = True
|
self.keep_backend = True
|
||||||
|
|
||||||
warnings.warn(
|
|
||||||
"The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
|
|
||||||
category=UserWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.pipeline_options: VlmPipelineOptions
|
self.pipeline_options: VlmPipelineOptions
|
||||||
|
|
||||||
artifacts_path: Optional[Path] = None
|
artifacts_path: Optional[Path] = None
|
||||||
|
@ -192,6 +192,7 @@ module = [
|
|||||||
"docling_ibm_models.*",
|
"docling_ibm_models.*",
|
||||||
"easyocr.*",
|
"easyocr.*",
|
||||||
"ocrmac.*",
|
"ocrmac.*",
|
||||||
|
"mlx_vlm.*",
|
||||||
"lxml.*",
|
"lxml.*",
|
||||||
"huggingface_hub.*",
|
"huggingface_hub.*",
|
||||||
"transformers.*",
|
"transformers.*",
|
||||||
|
Loading…
Reference in New Issue
Block a user