scaffolding in place

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-06-12 17:57:29 +02:00
parent 0432a31b2f
commit 5c606c2574
6 changed files with 186 additions and 6 deletions

View File

@ -29,6 +29,10 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_TINY,
AsrModelType,
)
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@ -37,12 +41,13 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
EasyOcrOptions,
OcrOptions,
PaginatedPipelineOptions,
PdfBackend,
PdfPipeline,
PdfPipelineOptions,
ProcessingPipeline,
TableFormerMode,
VlmPipelineOptions,
)
@ -56,6 +61,7 @@ from docling.datamodel.vlm_model_specs import (
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -296,13 +302,17 @@ def convert( # noqa: C901
),
] = ImageRefMode.EMBEDDED,
pipeline: Annotated[
PdfPipeline,
ProcessingPipeline,
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
] = PdfPipeline.STANDARD,
] = ProcessingPipeline.STANDARD,
vlm_model: Annotated[
VlmModelType,
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
] = VlmModelType.SMOLDOCLING,
asr_model: Annotated[
AsrModelType,
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
] = AsrModelType.WHISPER_TINY,
ocr: Annotated[
bool,
typer.Option(
@ -532,7 +542,7 @@ def convert( # noqa: C901
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options: PaginatedPipelineOptions
if pipeline == PdfPipeline.STANDARD:
if pipeline == ProcessingPipeline.STANDARD:
pipeline_options = PdfPipelineOptions(
allow_external_plugins=allow_external_plugins,
enable_remote_services=enable_remote_services,
@ -574,7 +584,7 @@ def convert( # noqa: C901
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
)
elif pipeline == PdfPipeline.VLM:
elif pipeline == ProcessingPipeline.VLM:
pipeline_options = VlmPipelineOptions(
enable_remote_services=enable_remote_services,
)
@ -599,13 +609,28 @@ def convert( # noqa: C901
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
elif pipeline == ProcessingPipeline.ASR:
audio_pipeline_options = AsrPipelineOptions(
# enable_remote_services=enable_remote_services,
)
audio_format_option = PdfFormatOption(
pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
)
"""
if asr_model == AsrModelType.WHISPER_TINY:
pipeline_options.asr_options = WHISPER_TINY:
"""
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
# audio_pipeline_options.artifacts_path = artifacts_path
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
InputFormat.AUDIO: audio_format_option,
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,

View File

@ -0,0 +1,27 @@
import logging
from enum import Enum
from pydantic import (
AnyUrl,
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import (
# ApiAsrOptions,
InferenceFramework,
InlineAsrOptions,
AsrResponseFormat,
TransformersModelType,
)
_log = logging.getLogger(__name__)
# SmolDocling
WHISPER_TINY = InlineAsrOptions(
repo_id="openai/whisper-tiny",
inference_framework=InferenceFramework.TRANSFORMERS,
response_format = AsrResponseFormat.WHISPER,
)
class AsrModelType(str, Enum):
WHISPER_TINY = "whisper_tiny"

View File

@ -49,6 +49,7 @@ class InputFormat(str, Enum):
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
class OutputFormat(str, Enum):
@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/wav"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {

View File

@ -13,6 +13,13 @@ from typing_extensions import deprecated
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_TINY as whisper_tiny,
AsrModelType,
)
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
@ -260,6 +267,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
)
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = whisper_tiny
artifacts_path: Optional[Union[Path, str]] = None
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""
@ -295,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
generate_parsed_pages: bool = False
class PdfPipeline(str, Enum):
class ProcessingPipeline(str, Enum):
STANDARD = "standard"
VLM = "vlm"
ASR = "asr"

View File

@ -0,0 +1,50 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import InferenceFramework, TransformersModelType
class BaseAsrOptions(BaseModel):
kind: str
# prompt: str
class AsrResponseFormat(str, Enum):
WHISPER = "whisper"
class InlineAsrOptions(BaseAsrOptions):
kind: Literal["inline_model_options"] = "inline_model_options"
repo_id: str
trust_remote_code: bool = False
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
inference_framework: InferenceFramework
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
response_format: AsrResponseFormat
torch_dtype: Optional[str] = None
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
]
temperature: float = 0.0
stop_strings: List[str] = []
extra_generation_config: Dict[str, Any] = {}
use_kv_cache: bool = True
max_new_tokens: int = 4096
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")

View File

@ -0,0 +1,62 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
InferenceFramework,
)
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrOptions,
AsrResponseFormat,
)
from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.datamodel.document import ConversionResult, InputDocument
_log = logging.getLogger(__name__)
class AsrPipeline(BasePipeline):
def __init__(self, pipeline_options: AsrPipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = True
self.pipeline_options: AsrPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
print("do something")
return conv_res
"""
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
status = ConversionStatus()
return status
"""
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return True