mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
doing scaffolding for audio pipeline
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
5c606c2574
commit
1d4008ac7c
@ -542,7 +542,41 @@ def convert( # noqa: C901
|
|||||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||||
pipeline_options: PaginatedPipelineOptions
|
pipeline_options: PaginatedPipelineOptions
|
||||||
|
|
||||||
if pipeline == ProcessingPipeline.STANDARD:
|
format_options: Dict[InputFormat, FormatOption] = {}
|
||||||
|
|
||||||
|
if pipeline == ProcessingPipeline.VLM:
|
||||||
|
pipeline_options = VlmPipelineOptions(
|
||||||
|
enable_remote_services=enable_remote_services,
|
||||||
|
)
|
||||||
|
|
||||||
|
if vlm_model == VlmModelType.GRANITE_VISION:
|
||||||
|
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
||||||
|
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
||||||
|
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
||||||
|
elif vlm_model == VlmModelType.SMOLDOCLING:
|
||||||
|
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
||||||
|
if sys.platform == "darwin":
|
||||||
|
try:
|
||||||
|
import mlx_vlm
|
||||||
|
|
||||||
|
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
||||||
|
except ImportError:
|
||||||
|
_log.warning(
|
||||||
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
||||||
|
"pip install mlx-vlm"
|
||||||
|
)
|
||||||
|
|
||||||
|
pdf_format_option = PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
||||||
|
)
|
||||||
|
|
||||||
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
|
InputFormat.PDF: pdf_format_option,
|
||||||
|
InputFormat.IMAGE: pdf_format_option,
|
||||||
|
}
|
||||||
|
|
||||||
|
elif pipeline == ProcessingPipeline.STANDARD:
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options = PdfPipelineOptions(
|
||||||
allow_external_plugins=allow_external_plugins,
|
allow_external_plugins=allow_external_plugins,
|
||||||
enable_remote_services=enable_remote_services,
|
enable_remote_services=enable_remote_services,
|
||||||
@ -584,40 +618,28 @@ def convert( # noqa: C901
|
|||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=backend, # pdf_backend
|
backend=backend, # pdf_backend
|
||||||
)
|
)
|
||||||
elif pipeline == ProcessingPipeline.VLM:
|
|
||||||
pipeline_options = VlmPipelineOptions(
|
|
||||||
enable_remote_services=enable_remote_services,
|
|
||||||
)
|
|
||||||
|
|
||||||
if vlm_model == VlmModelType.GRANITE_VISION:
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
InputFormat.PDF: pdf_format_option,
|
||||||
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
InputFormat.IMAGE: pdf_format_option,
|
||||||
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
}
|
||||||
elif vlm_model == VlmModelType.SMOLDOCLING:
|
|
||||||
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
|
||||||
if sys.platform == "darwin":
|
|
||||||
try:
|
|
||||||
import mlx_vlm
|
|
||||||
|
|
||||||
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
|
||||||
except ImportError:
|
|
||||||
_log.warning(
|
|
||||||
"To run SmolDocling faster, please install mlx-vlm:\n"
|
|
||||||
"pip install mlx-vlm"
|
|
||||||
)
|
|
||||||
|
|
||||||
pdf_format_option = PdfFormatOption(
|
|
||||||
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
|
||||||
)
|
|
||||||
elif pipeline == ProcessingPipeline.ASR:
|
elif pipeline == ProcessingPipeline.ASR:
|
||||||
audio_pipeline_options = AsrPipelineOptions(
|
audio_pipeline_options = AsrPipelineOptions(
|
||||||
# enable_remote_services=enable_remote_services,
|
# enable_remote_services=enable_remote_services,
|
||||||
|
artifacts_path = artifacts_path
|
||||||
)
|
)
|
||||||
|
|
||||||
audio_format_option = PdfFormatOption(
|
audio_format_option = PdfFormatOption(
|
||||||
pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
|
pipeline_cls=AsrPipeline,
|
||||||
|
pipeline_options=audio_pipeline_options,
|
||||||
|
# backend = FIXME
|
||||||
)
|
)
|
||||||
|
|
||||||
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
|
InputFormat.AUDIO_WAV: audio_format_option,
|
||||||
|
}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if asr_model == AsrModelType.WHISPER_TINY:
|
if asr_model == AsrModelType.WHISPER_TINY:
|
||||||
pipeline_options.asr_options = WHISPER_TINY:
|
pipeline_options.asr_options = WHISPER_TINY:
|
||||||
@ -627,11 +649,6 @@ def convert( # noqa: C901
|
|||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
# audio_pipeline_options.artifacts_path = artifacts_path
|
# audio_pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
format_options: Dict[InputFormat, FormatOption] = {
|
|
||||||
InputFormat.PDF: pdf_format_option,
|
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
|
||||||
InputFormat.AUDIO: audio_format_option,
|
|
||||||
}
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
allowed_formats=from_formats,
|
allowed_formats=from_formats,
|
||||||
format_options=format_options,
|
format_options=format_options,
|
||||||
|
@ -49,7 +49,7 @@ class InputFormat(str, Enum):
|
|||||||
XML_USPTO = "xml_uspto"
|
XML_USPTO = "xml_uspto"
|
||||||
XML_JATS = "xml_jats"
|
XML_JATS = "xml_jats"
|
||||||
JSON_DOCLING = "json_docling"
|
JSON_DOCLING = "json_docling"
|
||||||
AUDIO = "audio"
|
AUDIO_WAV = "wav"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -74,7 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.XLSX: ["xlsx", "xlsm"],
|
InputFormat.XLSX: ["xlsx", "xlsm"],
|
||||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||||
InputFormat.JSON_DOCLING: ["json"],
|
InputFormat.JSON_DOCLING: ["json"],
|
||||||
InputFormat.AUDIO: ["wav", "mp3"],
|
InputFormat.AUDIO_WAV: ["wav"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||||
@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
],
|
],
|
||||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||||
InputFormat.JSON_DOCLING: ["application/json"],
|
InputFormat.JSON_DOCLING: ["application/json"],
|
||||||
InputFormat.AUDIO: ["audio/wav"],
|
InputFormat.AUDIO_WAV: ["audio/wav", "audio/x-wav"],
|
||||||
}
|
}
|
||||||
|
|
||||||
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
||||||
|
@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
backend: Type[AbstractDocumentBackend]
|
backend: Type[AbstractDocumentBackend]
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.error(
|
_log.error(
|
||||||
f"Input document {obj.name} does not match any allowed format."
|
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
|
||||||
)
|
)
|
||||||
backend = _DummyBackend
|
backend = _DummyBackend
|
||||||
else:
|
else:
|
||||||
@ -280,9 +280,12 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
|
|
||||||
if isinstance(obj, Path):
|
if isinstance(obj, Path):
|
||||||
mime = filetype.guess_mime(str(obj))
|
mime = filetype.guess_mime(str(obj))
|
||||||
|
print(f"mime: {mime}")
|
||||||
if mime is None:
|
if mime is None:
|
||||||
ext = obj.suffix[1:]
|
ext = obj.suffix[1:]
|
||||||
|
print(f"ext: {ext}")
|
||||||
mime = _DocumentConversionInput._mime_from_extension(ext)
|
mime = _DocumentConversionInput._mime_from_extension(ext)
|
||||||
|
print(f"mime: {mime}")
|
||||||
if mime is None: # must guess from
|
if mime is None: # must guess from
|
||||||
with obj.open("rb") as f:
|
with obj.open("rb") as f:
|
||||||
content = f.read(1024) # Read first 1KB
|
content = f.read(1024) # Read first 1KB
|
||||||
@ -318,6 +321,8 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
mime = mime or _DocumentConversionInput._detect_csv(content)
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
||||||
mime = mime or "text/plain"
|
mime = mime or "text/plain"
|
||||||
formats = MimeTypeToFormat.get(mime, [])
|
formats = MimeTypeToFormat.get(mime, [])
|
||||||
|
print(formats)
|
||||||
|
|
||||||
if formats:
|
if formats:
|
||||||
if len(formats) == 1 and mime not in ("text/plain"):
|
if len(formats) == 1 and mime not in ("text/plain"):
|
||||||
return formats[0]
|
return formats[0]
|
||||||
|
Loading…
Reference in New Issue
Block a user