mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
scaffolding in place
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
0432a31b2f
commit
5c606c2574
@ -29,6 +29,10 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
|
from docling.datamodel.asr_model_specs import (
|
||||||
|
WHISPER_TINY,
|
||||||
|
AsrModelType,
|
||||||
|
)
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FormatToExtensions,
|
FormatToExtensions,
|
||||||
@ -37,12 +41,13 @@ from docling.datamodel.base_models import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AsrPipelineOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
PaginatedPipelineOptions,
|
PaginatedPipelineOptions,
|
||||||
PdfBackend,
|
PdfBackend,
|
||||||
PdfPipeline,
|
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
|
ProcessingPipeline,
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
@ -56,6 +61,7 @@ from docling.datamodel.vlm_model_specs import (
|
|||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.models.factories import get_ocr_factory
|
from docling.models.factories import get_ocr_factory
|
||||||
|
from docling.pipeline.asr_pipeline import AsrPipeline
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
@ -296,13 +302,17 @@ def convert( # noqa: C901
|
|||||||
),
|
),
|
||||||
] = ImageRefMode.EMBEDDED,
|
] = ImageRefMode.EMBEDDED,
|
||||||
pipeline: Annotated[
|
pipeline: Annotated[
|
||||||
PdfPipeline,
|
ProcessingPipeline,
|
||||||
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
||||||
] = PdfPipeline.STANDARD,
|
] = ProcessingPipeline.STANDARD,
|
||||||
vlm_model: Annotated[
|
vlm_model: Annotated[
|
||||||
VlmModelType,
|
VlmModelType,
|
||||||
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
||||||
] = VlmModelType.SMOLDOCLING,
|
] = VlmModelType.SMOLDOCLING,
|
||||||
|
asr_model: Annotated[
|
||||||
|
AsrModelType,
|
||||||
|
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
|
||||||
|
] = AsrModelType.WHISPER_TINY,
|
||||||
ocr: Annotated[
|
ocr: Annotated[
|
||||||
bool,
|
bool,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -532,7 +542,7 @@ def convert( # noqa: C901
|
|||||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||||
pipeline_options: PaginatedPipelineOptions
|
pipeline_options: PaginatedPipelineOptions
|
||||||
|
|
||||||
if pipeline == PdfPipeline.STANDARD:
|
if pipeline == ProcessingPipeline.STANDARD:
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options = PdfPipelineOptions(
|
||||||
allow_external_plugins=allow_external_plugins,
|
allow_external_plugins=allow_external_plugins,
|
||||||
enable_remote_services=enable_remote_services,
|
enable_remote_services=enable_remote_services,
|
||||||
@ -574,7 +584,7 @@ def convert( # noqa: C901
|
|||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=backend, # pdf_backend
|
backend=backend, # pdf_backend
|
||||||
)
|
)
|
||||||
elif pipeline == PdfPipeline.VLM:
|
elif pipeline == ProcessingPipeline.VLM:
|
||||||
pipeline_options = VlmPipelineOptions(
|
pipeline_options = VlmPipelineOptions(
|
||||||
enable_remote_services=enable_remote_services,
|
enable_remote_services=enable_remote_services,
|
||||||
)
|
)
|
||||||
@ -599,13 +609,28 @@ def convert( # noqa: C901
|
|||||||
pdf_format_option = PdfFormatOption(
|
pdf_format_option = PdfFormatOption(
|
||||||
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
||||||
)
|
)
|
||||||
|
elif pipeline == ProcessingPipeline.ASR:
|
||||||
|
audio_pipeline_options = AsrPipelineOptions(
|
||||||
|
# enable_remote_services=enable_remote_services,
|
||||||
|
)
|
||||||
|
|
||||||
|
audio_format_option = PdfFormatOption(
|
||||||
|
pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
|
||||||
|
)
|
||||||
|
|
||||||
|
"""
|
||||||
|
if asr_model == AsrModelType.WHISPER_TINY:
|
||||||
|
pipeline_options.asr_options = WHISPER_TINY:
|
||||||
|
"""
|
||||||
|
|
||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
|
# audio_pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
format_options: Dict[InputFormat, FormatOption] = {
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
InputFormat.PDF: pdf_format_option,
|
InputFormat.PDF: pdf_format_option,
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
InputFormat.IMAGE: pdf_format_option,
|
||||||
|
InputFormat.AUDIO: audio_format_option,
|
||||||
}
|
}
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
allowed_formats=from_formats,
|
allowed_formats=from_formats,
|
||||||
|
27
docling/datamodel/asr_model_specs.py
Normal file
27
docling/datamodel/asr_model_specs.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import logging
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from pydantic import (
|
||||||
|
AnyUrl,
|
||||||
|
)
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
from docling.datamodel.pipeline_options_asr_model import (
|
||||||
|
# ApiAsrOptions,
|
||||||
|
InferenceFramework,
|
||||||
|
InlineAsrOptions,
|
||||||
|
AsrResponseFormat,
|
||||||
|
TransformersModelType,
|
||||||
|
)
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Preset for the "openai/whisper-tiny" checkpoint, configured to run through
# the Transformers inference framework and to emit Whisper-style responses.
WHISPER_TINY = InlineAsrOptions(
    repo_id="openai/whisper-tiny",
    inference_framework=InferenceFramework.TRANSFORMERS,
    response_format=AsrResponseFormat.WHISPER,
)
|
||||||
|
|
||||||
|
class AsrModelType(str, Enum):
    """Identifiers of the ASR model presets a caller can choose from.

    Members are also plain strings, so they compare equal to their value.
    """

    WHISPER_TINY = "whisper_tiny"
|
@ -49,6 +49,7 @@ class InputFormat(str, Enum):
|
|||||||
XML_USPTO = "xml_uspto"
|
XML_USPTO = "xml_uspto"
|
||||||
XML_JATS = "xml_jats"
|
XML_JATS = "xml_jats"
|
||||||
JSON_DOCLING = "json_docling"
|
JSON_DOCLING = "json_docling"
|
||||||
|
AUDIO = "audio"
|
||||||
|
|
||||||
|
|
||||||
class OutputFormat(str, Enum):
|
class OutputFormat(str, Enum):
|
||||||
@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.XLSX: ["xlsx", "xlsm"],
|
InputFormat.XLSX: ["xlsx", "xlsm"],
|
||||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||||
InputFormat.JSON_DOCLING: ["json"],
|
InputFormat.JSON_DOCLING: ["json"],
|
||||||
|
InputFormat.AUDIO: ["wav", "mp3"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||||
@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
],
|
],
|
||||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||||
InputFormat.JSON_DOCLING: ["application/json"],
|
InputFormat.JSON_DOCLING: ["application/json"],
|
||||||
|
InputFormat.AUDIO: ["audio/wav"],
|
||||||
}
|
}
|
||||||
|
|
||||||
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
||||||
|
@ -13,6 +13,13 @@ from typing_extensions import deprecated
|
|||||||
|
|
||||||
# Import the following for backwards compatibility
|
# Import the following for backwards compatibility
|
||||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
|
from docling.datamodel.asr_model_specs import (
|
||||||
|
WHISPER_TINY as whisper_tiny,
|
||||||
|
AsrModelType,
|
||||||
|
)
|
||||||
|
from docling.datamodel.pipeline_options_asr_model import (
|
||||||
|
InlineAsrOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.pipeline_options_vlm_model import (
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
ApiVlmOptions,
|
ApiVlmOptions,
|
||||||
InferenceFramework,
|
InferenceFramework,
|
||||||
@ -260,6 +267,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AsrPipelineOptions(PipelineOptions):
    """Options for the ASR (speech recognition) pipeline."""

    # Model configuration; defaults to the Whisper "tiny" preset imported above.
    # The single-member Union is equivalent to plain ``InlineAsrOptions``.
    # NOTE(review): presumably kept as a Union so further option types can be
    # added later -- confirm, otherwise simplify the annotation.
    asr_options: Union[InlineAsrOptions] = whisper_tiny

    # Optional folder holding the model artifacts; ``None`` means "not set".
    artifacts_path: Optional[Union[Path, str]] = None
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PaginatedPipelineOptions):
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||||
"""Options for the PDF pipeline."""
|
"""Options for the PDF pipeline."""
|
||||||
|
|
||||||
@ -295,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
generate_parsed_pages: bool = False
|
generate_parsed_pages: bool = False
|
||||||
|
|
||||||
|
|
||||||
class ProcessingPipeline(str, Enum):
    """Selectable processing pipelines.

    Members are also plain strings, so they compare equal to their value.
    """

    STANDARD = "standard"
    VLM = "vlm"
    ASR = "asr"


# Backwards-compatible alias: this enum was previously published as
# ``PdfPipeline``; keep the old name importable for existing callers.
PdfPipeline = ProcessingPipeline
|
||||||
|
50
docling/datamodel/pipeline_options_asr_model.py
Normal file
50
docling/datamodel/pipeline_options_asr_model.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, List, Literal, Optional, Union
|
||||||
|
|
||||||
|
from pydantic import AnyUrl, BaseModel
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import InferenceFramework, TransformersModelType
|
||||||
|
|
||||||
|
class BaseAsrOptions(BaseModel):
    """Common base for ASR model option sets.

    ``kind`` is a string tag naming the concrete option type; subclasses such
    as ``InlineAsrOptions`` pin it to a fixed literal.
    """

    kind: str
    # prompt: str  # disabled in the original scaffold; no prompt field yet
|
||||||
|
|
||||||
|
|
||||||
|
class AsrResponseFormat(str, Enum):
    """Output formats an ASR model can be declared to produce.

    Members are also plain strings, so they compare equal to their value.
    """

    WHISPER = "whisper"
|
||||||
|
|
||||||
|
|
||||||
|
class InlineAsrOptions(BaseAsrOptions):
    """Options describing an ASR model identified by a repository id.

    ``repo_id``, ``inference_framework`` and ``response_format`` have no
    default and must be supplied; every other field is optional.

    NOTE(review): several settings (8-bit loading, ``llm_int8_threshold``,
    ``stop_strings``, ``use_kv_cache``, ``max_new_tokens``) look carried over
    from the VLM options this module imports its enums from -- confirm that
    each one is actually meaningful for ASR models.
    """

    # Fixed tag identifying this option type.
    kind: Literal["inline_model_options"] = "inline_model_options"

    # Model identity and loading behaviour.
    repo_id: str
    trust_remote_code: bool = False
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    # How the model is run and what kind of response it yields.
    inference_framework: InferenceFramework
    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
    response_format: AsrResponseFormat

    torch_dtype: Optional[str] = None
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    # Generation settings.
    # NOTE(review): the list/dict defaults rely on pydantic copying mutable
    # defaults per instance -- verify against the pydantic version in use.
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}

    use_kv_cache: bool = True
    max_new_tokens: int = 4096

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
62
docling/pipeline/asr_pipeline.py
Normal file
62
docling/pipeline/asr_pipeline.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
|
|
||||||
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AsrPipelineOptions,
|
||||||
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
InferenceFramework,
|
||||||
|
)
|
||||||
|
from docling.datamodel.pipeline_options_asr_model import (
|
||||||
|
InlineAsrOptions,
|
||||||
|
AsrResponseFormat,
|
||||||
|
)
|
||||||
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.pipeline.base_pipeline import BasePipeline
|
||||||
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AsrPipeline(BasePipeline):
    """Scaffold of the ASR pipeline.

    Only the artifacts-path validation is implemented so far;
    ``_build_document`` returns the conversion result unchanged.
    """

    def __init__(self, pipeline_options: AsrPipelineOptions):
        """Initialise the pipeline and validate the artifacts folder.

        Args:
            pipeline_options: Options for this pipeline. Its
                ``artifacts_path`` takes precedence over
                ``settings.artifacts_path``.

        Raises:
            RuntimeError: If an artifacts path is configured but does not
                point to an existing directory.
        """
        super().__init__(pipeline_options)
        self.keep_backend = True

        # Annotation only: narrows the attribute type for type checkers.
        self.pipeline_options: AsrPipelineOptions

        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Return ``conv_res`` unchanged; the build step is only timed.

        Args:
            conv_res: The conversion result being processed.

        Returns:
            The same ``conv_res`` object that was passed in.
        """
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            # Placeholder: routed through the module logger instead of
            # printing to stdout from library code.
            _log.debug("ASR pipeline scaffolding: nothing is transcribed yet")

        return conv_res

    # TODO: implement `_determine_status(self, conv_res) -> ConversionStatus`.
    # A draft was left disabled here; it built `ConversionStatus()` without a
    # value, and `ConversionStatus` is not imported in this module.
    # NOTE(review): if BasePipeline declares `_determine_status` abstract,
    # this class cannot be instantiated until it is implemented -- confirm.

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Report every backend as supported (always returns ``True``)."""
        return True
|
Loading…
Reference in New Issue
Block a user