need to fix ruff linter

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2025-05-12 07:34:24 +02:00
parent 32ad65cb9f
commit 76501331d2
8 changed files with 45 additions and 42 deletions

View File

@ -8,11 +8,11 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
class WavDocumentBackend(AbstractDocumentBackend):
class WavDocumentBackend(AbstractDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -29,4 +29,3 @@ class WavDocumentBackend(AbstractDocumentBackend):
@classmethod @classmethod
def supported_formats(cls) -> set[InputFormat]: def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.WAV} return {InputFormat.WAV}

View File

@ -577,11 +577,10 @@ def convert( # noqa: C901
asr_format_option = AsrFormatOption( asr_format_option = AsrFormatOption(
pipeline_cls=AsrPipeline, pipeline_options=pipeline_options pipeline_cls=AsrPipeline, pipeline_options=pipeline_options
) )
else: else:
_log.error(f"Did not find the correct pipeline: {pipeline}") _log.error(f"Did not find the correct pipeline: {pipeline}")
if artifacts_path is not None: if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path pipeline_options.artifacts_path = artifacts_path

View File

@ -34,7 +34,7 @@ class ConversionStatus(str, Enum):
class InputFormat(str, Enum): class InputFormat(str, Enum):
"""A document format supported by document backend parsers.""" """A document format supported by document backend parsers."""
# Documents # Documents
DOCX = "docx" DOCX = "docx"
PPTX = "pptx" PPTX = "pptx"
HTML = "html" HTML = "html"
@ -51,6 +51,7 @@ class InputFormat(str, Enum):
# Audio # Audio
WAV = "wav" WAV = "wav"
class OutputFormat(str, Enum): class OutputFormat(str, Enum):
MARKDOWN = "md" MARKDOWN = "md"
JSON = "json" JSON = "json"
@ -105,8 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
], ],
InputFormat.XML_USPTO: ["application/xml", "text/plain"], InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"], InputFormat.JSON_DOCLING: ["application/json"],
# Audio
# Audio
InputFormat.WAV: ["audio/wav", "audio/x-wav"], InputFormat.WAV: ["audio/wav", "audio/x-wav"],
} }
@ -165,8 +165,9 @@ class LayoutPrediction(BaseModel):
class VlmPrediction(BaseModel): class VlmPrediction(BaseModel):
text: str = "" text: str = ""
class AsrPrediction(BaseModel): class AsrPrediction(BaseModel):
text: str = "" text: str = ""
class ContainerElement( class ContainerElement(

View File

@ -279,7 +279,7 @@ class _DocumentConversionInput(BaseModel):
if isinstance(obj, Path): if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj)) mime = filetype.guess_mime(str(obj))
print(mime) print(mime)
if mime is None: if mime is None:
ext = obj.suffix[1:] ext = obj.suffix[1:]
mime = _DocumentConversionInput._mime_from_extension(ext) mime = _DocumentConversionInput._mime_from_extension(ext)
@ -292,8 +292,8 @@ class _DocumentConversionInput(BaseModel):
elif obj.suffixes[-1].lower() == ".docx": elif obj.suffixes[-1].lower() == ".docx":
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
elif obj.suffixes[-1].lower() == ".pptx": elif obj.suffixes[-1].lower() == ".pptx":
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation" mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
elif isinstance(obj, DocumentStream): elif isinstance(obj, DocumentStream):
content = obj.stream.read(8192) content = obj.stream.read(8192)
obj.stream.seek(0) obj.stream.seek(0)
@ -313,11 +313,11 @@ class _DocumentConversionInput(BaseModel):
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
elif objname.endswith(".pptx"): elif objname.endswith(".pptx"):
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation" mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
mime = mime or _DocumentConversionInput._detect_html_xhtml(content) mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
mime = mime or _DocumentConversionInput._detect_csv(content) mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain" mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, []) formats = MimeTypeToFormat.get(mime, [])
if formats: if formats:
if len(formats) == 1 and mime not in ("text/plain"): if len(formats) == 1 and mime not in ("text/plain"):
@ -367,7 +367,7 @@ class _DocumentConversionInput(BaseModel):
@staticmethod @staticmethod
def _mime_from_extension(ext): def _mime_from_extension(ext):
print("ext: ", ext) print("ext: ", ext)
mime = None mime = None
if ext in FormatToExtensions[InputFormat.ASCIIDOC]: if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
mime = FormatToMimeType[InputFormat.ASCIIDOC][0] mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@ -382,7 +382,7 @@ class _DocumentConversionInput(BaseModel):
elif ext in FormatToExtensions[InputFormat.PDF]: elif ext in FormatToExtensions[InputFormat.PDF]:
mime = FormatToMimeType[InputFormat.PDF][0] mime = FormatToMimeType[InputFormat.PDF][0]
elif ext in FormatToExtensions[InputFormat.WAV]: elif ext in FormatToExtensions[InputFormat.WAV]:
mime = FormatToMimeType[InputFormat.WAV][0] mime = FormatToMimeType[InputFormat.WAV][0]
return mime return mime
@staticmethod @staticmethod

View File

@ -257,10 +257,12 @@ class BaseVlmOptions(BaseModel):
kind: str kind: str
prompt: str prompt: str
class BaseAsrOptions(BaseModel): class BaseAsrOptions(BaseModel):
kind: str kind: str
prompt: str prompt: str
class ResponseFormat(str, Enum): class ResponseFormat(str, Enum):
DOCTAGS = "doctags" DOCTAGS = "doctags"
MARKDOWN = "markdown" MARKDOWN = "markdown"
@ -274,6 +276,7 @@ class InferenceFramework(str, Enum):
# Audio # Audio
ASR_NEMO = "asr_nemo" ASR_NEMO = "asr_nemo"
class HuggingFaceVlmOptions(BaseVlmOptions): class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options" kind: Literal["hf_model_options"] = "hf_model_options"
@ -289,6 +292,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
def repo_cache_folder(self) -> str: def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--") return self.repo_id.replace("/", "--")
class HuggingFaceAsrOptions(BaseVlmOptions): class HuggingFaceAsrOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options" kind: Literal["hf_model_options"] = "hf_model_options"
@ -304,6 +308,7 @@ class HuggingFaceAsrOptions(BaseVlmOptions):
def repo_cache_folder(self) -> str: def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--") return self.repo_id.replace("/", "--")
class ApiVlmOptions(BaseVlmOptions): class ApiVlmOptions(BaseVlmOptions):
kind: Literal["api_model_options"] = "api_model_options" kind: Literal["api_model_options"] = "api_model_options"
@ -415,11 +420,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
smoldocling_vlm_conversion_options smoldocling_vlm_conversion_options
) )
class AsrPipelineOptions(PaginatedPipelineOptions): class AsrPipelineOptions(PaginatedPipelineOptions):
asr_options: Union[HuggingFaceAsrOptions] = ( asr_options: Union[HuggingFaceAsrOptions] = asr_nemo_conversion_options
asr_nemo_conversion_options
)
class PdfPipelineOptions(PaginatedPipelineOptions): class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline.""" """Options for the PDF pipeline."""

View File

@ -19,9 +19,9 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.wav_backend import WavDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.backend.wav_backend import WavDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
DoclingComponentType, DoclingComponentType,
@ -34,7 +34,7 @@ from docling.datamodel.document import (
InputDocument, InputDocument,
_DocumentConversionInput, _DocumentConversionInput,
) )
from docling.datamodel.pipeline_options import PipelineOptions, AsrPipelineOptions from docling.datamodel.pipeline_options import AsrPipelineOptions, PipelineOptions
from docling.datamodel.settings import ( from docling.datamodel.settings import (
DEFAULT_PAGE_RANGE, DEFAULT_PAGE_RANGE,
DocumentLimits, DocumentLimits,
@ -42,10 +42,10 @@ from docling.datamodel.settings import (
settings, settings,
) )
from docling.exceptions import ConversionError from docling.exceptions import ConversionError
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.utils.utils import chunkify from docling.utils.utils import chunkify
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -119,9 +119,11 @@ class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
class AsrFormatOption(FormatOption): class AsrFormatOption(FormatOption):
pipeline_cls: Type = AsrPipeline pipeline_cls: Type = AsrPipeline
def _get_default_option(format: InputFormat) -> FormatOption: def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = { format_to_default_options = {
InputFormat.CSV: FormatOption( InputFormat.CSV: FormatOption(
@ -300,7 +302,7 @@ class DocumentConverter:
fopt = self.format_to_options.get(doc_format) fopt = self.format_to_options.get(doc_format)
print(self.format_to_options) print(self.format_to_options)
if fopt is None or fopt.pipeline_options is None: if fopt is None or fopt.pipeline_options is None:
_log.warning(f"fopt ({fopt}) or its options are None for {doc_format}") _log.warning(f"fopt ({fopt}) or its options are None for {doc_format}")
return None return None

View File

@ -10,13 +10,13 @@ from docling.datamodel.pipeline_options import (
AcceleratorOptions, AcceleratorOptions,
HuggingFaceAsrOptions, HuggingFaceAsrOptions,
) )
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class AsrNemoModel(BasePageModel): class AsrNemoModel(BasePageModel):
def __init__( def __init__(
self, self,
@ -26,7 +26,7 @@ class AsrNemoModel(BasePageModel):
asr_options: HuggingFaceAsrOptions, asr_options: HuggingFaceAsrOptions,
): ):
self.enabled = enabled self.enabled = enabled
self.asr_options = asr_options self.asr_options = asr_options
if self.enabled: if self.enabled:
@ -45,7 +45,6 @@ class AsrNemoModel(BasePageModel):
elif (artifacts_path / repo_cache_folder).exists(): elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder artifacts_path = artifacts_path / repo_cache_folder
self.model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2") self.model = nemo_asr.models.ASRModel.from_pretrained(
"nvidia/parakeet-tdt-0.6b-v2"
)

View File

@ -9,18 +9,16 @@ from docling.backend.abstract_backend import (
) )
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
HuggingFaceAsrOptions, HuggingFaceAsrOptions,
InferenceFramework, InferenceFramework,
PipelineOptions,
ResponseFormat, ResponseFormat,
AsrPipelineOptions,
) )
from docling.models.hf_asr_models.asr_nemo import AsrNemoModel from docling.models.hf_asr_models.asr_nemo import AsrNemoModel
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -44,7 +42,7 @@ class AsrPipeline(BasePipeline):
"When defined, it must point to a folder containing all models required by the pipeline." "When defined, it must point to a folder containing all models required by the pipeline."
) )
if isinstance(self.pipeline_options.asr_options, HuggingFaceAsrOptions): if isinstance(self.pipeline_options.asr_options, HuggingFaceAsrOptions):
asr_options = cast(HuggingFaceAsrOptions, self.pipeline_options.asr_options) asr_options = cast(HuggingFaceAsrOptions, self.pipeline_options.asr_options)
if asr_options.inference_framework == InferenceFramework.ASR_NEMO: if asr_options.inference_framework == InferenceFramework.ASR_NEMO:
self.build_pipe = [ self.build_pipe = [
@ -59,10 +57,10 @@ class AsrPipeline(BasePipeline):
_log.error(f"{asr_options.inference_framework} is not supported") _log.error(f"{asr_options.inference_framework} is not supported")
else: else:
_log.error(f"ASR is not supported") _log.error("ASR is not supported")
def _build_document(self, conv_res: ConversionResult) -> ConversionResult: def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
pass pass
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
return conv_res return conv_res
@ -79,4 +77,4 @@ class AsrPipeline(BasePipeline):
@classmethod @classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend): def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass pass