# Source code for fmus_vox.tts.speaker

"""
Base Speaker class for text-to-speech functionality.

This module provides the Speaker class which is the common interface for
all text-to-speech models in fmus-vox.
"""

import os
from typing import Any, Dict, List, Optional, Union, Generator
from pathlib import Path
import asyncio

from fmus_vox.core.audio import Audio
from fmus_vox.core.config import get_config
from fmus_vox.core.errors import SynthesisError, ModelError
from fmus_vox.core.utils import get_logger, LazyLoader, timed

class SpeechResult:
    """
    Result of one synthesis call: the audio together with its context.

    Exposes three plain attributes: ``audio``, ``voice_id`` and ``metadata``.
    """

    def __init__(self,
                audio: Audio,
                voice_id: str,
                metadata: Optional[Dict[str, Any]] = None):
        """
        Build a result object.

        Args:
            audio: The synthesized audio.
            voice_id: Identifier of the voice that produced the audio.
            metadata: Extra information about the synthesis. Any falsy value
                (``None`` or an empty mapping) is replaced by a new empty dict.
        """
        extra = metadata if metadata else {}
        self.audio, self.voice_id, self.metadata = audio, voice_id, extra

    def save(self, path: Union[str, Path]) -> str:
        """Write the audio to ``path`` via ``audio.save`` and return its result."""
        saved = self.audio.save(path)
        return saved

    def play(self) -> None:
        """Play the audio by delegating to ``audio.play``."""
        self.audio.play()

class Speaker:
    """
    Base class for text-to-speech synthesis.

    This class provides the common interface for all TTS models and handles
    model loading, voice selection, and synthesis.

    Calling ``Speaker(...)`` directly acts as a factory: ``__new__`` selects a
    concrete implementation from the model name (built-in name prefixes are
    checked first, then the registry filled by :meth:`register_model`).

    Args:
        model: Name of the model to use (vits, coqui, etc.)
        voice: Voice ID or name to use
        device: Computation device (cpu, cuda, auto)
        **kwargs: Additional model-specific parameters

    Note:
        ``__init__`` and the ``set_*`` methods store ``speed``, ``pitch`` and
        ``style`` as instance attributes. On such an instance those attributes
        shadow the alias methods of the same name, so prefer ``set_speed``,
        ``set_pitch`` and ``set_style`` for fluent configuration.
    """

    # Registry of available model implementations, keyed by model name.
    # The dict lives on Speaker, so subclasses that do not redefine it share it.
    _model_registry: Dict[str, type] = {}

    @classmethod
    def register_model(cls, name: str, implementation: type) -> None:
        """
        Register a model implementation.

        Args:
            name: Model name
            implementation: Model implementation class
        """
        cls._model_registry[name] = implementation

    @classmethod
    def _resolve_implementation(cls, model: str) -> type:
        """
        Map a model name to the class that implements it.

        Args:
            model: Name of the model to use

        Returns:
            Implementation class for the model

        Raises:
            ModelError: If the model is not supported
        """
        # Backend modules are imported inside each branch, so only the module
        # of the selected backend is imported here.
        if model.startswith("vits"):
            from fmus_vox.tts.vits import VitsSpeaker
            return VitsSpeaker
        if model.startswith("coqui"):
            from fmus_vox.tts.coqui import CoquiSpeaker
            return CoquiSpeaker
        if model.startswith("fastspeech"):
            from fmus_vox.tts.fastspeech import FastSpeechSpeaker
            return FastSpeechSpeaker
        if model.startswith(("elevenlabs", "eleven")):
            from fmus_vox.tts.elevenlabs import ElevenLabsSpeaker
            return ElevenLabsSpeaker
        # Registered implementations are matched by exact name, after the
        # built-in prefixes.
        if model in cls._model_registry:
            return cls._model_registry[model]
        raise ModelError(f"Unsupported TTS model: {model}")

    def __new__(cls, model: str = "vits", **kwargs) -> "Speaker":
        """
        Create a new Speaker instance of the appropriate subclass.

        Args:
            model: Name of the model to use
            **kwargs: Additional model-specific parameters

        Returns:
            Speaker instance

        Raises:
            ModelError: If the model is not supported
        """
        if cls is not Speaker:
            # If called from a subclass, use normal instantiation
            return super().__new__(cls)

        implementation = cls._resolve_implementation(model)
        if isinstance(implementation, type) and issubclass(implementation, Speaker):
            # Only allocate here. Because the returned object is an instance
            # of Speaker, Python's instantiation protocol calls its __init__
            # with the caller's original arguments right after __new__
            # returns; constructing it fully here would run __init__ twice.
            return implementation.__new__(implementation, model=model, **kwargs)
        # Implementations that are not Speaker subclasses are not initialized
        # automatically afterwards, so construct them completely here.
        return implementation(model=model, **kwargs)

    def __init__(self,
                 model: str = "vits",
                 voice: str = "default",
                 device: Optional[str] = None,
                 **kwargs):
        """
        Initialize the speaker.

        Args:
            model: Name of the model to use
            voice: Voice ID or name to use
            device: Computation device (cpu, cuda, auto); when falsy, the
                value of ``get_config().get_device()`` is used
            **kwargs: Additional model-specific parameters; ``speed``,
                ``pitch`` and ``style`` are also read from here
        """
        self.logger = get_logger(f"{__name__}.{self.__class__.__name__}")
        self.config = get_config()

        self.model_name = model
        self.voice_id = voice
        self.device = device or self.config.get_device()
        self.model_params = kwargs

        # Voice settings (these instance attributes shadow the speed/pitch/
        # style alias methods on this instance)
        self.speed = kwargs.get("speed", 1.0)
        self.pitch = kwargs.get("pitch", 0.0)
        self.style = kwargs.get("style", "neutral")

        # Lazy-loaded model
        self._model = LazyLoader(self._load_model)

        self.logger.debug(f"Initialized {self.__class__.__name__} with model={model}, voice={voice}")

    def _load_model(self) -> Any:
        """
        Load the TTS model.

        Returns:
            Loaded model

        Raises:
            ModelError: If model loading fails
        """
        raise NotImplementedError("Subclasses must implement this method")

    @timed
    def speak(self, text: str) -> Audio:
        """
        Synthesize speech from text.

        Args:
            text: Text to synthesize

        Returns:
            Audio object with synthesized speech

        Raises:
            SynthesisError: If synthesis fails
        """
        result = self.speak_with_metadata(text)
        return result.audio

    @timed
    def speak_with_metadata(self, text: str) -> SpeechResult:
        """
        Synthesize speech from text with additional metadata.

        Args:
            text: Text to synthesize

        Returns:
            SpeechResult object

        Raises:
            SynthesisError: If synthesis fails
        """
        raise NotImplementedError("Subclasses must implement this method")

    async def speak_async(self, text: str) -> Audio:
        """
        Synthesize speech from text asynchronously.

        Args:
            text: Text to synthesize

        Returns:
            Audio object with synthesized speech

        Raises:
            SynthesisError: If synthesis fails
        """
        result = await self.speak_with_metadata_async(text)
        return result.audio

    async def speak_with_metadata_async(self, text: str) -> SpeechResult:
        """
        Synthesize speech from text asynchronously with additional metadata.

        Args:
            text: Text to synthesize

        Returns:
            SpeechResult object

        Raises:
            SynthesisError: If synthesis fails
        """
        # Default implementation runs the synchronous version in the loop's
        # default executor. This code only runs inside a coroutine, so the
        # running loop is the one to use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            self.speak_with_metadata,
            text
        )

    def stream(self, text_generator: Generator[str, None, None]) -> Generator[Audio, None, None]:
        """
        Stream synthesis for incoming text chunks.

        Args:
            text_generator: Generator yielding text chunks

        Yields:
            Audio object for each synthesized chunk

        Raises:
            SynthesisError: If synthesis fails
        """
        for text in text_generator:
            # Skip None, empty and whitespace-only chunks
            if text and text.strip():
                yield self.speak(text)

    # Fluent interface for setting voice properties

    def set_voice(self, voice_id: str) -> "Speaker":
        """Set the voice to use."""
        self.voice_id = voice_id
        return self

    def set_speed(self, speed: float) -> "Speaker":
        """Set the speaking speed (1.0 is normal)."""
        self.speed = speed
        return self

    def set_pitch(self, pitch: float) -> "Speaker":
        """Set the voice pitch in semitones (0.0 is normal)."""
        self.pitch = pitch
        return self

    def set_style(self, style: str) -> "Speaker":
        """Set the speaking style (e.g., 'neutral', 'happy', 'sad')."""
        self.style = style
        return self

    # Alternative names for fluent interface
    # NOTE: speed/pitch/style below are hidden on any instance that has the
    # same-named instance attribute set (by __init__ or the set_* methods).

    def voice(self, voice_id: str) -> "Speaker":
        """Set the voice to use (alias for set_voice)."""
        return self.set_voice(voice_id)

    def speed(self, speed: float) -> "Speaker":
        """Set the speaking speed (alias for set_speed)."""
        return self.set_speed(speed)

    def pitch(self, pitch: float) -> "Speaker":
        """Set the voice pitch (alias for set_pitch)."""
        return self.set_pitch(pitch)

    def style(self, style: str) -> "Speaker":
        """Set the speaking style (alias for set_style)."""
        return self.set_style(style)

    def get_available_voices(self) -> List[Dict[str, Any]]:
        """
        Get list of available voices.

        Returns:
            List of voice dictionaries with id, name, and language
        """
        raise NotImplementedError("Subclasses must implement this method")