Source code for fmus_vox.voice.cloner

"""
Voice cloning functionality for fmus-vox.

This module provides the VoiceCloner class which is the base for all
voice cloning implementations.
"""

import os
import uuid
from typing import Any, Dict, List, Optional, Union
from pathlib import Path

from fmus_vox.core.audio import Audio
from fmus_vox.core.config import get_config
from fmus_vox.core.errors import VoiceError, ModelError
from fmus_vox.core.utils import get_logger, LazyLoader, timed

class Voice:
    """
    Plain record describing one voice known to a cloner.

    Bundles the identifier, a display name, the optional reference audio,
    optional model-specific embeddings, and free-form metadata.
    """

    def __init__(
        self,
        voice_id: str,
        name: Optional[str] = None,
        reference_audio: Optional[Audio] = None,
        embeddings: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Build a voice record.

        Args:
            voice_id: Unique identifier for the voice
            name: Display name; a falsy value (None or "") falls back to
                ``voice_id``
            reference_audio: Reference audio for the voice
            embeddings: Voice embeddings or model-specific data
            metadata: Additional voice metadata; a falsy value (None or an
                empty mapping) is replaced by a new empty dict
        """
        self.voice_id = voice_id
        self.name = name if name else voice_id
        self.reference_audio = reference_audio
        self.embeddings = embeddings
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict[str, Any]:
        """Return the id, name and metadata as a dict (audio and embeddings are left out)."""
        # The metadata object is passed through as-is, not copied.
        return dict(
            voice_id=self.voice_id,
            name=self.name,
            metadata=self.metadata,
        )

class VoiceCloner:
    """
    Base class for voice cloning.

    This class provides the common interface for all voice cloning models
    and handles voice management, cloning, and synthesis.

    Instantiating ``VoiceCloner`` directly acts as a factory: ``__new__``
    selects the concrete implementation from the model name. Subclasses
    must implement ``_load_encoder``, ``_load_synthesizer``,
    ``_process_reference`` and ``_synthesize_with_voice``.

    Args:
        model: Name of the model to use (yourtts, etc.)
        device: Computation device (cpu, cuda, auto)
        **kwargs: Additional model-specific parameters
    """

    # Registry of available model implementations
    _model_registry: Dict[str, type] = {}

    # Sample rate (Hz) that reference audio is resampled to before it is
    # handed to ``_process_reference``. Subclasses may override it.
    REFERENCE_SAMPLE_RATE = 16000

    @classmethod
    def register_model(cls, name: str, implementation: type) -> None:
        """
        Register a model implementation.

        Args:
            name: Model name
            implementation: Model implementation class
        """
        cls._model_registry[name] = implementation

    def __new__(cls, model: str = "yourtts", **kwargs) -> "VoiceCloner":
        """
        Create a new VoiceCloner instance of the appropriate subclass.

        Args:
            model: Name of the model to use
            **kwargs: Additional model-specific parameters

        Returns:
            VoiceCloner instance

        Raises:
            ModelError: If the model is not supported
        """
        if cls is not VoiceCloner:
            # If called from a subclass, use normal instantiation
            return super().__new__(cls)

        # Determine which implementation to use based on model name
        if model.startswith("yourtts"):
            from fmus_vox.voice.yourtts import YourTTSCloner
            implementation = YourTTSCloner
        elif model.startswith("sv2tts"):
            from fmus_vox.voice.sv2tts import SV2TTSCloner
            implementation = SV2TTSCloner
        elif model in cls._model_registry:
            implementation = cls._model_registry[model]
        else:
            raise ModelError(f"Unsupported voice cloning model: {model}")

        if isinstance(implementation, type) and issubclass(implementation, VoiceCloner):
            # Only allocate here. Because the result is an instance of
            # VoiceCloner, Python runs its __init__ with the caller's
            # arguments right after __new__ returns; constructing it fully
            # here would initialise the object twice.
            return implementation.__new__(implementation, model=model, **kwargs)

        # Registered implementations that do not derive from VoiceCloner are
        # not initialised automatically, so construct them completely.
        return implementation(model=model, **kwargs)

    def __init__(self, model: str = "yourtts", device: Optional[str] = None, **kwargs):
        """
        Initialize the voice cloner.

        Args:
            model: Name of the model to use
            device: Computation device (cpu, cuda, auto)
            **kwargs: Additional model-specific parameters
        """
        self.logger = get_logger(f"{__name__}.{self.__class__.__name__}")
        self.config = get_config()

        self.model_name = model
        self.device = device or self.config.get_device()
        self.model_params = kwargs

        # Dictionary of registered voices
        self.voices = {}

        # Lazy-loaded models
        self._encoder = LazyLoader(self._load_encoder)
        self._synthesizer = LazyLoader(self._load_synthesizer)

        self.logger.debug(f"Initialized {self.__class__.__name__} with model={model}")

    def _load_encoder(self) -> Any:
        """
        Load the voice encoder model.

        Returns:
            Loaded encoder model

        Raises:
            ModelError: If model loading fails
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _load_synthesizer(self) -> Any:
        """
        Load the speech synthesizer model.

        Returns:
            Loaded synthesizer model

        Raises:
            ModelError: If model loading fails
        """
        raise NotImplementedError("Subclasses must implement this method")

    @timed
    def add_reference(self, audio: Union[str, Path, Audio], name: Optional[str] = None) -> str:
        """
        Add a reference voice from audio.

        Args:
            audio: Reference audio (file path as str or Path, or Audio object)
            name: Display name for the voice. If None and a path was given,
                the file name without its extension is used; if None and an
                Audio object was given, the generated voice ID is used.

        Returns:
            Voice ID that can be used for synthesis

        Raises:
            VoiceError: If reference processing fails
        """
        try:
            # Load audio if path is provided
            if isinstance(audio, (str, Path)):
                # Take the default name from the path *before* rebinding
                # ``audio`` to the loaded object; an Audio instance cannot
                # be turned into a Path.
                if name is None:
                    name = Path(audio).stem
                # os.fspath leaves a str untouched and converts a Path to str.
                audio = Audio.load(os.fspath(audio))

            # Resample if needed
            if audio.sample_rate != self.REFERENCE_SAMPLE_RATE:
                audio = audio.resample(target_sr=self.REFERENCE_SAMPLE_RATE)

            # Generate voice ID
            voice_id = str(uuid.uuid4())

            # Process for voice embedding (to be implemented by subclasses)
            embeddings = self._process_reference(audio)

            # Create voice object
            voice = Voice(
                voice_id=voice_id,
                name=name or voice_id,
                reference_audio=audio,
                embeddings=embeddings
            )

            # Store in dictionary
            self.voices[voice_id] = voice

            return voice_id

        except Exception as e:
            # Keep the original exception attached as the explicit cause.
            raise VoiceError(f"Failed to add reference voice: {str(e)}") from e

    def _process_reference(self, audio: Audio) -> Any:
        """
        Process reference audio to extract voice characteristics.

        Args:
            audio: Reference audio

        Returns:
            Processed voice embeddings or model-specific data

        Raises:
            VoiceError: If processing fails
        """
        raise NotImplementedError("Subclasses must implement this method")

    @timed
    def synthesize(self, text: str, voice_id: str) -> Audio:
        """
        Synthesize text with a cloned voice.

        Args:
            text: Text to synthesize
            voice_id: ID of the voice to use

        Returns:
            Audio object with synthesized speech

        Raises:
            VoiceError: If synthesis fails or voice ID is invalid
        """
        try:
            # Check if voice exists
            if voice_id not in self.voices:
                raise VoiceError(f"Voice ID not found: {voice_id}")

            # Get voice
            voice = self.voices[voice_id]

            # Synthesize speech (to be implemented by subclasses)
            return self._synthesize_with_voice(text, voice)

        except Exception as e:
            # Every failure, including the unknown-ID error raised above, is
            # reported as a VoiceError with the original kept as the cause.
            raise VoiceError(f"Failed to synthesize speech: {str(e)}") from e

    def _synthesize_with_voice(self, text: str, voice: Voice) -> Audio:
        """
        Synthesize text with a specific voice.

        Args:
            text: Text to synthesize
            voice: Voice to use

        Returns:
            Audio object with synthesized speech

        Raises:
            VoiceError: If synthesis fails
        """
        raise NotImplementedError("Subclasses must implement this method")

    def get_voice(self, voice_id: str) -> Voice:
        """
        Get a voice by ID.

        Args:
            voice_id: ID of the voice to get

        Returns:
            Voice object

        Raises:
            VoiceError: If voice ID is invalid
        """
        try:
            return self.voices[voice_id]
        except KeyError:
            raise VoiceError(f"Voice ID not found: {voice_id}") from None

    def list_voices(self) -> List[Dict[str, Any]]:
        """
        List all registered voices.

        Returns:
            List of voice dictionaries with id, name, and metadata
        """
        return [voice.to_dict() for voice in self.voices.values()]

    def remove_voice(self, voice_id: str) -> None:
        """
        Remove a voice.

        Args:
            voice_id: ID of the voice to remove

        Raises:
            VoiceError: If voice ID is invalid
        """
        try:
            del self.voices[voice_id]
        except KeyError:
            raise VoiceError(f"Voice ID not found: {voice_id}") from None