"""
Media to WAV Converter Module

Converts various media formats (m4a, mp3, mp4, etc.) to standardized WAV files
and PyTorch tensors for audio transcription pipelines.

Standardization:
- 16kHz sample rate
- Mono channel (merged if multi-channel)
- Layer normalized
- bfloat16 dtype tensor
- Fail-fast error handling
"""

import os
import tempfile
from pathlib import Path
from typing import Tuple, Union, Optional

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from pydub.utils import which

# Constants
TARGET_SAMPLE_RATE = 16000
TARGET_DTYPE = torch.bfloat16

# Extension (lower-case, with leading dot) -> format name handed to pydub.
_SUPPORTED_FORMATS = {
    '.wav': 'wav',
    '.mp3': 'mp3',
    '.m4a': 'm4a',
    '.aac': 'aac',
    '.flac': 'flac',
    '.ogg': 'ogg',
    '.wma': 'wma',
    '.mp4': 'mp4',
    '.avi': 'avi',
    '.mov': 'mov',
    '.mkv': 'mkv',
}


def verify_ffmpeg_installation():
    """Verify FFmpeg is available for pydub operations.

    Raises:
        RuntimeError: If no ``ffmpeg`` executable can be located.
    """
    if not which("ffmpeg"):
        raise RuntimeError(
            "FFmpeg not found. Please install FFmpeg for media format support. "
            "On Ubuntu: sudo apt install ffmpeg"
        )


def layer_norm(tensor: torch.Tensor, shape: Optional[torch.Size] = None) -> torch.Tensor:
    """Normalize a tensor to zero mean and unit standard deviation.

    The statistics are computed over every element of ``tensor``.

    Args:
        tensor: Audio samples to normalize.
        shape: Unused; kept so existing call sites keep working.

    Returns:
        ``(tensor - mean) / std``. When the standard deviation is zero or
        undefined (constant or single-sample input) only the mean is
        removed. An empty tensor is returned unchanged.
    """
    sample_count = tensor.numel()
    if sample_count == 0:
        # Nothing to normalize; reducing an empty tensor would yield NaN.
        return tensor

    mean = tensor.mean()
    if sample_count == 1:
        # torch.std uses the unbiased estimator by default, which is NaN for
        # a single sample and would turn the whole output into NaN.
        return tensor - mean

    std = tensor.std()
    if std == 0:
        return tensor - mean
    return (tensor - mean) / std


def detect_media_format(file_path: str) -> str:
    """Detect media format from file extension.

    Args:
        file_path: Path to the media file; only its suffix is inspected.

    Returns:
        The format name for a known extension. Unknown extensions are passed
        through without the leading dot (an empty string when the path has
        no extension) so downstream processing can report detailed errors.
    """
    extension = Path(file_path).suffix.lower()
    fallback = extension[1:] if extension.startswith('.') else extension
    return _SUPPORTED_FORMATS.get(extension, fallback)


def convert_to_wav_with_pydub(input_path: str, output_path: str, format_hint: Optional[str] = None):
    """Convert media file to WAV using pydub (FFmpeg backend).

    Args:
        input_path: Path of the media file to decode.
        output_path: Path the WAV export is written to.
        format_hint: Optional format name passed to pydub. When decoding
            with the hint fails, decoding is retried with auto-detection.

    Raises:
        RuntimeError: If FFmpeg is not available.
        CouldntDecodeError: If pydub cannot decode the file even with
            auto-detection.
    """
    verify_ffmpeg_installation()

    if format_hint:
        try:
            audio = AudioSegment.from_file(input_path, format=format_hint)
        except CouldntDecodeError:
            # The hint comes from the file extension, which may not be a
            # format name the decoder accepts (or may simply be wrong for
            # this file). Give auto-detection a chance before failing.
            audio = AudioSegment.from_file(input_path)
    else:
        # Let pydub auto-detect
        audio = AudioSegment.from_file(input_path)

    # pydub only handles the container/codec conversion here; resampling and
    # channel handling are done afterwards with librosa.
    audio.export(output_path, format="wav")


def process_wav_to_standard_format(wav_path: str) -> Tuple[np.ndarray, int]:
    """Process WAV file to standard format using librosa.

    Args:
        wav_path: Path of the WAV file to load.

    Returns:
        Tuple of (float32 sample array, ``TARGET_SAMPLE_RATE``).
    """
    # Load at the file's own sample rate first, then resample explicitly.
    data, fs = librosa.load(wav_path, sr=None)

    if fs != TARGET_SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=fs, target_sr=TARGET_SAMPLE_RATE)

    # librosa.load downmixes to mono by default, so this branch is purely
    # defensive: average across the leading (channel) axis if a
    # multi-dimensional array ever shows up.
    if data.ndim > 1:
        data = np.mean(data, axis=0)

    data = np.asarray(data, dtype=np.float32)
    return data, TARGET_SAMPLE_RATE


def create_normalized_tensor(audio_data: np.ndarray) -> torch.Tensor:
    """Convert numpy audio data to normalized PyTorch tensor with device handling.

    Args:
        audio_data: Audio samples as a numpy array.

    Returns:
        A ``TARGET_DTYPE`` tensor, normalized with ``layer_norm``, with a
        leading dimension of size 1 added, placed on ``cuda:0`` when CUDA is
        available and on the CPU otherwise.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    data = torch.as_tensor(audio_data, dtype=torch.float32).to(TARGET_DTYPE)
    data = layer_norm(data, data.shape)
    return data.unsqueeze(0).to(device)


def _remove_if_exists(path: str) -> None:
    """Delete ``path``, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def convert_media_to_wav(
    input_path: str,
    output_dir: Optional[str] = None,
    keep_temp_wav: bool = True
) -> Tuple[str, torch.Tensor]:
    """
    Convert media file to standardized WAV file and normalized tensor.

    Args:
        input_path: Path to input media file
        output_dir: Directory for output WAV file (default: temp directory).
            Created if it does not exist.
        keep_temp_wav: Whether to keep the WAV file. When False the file is
            removed before returning and the returned path no longer exists.

    Returns:
        Tuple of (wav_file_path, normalized_tensor)

    Raises:
        FileNotFoundError: If input file doesn't exist
        RuntimeError: If FFmpeg is not available

    Decoding errors raised by pydub for unsupported or corrupt input are
    propagated unchanged.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    input_path = os.path.abspath(input_path)
    media_format = detect_media_format(input_path)

    if output_dir is None:
        output_dir = tempfile.gettempdir()
    else:
        os.makedirs(output_dir, exist_ok=True)

    input_name = Path(input_path).stem
    output_wav_path = os.path.join(output_dir, f"{input_name}_converted.wav")

    # Step 1: Convert to WAV using pydub. WAV input goes through pydub as
    # well so every file takes the same path.
    convert_to_wav_with_pydub(input_path, output_wav_path, media_format)

    wav_is_final = False
    try:
        # Step 2: Process WAV to standard format using librosa
        audio_data, sample_rate = process_wav_to_standard_format(output_wav_path)

        # Step 3: Create normalized tensor
        normalized_tensor = create_normalized_tensor(audio_data)

        # Step 4: Overwrite the intermediate WAV with the processed audio
        if keep_temp_wav:
            sf.write(output_wav_path, audio_data, sample_rate)
            wav_is_final = True
    finally:
        # Do not leave the unprocessed intermediate file behind, either on
        # failure or when the caller asked not to keep the WAV.
        if not wav_is_final:
            _remove_if_exists(output_wav_path)

    return output_wav_path, normalized_tensor


def convert_media_to_wav_from_bytes(
    media_bytes: bytes,
    original_filename: str,
    output_dir: Optional[str] = None
) -> Tuple[str, torch.Tensor]:
    """
    Convert media from bytes to WAV file and tensor.

    Args:
        media_bytes: Raw media file bytes
        original_filename: Original filename for format detection
        output_dir: Directory for output files

    Returns:
        Tuple of (wav_file_path, normalized_tensor)
    """
    # The extension is carried over so format detection sees the same suffix
    # as the original upload.
    input_extension = Path(original_filename).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=input_extension) as temp_input:
        temp_input.write(media_bytes)
        temp_input_path = temp_input.name

    try:
        return convert_media_to_wav(temp_input_path, output_dir)
    finally:
        # Remove the temporary input even when conversion fails.
        _remove_if_exists(temp_input_path)


# Utility function for getting audio info
def get_media_info(file_path: str) -> dict:
    """Get information about media file.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        Dict with ``duration_seconds``, ``frame_rate``, ``channels``,
        ``sample_width`` and ``format`` (as reported by
        ``detect_media_format``).

    Raises:
        RuntimeError: If FFmpeg is not available.
    """
    verify_ffmpeg_installation()

    audio = AudioSegment.from_file(file_path)

    return {
        "duration_seconds": len(audio) / 1000.0,  # len() is in milliseconds
        "frame_rate": audio.frame_rate,
        "channels": audio.channels,
        "sample_width": audio.sample_width,
        "format": detect_media_format(file_path)
    }


def _main() -> None:
    """Command-line entry point: convert one file and print a summary."""
    import sys

    if len(sys.argv) != 2:
        print("Usage: python convert_media_to_wav.py INPUT_FILE")
        sys.exit(1)

    input_file = sys.argv[1]
    print(f"Converting {input_file}...")

    wav_path, tensor = convert_media_to_wav(input_file)

    print(f"✓ WAV file: {wav_path}")
    print(f"✓ Tensor shape: {tensor.shape}")
    print(f"✓ Tensor dtype: {tensor.dtype}")
    print(f"✓ Tensor device: {tensor.device}")

    # Show media info
    info = get_media_info(input_file)
    print(f"✓ Media info: {info}")


if __name__ == "__main__":
    _main()