Spaces:

facebook
/

omniasr-transcriptions

Running on A100

File size: 1,478 Bytes

ae238b3

import io

import numpy as np
import soundfile as sf
import torch
from numpy.typing import NDArray


def wav_to_bytes(
    wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
) -> NDArray[np.int8]:
    """Convert audio tensor to bytes using soundfile directly."""
    # Convert to numpy if torch tensor
    if isinstance(wav, torch.Tensor):
        if wav.is_cuda:
            wav = wav.cpu()
        # Convert to float32 first (numpy doesn't support bfloat16)
        if wav.dtype != torch.float32:
            wav = wav.float()
        wav = wav.numpy()

    # Ensure float32 dtype for numpy arrays
    if wav.dtype != np.float32:
        wav = wav.astype(np.float32)

    # Handle shape: soundfile expects (samples,) for mono or (samples, channels) for multi-channel
    if wav.ndim == 1:
        # Already correct shape for mono
        pass
    elif wav.ndim == 2:
        # If shape is (channels, samples), transpose to (samples, channels)
        if wav.shape[0] < wav.shape[1]:
            wav = wav.T

    # Create buffer and write using soundfile directly
    buffer = io.BytesIO()

    # Map format string to soundfile format
    sf_format = format.upper() if format.lower() in ['wav', 'flac', 'ogg'] else 'WAV'
    subtype = 'PCM_16' if sf_format == 'WAV' else None

    # Write to buffer
    sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)

    buffer.seek(0)
    return np.frombuffer(buffer.getvalue(), dtype=np.int8)