File size: 1,478 Bytes
ae238b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import io

import numpy as np
import soundfile as sf
import torch
from numpy.typing import NDArray


def wav_to_bytes(
    wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
) -> NDArray[np.int8]:
    """Convert audio tensor to bytes using soundfile directly."""
    # Convert to numpy if torch tensor
    if isinstance(wav, torch.Tensor):
        if wav.is_cuda:
            wav = wav.cpu()
        # Convert to float32 first (numpy doesn't support bfloat16)
        if wav.dtype != torch.float32:
            wav = wav.float()
        wav = wav.numpy()

    # Ensure float32 dtype for numpy arrays
    if wav.dtype != np.float32:
        wav = wav.astype(np.float32)

    # Handle shape: soundfile expects (samples,) for mono or (samples, channels) for multi-channel
    if wav.ndim == 1:
        # Already correct shape for mono
        pass
    elif wav.ndim == 2:
        # If shape is (channels, samples), transpose to (samples, channels)
        if wav.shape[0] < wav.shape[1]:
            wav = wav.T

    # Create buffer and write using soundfile directly
    buffer = io.BytesIO()

    # Map format string to soundfile format
    sf_format = format.upper() if format.lower() in ['wav', 'flac', 'ogg'] else 'WAV'
    subtype = 'PCM_16' if sf_format == 'WAV' else None

    # Write to buffer
    sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)

    buffer.seek(0)
    return np.frombuffer(buffer.getvalue(), dtype=np.int8)