Spaces:
Running
whisper : fix VAD processing for skipped audio segments (#3230)
Browse files
This commit addresses an issue with token timestamps when audio segments
are skipped, in `whisper_exp_compute_token_level_timestamps` related to
the VAD processing and the energy levels.
The motivation for this is that the token timestamps exceed the energy
array bounds due to segment timing misalignment:
```console
(skipped introduction)
↓
Audio segment: [2600ms → 5600ms] (3 seconds of actual audio)
Energy array: [0 → 480652] (samples for 3 seconds)
Token timestamps: [3266ms → 3408ms] (absolute timestamps)
```
So both `s0` and `s1` get clamped to the maximum sample index (480652),
which causes the start/end timestamps to be the same for all the tokens
after a certain point.
This is addressed by using segment-relative timestamps in the
`timestamp_to_sample` and `sample_to_timestamp` functions.
- src/whisper.cpp +18 -10
|
@@ -8325,10 +8325,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 8325 |
// token-level timestamps
|
| 8326 |
//
|
| 8327 |
|
| 8328 |
-
static int timestamp_to_sample(int64_t t, int n_samples) {
|
| 8329 |
-
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
|
| 8330 |
-
}
|
| 8331 |
-
|
| 8332 |
static int64_t sample_to_timestamp(int i_sample) {
|
| 8333 |
return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
|
| 8334 |
}
|
|
@@ -8378,6 +8374,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
|
|
| 8378 |
return result;
|
| 8379 |
}
|
| 8380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8381 |
static void whisper_exp_compute_token_level_timestamps(
|
| 8382 |
struct whisper_context & ctx,
|
| 8383 |
struct whisper_state & state,
|
|
@@ -8518,8 +8526,8 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
| 8518 |
continue;
|
| 8519 |
}
|
| 8520 |
|
| 8521 |
-
int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
|
| 8522 |
-
int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
|
| 8523 |
|
| 8524 |
const int ss0 = std::max(s0 - hw, 0);
|
| 8525 |
const int ss1 = std::min(s1 + hw, n_samples);
|
|
@@ -8540,7 +8548,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
| 8540 |
while (k > 0 && state.energy[k] > thold) {
|
| 8541 |
k--;
|
| 8542 |
}
|
| 8543 |
-
tokens[j].t0 = sample_to_timestamp(k);
|
| 8544 |
if (tokens[j].t0 < tokens[j - 1].t1) {
|
| 8545 |
tokens[j].t0 = tokens[j - 1].t1;
|
| 8546 |
} else {
|
|
@@ -8551,7 +8559,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
| 8551 |
k++;
|
| 8552 |
}
|
| 8553 |
s0 = k;
|
| 8554 |
-
tokens[j].t0 = sample_to_timestamp(k);
|
| 8555 |
}
|
| 8556 |
}
|
| 8557 |
|
|
@@ -8561,7 +8569,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
| 8561 |
while (k < n_samples - 1 && state.energy[k] > thold) {
|
| 8562 |
k++;
|
| 8563 |
}
|
| 8564 |
-
tokens[j].t1 = sample_to_timestamp(k);
|
| 8565 |
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
|
| 8566 |
tokens[j].t1 = tokens[j + 1].t0;
|
| 8567 |
} else {
|
|
@@ -8572,7 +8580,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
| 8572 |
k--;
|
| 8573 |
}
|
| 8574 |
s1 = k;
|
| 8575 |
-
tokens[j].t1 = sample_to_timestamp(k);
|
| 8576 |
}
|
| 8577 |
}
|
| 8578 |
}
|
|
|
|
| 8325 |
// token-level timestamps
|
| 8326 |
//
|
| 8327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8328 |
static int64_t sample_to_timestamp(int i_sample) {
|
| 8329 |
return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
|
| 8330 |
}
|
|
|
|
| 8374 |
return result;
|
| 8375 |
}
|
| 8376 |
|
| 8377 |
+
static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
|
| 8378 |
+
// Convert absolute timestamp to segment-relative timestamp
|
| 8379 |
+
int64_t relative_t = t - segment_t0;
|
| 8380 |
+
int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100);
|
| 8381 |
+
return std::max(0, std::min(n_samples - 1, sample));
|
| 8382 |
+
}
|
| 8383 |
+
|
| 8384 |
+
static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
|
| 8385 |
+
int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE;
|
| 8386 |
+
return relative_timestamp + segment_t0;
|
| 8387 |
+
}
|
| 8388 |
+
|
| 8389 |
static void whisper_exp_compute_token_level_timestamps(
|
| 8390 |
struct whisper_context & ctx,
|
| 8391 |
struct whisper_state & state,
|
|
|
|
| 8526 |
continue;
|
| 8527 |
}
|
| 8528 |
|
| 8529 |
+
int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
|
| 8530 |
+
int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
|
| 8531 |
|
| 8532 |
const int ss0 = std::max(s0 - hw, 0);
|
| 8533 |
const int ss1 = std::min(s1 + hw, n_samples);
|
|
|
|
| 8548 |
while (k > 0 && state.energy[k] > thold) {
|
| 8549 |
k--;
|
| 8550 |
}
|
| 8551 |
+
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
|
| 8552 |
if (tokens[j].t0 < tokens[j - 1].t1) {
|
| 8553 |
tokens[j].t0 = tokens[j - 1].t1;
|
| 8554 |
} else {
|
|
|
|
| 8559 |
k++;
|
| 8560 |
}
|
| 8561 |
s0 = k;
|
| 8562 |
+
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
|
| 8563 |
}
|
| 8564 |
}
|
| 8565 |
|
|
|
|
| 8569 |
while (k < n_samples - 1 && state.energy[k] > thold) {
|
| 8570 |
k++;
|
| 8571 |
}
|
| 8572 |
+
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
|
| 8573 |
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
|
| 8574 |
tokens[j].t1 = tokens[j + 1].t0;
|
| 8575 |
} else {
|
|
|
|
| 8580 |
k--;
|
| 8581 |
}
|
| 8582 |
s1 = k;
|
| 8583 |
+
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
|
| 8584 |
}
|
| 8585 |
}
|
| 8586 |
}
|