danbev committed on
Commit
a69c121
·
unverified ·
1 Parent(s): 58d6e4e

whisper : fix VAD processing for skipped audio segments (#3230)

Browse files

This commit addresses an issue with token timestamps when audio segments
are skipped. The issue is in `whisper_exp_compute_token_level_timestamps`
and is related to the VAD processing and the energy levels.

The motivation for this is that the token timestamps exceed the energy
array bounds due to segment timing misalignment:
```console
(skipped introduction)

Audio segment: [2600ms → 5600ms] (3 seconds of actual audio)
Energy array: [0 → 480652] (samples for 3 seconds)
Token timestamps: [3266ms → 3408ms] (absolute timestamps)
```
So both `s0` and `s1` get clamped to the maximum sample index (480652),
which causes the start/end timestamps to be the same for all the tokens
after a certain point.

This is addressed by using segment-relative timestamps in the
`timestamp_to_sample` and `sample_to_timestamp` functions.

Files changed (1) hide show
  1. src/whisper.cpp +18 -10
src/whisper.cpp CHANGED
@@ -8325,10 +8325,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
8325
  // token-level timestamps
8326
  //
8327
 
8328
- static int timestamp_to_sample(int64_t t, int n_samples) {
8329
- return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
8330
- }
8331
-
8332
  static int64_t sample_to_timestamp(int i_sample) {
8333
  return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
8334
  }
@@ -8378,6 +8374,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
8378
  return result;
8379
  }
8380
 
 
 
 
 
 
 
 
 
 
 
 
 
8381
  static void whisper_exp_compute_token_level_timestamps(
8382
  struct whisper_context & ctx,
8383
  struct whisper_state & state,
@@ -8518,8 +8526,8 @@ static void whisper_exp_compute_token_level_timestamps(
8518
  continue;
8519
  }
8520
 
8521
- int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
8522
- int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
8523
 
8524
  const int ss0 = std::max(s0 - hw, 0);
8525
  const int ss1 = std::min(s1 + hw, n_samples);
@@ -8540,7 +8548,7 @@ static void whisper_exp_compute_token_level_timestamps(
8540
  while (k > 0 && state.energy[k] > thold) {
8541
  k--;
8542
  }
8543
- tokens[j].t0 = sample_to_timestamp(k);
8544
  if (tokens[j].t0 < tokens[j - 1].t1) {
8545
  tokens[j].t0 = tokens[j - 1].t1;
8546
  } else {
@@ -8551,7 +8559,7 @@ static void whisper_exp_compute_token_level_timestamps(
8551
  k++;
8552
  }
8553
  s0 = k;
8554
- tokens[j].t0 = sample_to_timestamp(k);
8555
  }
8556
  }
8557
 
@@ -8561,7 +8569,7 @@ static void whisper_exp_compute_token_level_timestamps(
8561
  while (k < n_samples - 1 && state.energy[k] > thold) {
8562
  k++;
8563
  }
8564
- tokens[j].t1 = sample_to_timestamp(k);
8565
  if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
8566
  tokens[j].t1 = tokens[j + 1].t0;
8567
  } else {
@@ -8572,7 +8580,7 @@ static void whisper_exp_compute_token_level_timestamps(
8572
  k--;
8573
  }
8574
  s1 = k;
8575
- tokens[j].t1 = sample_to_timestamp(k);
8576
  }
8577
  }
8578
  }
 
8325
  // token-level timestamps
8326
  //
8327
 
 
 
 
 
8328
  static int64_t sample_to_timestamp(int i_sample) {
8329
  return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
8330
  }
 
8374
  return result;
8375
  }
8376
 
8377
+ static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
8378
+ // Convert absolute timestamp to segment-relative timestamp
8379
+ int64_t relative_t = t - segment_t0;
8380
+ int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100);
8381
+ return std::max(0, std::min(n_samples - 1, sample));
8382
+ }
8383
+
8384
+ static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
8385
+ int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE;
8386
+ return relative_timestamp + segment_t0;
8387
+ }
8388
+
8389
  static void whisper_exp_compute_token_level_timestamps(
8390
  struct whisper_context & ctx,
8391
  struct whisper_state & state,
 
8526
  continue;
8527
  }
8528
 
8529
+ int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
8530
+ int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
8531
 
8532
  const int ss0 = std::max(s0 - hw, 0);
8533
  const int ss1 = std::min(s1 + hw, n_samples);
 
8548
  while (k > 0 && state.energy[k] > thold) {
8549
  k--;
8550
  }
8551
+ tokens[j].t0 = sample_to_timestamp(k, segment.t0);
8552
  if (tokens[j].t0 < tokens[j - 1].t1) {
8553
  tokens[j].t0 = tokens[j - 1].t1;
8554
  } else {
 
8559
  k++;
8560
  }
8561
  s0 = k;
8562
+ tokens[j].t0 = sample_to_timestamp(k, segment.t0);
8563
  }
8564
  }
8565
 
 
8569
  while (k < n_samples - 1 && state.energy[k] > thold) {
8570
  k++;
8571
  }
8572
+ tokens[j].t1 = sample_to_timestamp(k, segment.t0);
8573
  if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
8574
  tokens[j].t1 = tokens[j + 1].t0;
8575
  } else {
 
8580
  k--;
8581
  }
8582
  s1 = k;
8583
+ tokens[j].t1 = sample_to_timestamp(k, segment.t0);
8584
  }
8585
  }
8586
  }