Spaces:
Running
Running
whisper : allow whisper_full from mel spectrogram - no audio (#1214)
Browse files- whisper.cpp +14 -14
whisper.cpp
CHANGED
|
@@ -3140,7 +3140,6 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
|
|
| 3140 |
return false;
|
| 3141 |
}
|
| 3142 |
|
| 3143 |
-
|
| 3144 |
if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
|
| 3145 |
log("%s: failed to eval\n", __func__);
|
| 3146 |
return 1;
|
|
@@ -3374,7 +3373,6 @@ float * whisper_get_logits(struct whisper_context * ctx) {
|
|
| 3374 |
return ctx->state->logits.data();
|
| 3375 |
}
|
| 3376 |
|
| 3377 |
-
|
| 3378 |
float * whisper_get_logits_from_state(struct whisper_state * state) {
|
| 3379 |
return state->logits.data();
|
| 3380 |
}
|
|
@@ -4087,15 +4085,17 @@ int whisper_full_with_state(
|
|
| 4087 |
|
| 4088 |
result_all.clear();
|
| 4089 |
|
| 4090 |
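-
// compute log mel spectrogram
|
| 4091 |
-
if (params.speed_up) {
|
| 4092 |
-
// TODO: Replace PV with more advanced algorithm
|
| 4093 |
-
log("%s: failed to compute log mel spectrogram\n", __func__);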
|
| 4094 |
-
return -1;
|
| 4095 |
-
} else {
|
| 4096 |
-
if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
| 4097 |
log("%s: failed to compute log mel spectrogram\n", __func__);
|
| 4098 |
-
return -2;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4099 |
}
|
| 4100 |
}
|
| 4101 |
|
|
@@ -4121,7 +4121,9 @@ int whisper_full_with_state(
|
|
| 4121 |
state->t_beg = 0;
|
| 4122 |
state->t_last = 0;
|
| 4123 |
state->tid_last = 0;
|
| 4124 |
-
state->energy = get_signal_energy(samples, n_samples, 32);
|
|
|
|
|
|
|
| 4125 |
}
|
| 4126 |
|
| 4127 |
const int seek_start = params.offset_ms/10;
|
|
@@ -4258,7 +4260,7 @@ int whisper_full_with_state(
|
|
| 4258 |
while (true) {
|
| 4259 |
if (params.progress_callback) {
|
| 4260 |
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
|
| 4261 |
-
|
| 4262 |
params.progress_callback(
|
| 4263 |
ctx, ctx->state, progress_cur, params.progress_callback_user_data);
|
| 4264 |
}
|
|
@@ -4813,7 +4815,6 @@ int whisper_full_with_state(
|
|
| 4813 |
return 0;
|
| 4814 |
}
|
| 4815 |
|
| 4816 |
-
|
| 4817 |
int whisper_full(
|
| 4818 |
struct whisper_context * ctx,
|
| 4819 |
struct whisper_full_params params,
|
|
@@ -4890,7 +4891,6 @@ int whisper_full_parallel(
|
|
| 4890 |
result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
|
| 4891 |
result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
|
| 4892 |
|
| 4893 |
-
|
| 4894 |
// make sure that segments are not overlapping
|
| 4895 |
if (!ctx->state->result_all.empty()) {
|
| 4896 |
result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
|
|
|
|
| 3140 |
return false;
|
| 3141 |
}
|
| 3142 |
|
|
|
|
| 3143 |
if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
|
| 3144 |
log("%s: failed to eval\n", __func__);
|
| 3145 |
return 1;
|
|
|
|
| 3373 |
return ctx->state->logits.data();
|
| 3374 |
}
|
| 3375 |
|
|
|
|
| 3376 |
float * whisper_get_logits_from_state(struct whisper_state * state) {
|
| 3377 |
return state->logits.data();
|
| 3378 |
}
|
|
|
|
| 4085 |
|
| 4086 |
result_all.clear();
|
| 4087 |
|
| 4088 |
+
if (n_samples > 0) {
|
| 4089 |
+
// compute log mel spectrogram
|
| 4090 |
+
if (params.speed_up) {
|
| 4091 |
+
// TODO: Replace PV with more advanced algorithm
|
|
|
|
|
|
|
|
|
|
| 4092 |
log("%s: failed to compute log mel spectrogram\n", __func__);
|
| 4093 |
+
return -1;
|
| 4094 |
+
} else {
|
| 4095 |
+
if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
| 4096 |
+
log("%s: failed to compute log mel spectrogram\n", __func__);
|
| 4097 |
+
return -2;
|
| 4098 |
+
}
|
| 4099 |
}
|
| 4100 |
}
|
| 4101 |
|
|
|
|
| 4121 |
state->t_beg = 0;
|
| 4122 |
state->t_last = 0;
|
| 4123 |
state->tid_last = 0;
|
| 4124 |
+
if (n_samples > 0) {
|
| 4125 |
+
state->energy = get_signal_energy(samples, n_samples, 32);
|
| 4126 |
+
}
|
| 4127 |
}
|
| 4128 |
|
| 4129 |
const int seek_start = params.offset_ms/10;
|
|
|
|
| 4260 |
while (true) {
|
| 4261 |
if (params.progress_callback) {
|
| 4262 |
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
|
| 4263 |
+
|
| 4264 |
params.progress_callback(
|
| 4265 |
ctx, ctx->state, progress_cur, params.progress_callback_user_data);
|
| 4266 |
}
|
|
|
|
| 4815 |
return 0;
|
| 4816 |
}
|
| 4817 |
|
|
|
|
| 4818 |
int whisper_full(
|
| 4819 |
struct whisper_context * ctx,
|
| 4820 |
struct whisper_full_params params,
|
|
|
|
| 4891 |
result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
|
| 4892 |
result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
|
| 4893 |
|
|
|
|
| 4894 |
// make sure that segments are not overlapping
|
| 4895 |
if (!ctx->state->result_all.empty()) {
|
| 4896 |
result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
|