Spaces:
Running
Running
AfryMask
AfryMask
committed on
whisper : fix the bug related to word splitting errors in the "tokenize" function. (#760)
Browse files - whisper.cpp +6 -11
whisper.cpp
CHANGED
|
@@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
|
|
| 2449 |
int n = word.size();
|
| 2450 |
while (i < n) {
|
| 2451 |
int j = n;
|
|
|
|
| 2452 |
while (j > i) {
|
| 2453 |
-
auto
|
|
|
|
| 2454 |
if (it != vocab.token_to_id.end()) {
|
| 2455 |
tokens.push_back(it->second);
|
| 2456 |
i = j;
|
|
|
|
| 2457 |
break;
|
| 2458 |
}
|
| 2459 |
--j;
|
| 2460 |
}
|
| 2461 |
-
if (
|
| 2462 |
-
|
| 2463 |
-
}
|
| 2464 |
-
if (j == i) {
|
| 2465 |
-
auto sub = word.substr(i, 1);
|
| 2466 |
-
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
| 2467 |
-
tokens.push_back(vocab.token_to_id.at(sub));
|
| 2468 |
-
} else {
|
| 2469 |
-
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
| 2470 |
-
}
|
| 2471 |
++i;
|
| 2472 |
}
|
| 2473 |
}
|
|
|
|
| 2449 |
int n = word.size();
|
| 2450 |
while (i < n) {
|
| 2451 |
int j = n;
|
| 2452 |
+
bool found = false;
|
| 2453 |
while (j > i) {
|
| 2454 |
+
auto sub = word.substr(i, j-i);
|
| 2455 |
+
auto it = vocab.token_to_id.find(sub);
|
| 2456 |
if (it != vocab.token_to_id.end()) {
|
| 2457 |
tokens.push_back(it->second);
|
| 2458 |
i = j;
|
| 2459 |
+
found = true;
|
| 2460 |
break;
|
| 2461 |
}
|
| 2462 |
--j;
|
| 2463 |
}
|
| 2464 |
+
if (!found) {
|
| 2465 |
+
fprintf(stderr, "unknown token \n");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2466 |
++i;
|
| 2467 |
}
|
| 2468 |
}
|