AfryMask AfryMask committed on
Commit
0393a04
·
unverified ·
1 Parent(s): 9a0a719

whisper : fix the bug related to word splitting errors in the "tokenize" function. (#760)

Browse files
Files changed (1) hide show
  1. whisper.cpp +6 -11
whisper.cpp CHANGED
@@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
2449
  int n = word.size();
2450
  while (i < n) {
2451
  int j = n;
 
2452
  while (j > i) {
2453
- auto it = vocab.token_to_id.find(word.substr(i, j-i));
 
2454
  if (it != vocab.token_to_id.end()) {
2455
  tokens.push_back(it->second);
2456
  i = j;
 
2457
  break;
2458
  }
2459
  --j;
2460
  }
2461
- if (i == n) {
2462
- break;
2463
- }
2464
- if (j == i) {
2465
- auto sub = word.substr(i, 1);
2466
- if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
2467
- tokens.push_back(vocab.token_to_id.at(sub));
2468
- } else {
2469
- fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
2470
- }
2471
  ++i;
2472
  }
2473
  }
 
2449
  int n = word.size();
2450
  while (i < n) {
2451
  int j = n;
2452
+ bool found = false;
2453
  while (j > i) {
2454
+ auto sub = word.substr(i, j-i);
2455
+ auto it = vocab.token_to_id.find(sub);
2456
  if (it != vocab.token_to_id.end()) {
2457
  tokens.push_back(it->second);
2458
  i = j;
2459
+ found = true;
2460
  break;
2461
  }
2462
  --j;
2463
  }
2464
+ if (!found) {
2465
+ fprintf(stderr, "unknown token \n");
 
 
 
 
 
 
 
 
2466
  ++i;
2467
  }
2468
  }