Eripsa committed
Commit b4c5078 · 1 Parent(s): 390071f

clean repo

Files changed (2)
  1. .ipynb_checkpoints/app-checkpoint.py +822 -0
  2. phd_model +0 -1
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,822 @@
+ from __future__ import annotations
+ import gradio as gr
+
+ from datasets import load_dataset, load_metric, Audio, concatenate_datasets, Dataset
+ from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
+ import json
+ import torch
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Union
+ import random
+ import argparse
+ import pandas as pd
+ import os
+ import multiprocess
+
+ import json
+ from typing import List, Optional
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import AddedToken
+ from transformers.models.wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizerOutput  # needed by the _decode override below
+
+ # Subclass that intentionally shadows the imported Wav2Vec2CTCTokenizer so the rest of
+ # the script picks up the overridden _decode.
+ class Wav2Vec2CTCTokenizer(Wav2Vec2CTCTokenizer):
+
+     def _decode(
+         self,
+         token_ids: list[int],
+         skip_special_tokens: bool = False,
+         clean_up_tokenization_spaces: Optional[bool] = None,
+         group_tokens: bool = True,
+         spaces_between_special_tokens: bool = False,
+         output_word_offsets: Optional[bool] = False,
+         output_char_offsets: Optional[bool] = False,
+     ) -> str:
+         """
+         A special `_decode` is needed for Wav2Vec2CTCTokenizer because added tokens should be treated exactly the
+         same as tokens of the base vocabulary, and therefore `convert_tokens_to_string` has to be called on
+         the whole token list and not individually on added tokens.
+         """
+         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+         result = []
+         for token in filtered_tokens:
+             if skip_special_tokens and (
+                 token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens)
+             ):
+                 continue
+             result.append(token)
+
+         string_output = self.convert_tokens_to_string(
+             result,
+             group_tokens=group_tokens,
+             spaces_between_special_tokens=spaces_between_special_tokens,
+             output_word_offsets=output_word_offsets,
+             output_char_offsets=output_char_offsets,
+         )
+
+         text = string_output["text"]
+
+         clean_up_tokenization_spaces = (
+             clean_up_tokenization_spaces
+             if clean_up_tokenization_spaces is not None
+             else self.clean_up_tokenization_spaces
+         )
+         if clean_up_tokenization_spaces:
+             text = self.clean_up_tokenization(text)
+
+         if output_word_offsets or output_char_offsets:
+             return Wav2Vec2CTCTokenizerOutput(
+                 text=text,
+                 char_offsets=string_output["char_offsets"],
+                 word_offsets=string_output["word_offsets"],
+             )
+         else:
+             return text
+
+
+ import torch
+ import warnings
+ from torch import nn  # needed only if you add extra layers
+ from transformers import (
+     Wav2Vec2ForCTC,        # base model we extend
+     Wav2Vec2Config,        # type hinting & standalone instantiation
+     Wav2Vec2Model,
+     logging as hf_logging  # optional: nicer error messages
+ )
+
+ from transformers.utils import (
+     auto_docstring,
+ )
+
+ from transformers.modeling_outputs import (
+     CausalLMOutput,
+ )
+
+ # Mirrors the module-level constant of the same name in transformers' modeling_wav2vec2:
+ # position in the base model's tuple output where the hidden states start.
+ _HIDDEN_STATES_START_POSITION = 2
+
+ class Wav2Vec2ForCTC24Heads(Wav2Vec2ForCTC):
+     """
+     Same encoder as Wav2Vec2ForCTC but with 24 parallel LM heads and
+     an aggregated CTC loss.
+
+     Expected `labels` shape : (batch, 24, target_len)
+     Returned `logits` shape : (batch, 24, time, vocab_size)
+     """
+
+     def __init__(self, config, num_heads: int = 24, target_lang: Optional[str] = None):
+         super().__init__(config)
+
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.dropout = nn.Dropout(config.final_dropout)
+
+         self.target_lang = target_lang
+
+         if config.vocab_size is None:
+             raise ValueError(
+                 f"You are trying to instantiate {self.__class__} with a configuration that "
+                 "does not define the vocabulary size of the language model head. Please "
+                 "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)` "
+                 "or define `vocab_size` in your model's configuration."
+             )
+
+         output_hidden_size = (
+             config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+         )
+
+         self.num_heads = num_heads
+
+         # Replace the single head with a ModuleList of heads
+         self.lm_head = nn.ModuleList(
+             [nn.Linear(output_hidden_size, config.vocab_size) for _ in range(num_heads)]
+         )
+
+     def freeze_feature_extractor(self):
+         """
+         Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+         not be updated during training.
+         """
+         warnings.warn(
+             "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+             "Please use the equivalent `freeze_feature_encoder` method instead.",
+             FutureWarning,
+         )
+         self.freeze_feature_encoder()
+
+     @auto_docstring
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Union[tuple, CausalLMOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, num_heads, target_length)`, *optional*):
+             Labels for connectionist temporal classification, one label sequence per head. Note that `target_length`
+             has to be smaller than or equal to the sequence length of the output logits. Indices are selected in
+             `[-100, 0, ..., config.vocab_size - 1]`. All labels set to `-100` are ignored (masked); the loss is only
+             computed for labels in `[0, ..., config.vocab_size - 1]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if labels is not None and labels.max() >= self.config.vocab_size:
+             raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+         outputs = self.wav2vec2(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         hidden_states = self.dropout(hidden_states)
+
+         logits = torch.stack(
+             [head(hidden_states) for head in self.lm_head],  # list of (B, T, V)
+             dim=1                                            # -> (B, 24, T, V)
+         )
+
+         loss = None
+         if labels is not None:
+
+             # retrieve loss input_lengths from attention_mask
+             attention_mask = (
+                 attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+             )
+             input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+             loss_list = []
+             for h in range(self.num_heads):
+                 # grab labels for this head: (B, target_len)
+                 lab = labels[:, h]
+
+                 # mask – targets for CTC must be 1-D,
+                 # assuming that padded tokens are filled with -100
+                 # when not being attended to
+                 lab_mask = lab >= 0
+                 target_lengths = lab_mask.sum(-1)
+                 flat_targets = lab.masked_select(lab_mask)
+
+                 log_probs = nn.functional.log_softmax(logits[:, h], dim=-1).transpose(0, 1)  # (T, B, V)
+
+                 with torch.backends.cudnn.flags(enabled=False):
+                     head_loss = nn.functional.ctc_loss(
+                         log_probs,
+                         flat_targets,
+                         input_lengths,
+                         target_lengths,
+                         blank=self.config.pad_token_id,
+                         reduction="mean",  # per-head loss
+                         zero_infinity=self.config.ctc_zero_infinity,
+                     )
+
+                 loss_list.append(head_loss)
+
+             loss = torch.stack(loss_list).mean()  # aggregate
+
+         # Greedy per-head predictions (currently unused: computed here but not returned)
+         batch_preds = []                           # will become length B
+         for b in range(logits.size(0)):
+             head_preds = []                        # will become length 24
+             for h in range(logits.size(1)):
+                 ids = logits[b, h].argmax(dim=-1)  # (T,)
+                 head_preds.append(ids)             # accumulate each head
+             head_preds = torch.stack(head_preds)   # (24, T) ← “vector” of heads
+             batch_preds.append(head_preds)
+
+         batch_preds = torch.stack(batch_preds)     # (B, 24, T)
+
+         if not return_dict:
+             output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+             return ((loss,) + output) if loss is not None else output
+
+         return CausalLMOutput(
+             loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+         )
+
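As a quick orientation aid (not part of the committed file): a minimal smoke test of the class above, instantiating it from a fresh `Wav2Vec2Config` rather than the fine-tuned checkpoint loaded later in this script. The vocabulary size of 7, the one-second random waveform and the length-5 random labels are illustrative assumptions; the printed shapes are the ones documented in the class docstring.

import torch
from transformers import Wav2Vec2Config

cfg = Wav2Vec2Config(vocab_size=7)            # assumed small vocab, for illustration only
model = Wav2Vec2ForCTC24Heads(cfg, num_heads=24).eval()

wave = torch.randn(1, 16_000)                 # one second of fake 16 kHz audio
labels = torch.randint(1, 7, (1, 24, 5))      # (batch, num_heads, target_len); 0 is the pad/blank id

with torch.no_grad():
    out = model(wave, labels=labels)

print(out.logits.shape)                       # torch.Size([1, 24, 49, 7]) with the default conv stack
print(out.loss)                               # scalar: mean of the 24 per-head CTC losses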
+ from dataclasses import dataclass
+ from typing import Dict, List, Union
+ import torch
+ from transformers import Wav2Vec2Processor
+
+ @dataclass
+ class DataCollatorCTCWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received.
+     Args:
+         processor (:class:`~transformers.Wav2Vec2Processor`)
+             The processor used for processing the data.
+         padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+             among:
+             * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+               sequence is provided).
+             * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+               maximum acceptable input length for the model if that argument is not provided.
+             * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
+               different lengths).
+         max_length (:obj:`int`, `optional`):
+             Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+         max_length_labels (:obj:`int`, `optional`):
+             Maximum length of the ``labels`` returned list and optionally padding length (see above).
+         pad_to_multiple_of (:obj:`int`, `optional`):
+             If set will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+             7.5 (Volta).
+     """
+     processor: Wav2Vec2Processor
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     max_length_labels: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # Split inputs and labels since they have to be of different lengths
+         # and need different padding methods
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+         with self.processor.as_target_processor():
+             labels_batch = self.processor.pad(
+                 label_features,
+                 padding=self.padding,
+                 max_length=self.max_length_labels,
+                 pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                 return_tensors="pt",
+             )
+
+         # Replace padding with -100 so it is ignored correctly by the loss
+         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+         batch["labels"] = labels
+
+         return batch
+
+ @dataclass
+ class DataCollator24CTC(DataCollatorCTCWithPadding):
+     processor: Wav2Vec2Processor
+     padding: Union[bool, str] = True
+     max_length: Optional[int] = None
+     max_length_labels: Optional[int] = None
+     pad_to_multiple_of: Optional[int] = None
+     pad_to_multiple_of_labels: Optional[int] = None
+     num_heads: int = 24
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # Split inputs and labels since they have to be of different lengths
+         # and need different padding methods
+         input_features = [{"input_values": feature["input_values"]} for feature in features]
+
+         batch = self.processor.pad(
+             input_features,
+             padding=self.padding,
+             max_length=self.max_length,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+         all_labels = []
+         for h in range(self.num_heads):
+             label_features_h = [{"input_ids": feature["labels"][h]} for feature in features]
+             with self.processor.as_target_processor():
+                 labels_batch = self.processor.pad(
+                     label_features_h,
+                     padding=self.padding,
+                     max_length=self.max_length_labels,
+                     pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                     return_tensors="pt",
+                 )
+             padded_ids = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+             all_labels.append(padded_ids)
+
+         # Stack to (num_heads, batch, seq_len), then permute to (batch, num_heads, seq_len)
+         labels = torch.stack(all_labels).permute(1, 0, 2)
+
+         batch['labels'] = labels
+
+         return batch
+
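To make the collator's label handling concrete, here is a small self-contained illustration (made-up numbers, no processor involved) of the per-head pad-then-mask-then-stack step that `DataCollator24CTC.__call__` performs; it only reproduces the masking and the final `(batch, num_heads, seq_len)` layout.

import torch

per_head_padded = []
for h in range(3):                                        # stand-in for num_heads
    ids = torch.tensor([[1, 2, 3], [4, 5, 0]])            # what processor.pad would return for this head
    mask = torch.tensor([[1, 1, 1], [1, 1, 0]])           # its attention_mask
    per_head_padded.append(ids.masked_fill(mask.ne(1), -100))

labels = torch.stack(per_head_padded).permute(1, 0, 2)    # (num_heads, B, L) -> (B, num_heads, L)
print(labels.shape)                                       # torch.Size([2, 3, 3])
print(labels[1, 0])                                       # tensor([   4,    5, -100]); -100 is ignored by the loss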
+ import os
+ import json
+ import random
+ from pathlib import Path
+ from typing import List
+
+ import numpy as np
+ import torchaudio, torchaudio.transforms as T
+
+ from datasets import Dataset, Features, Sequence, Value, load_from_disk, concatenate_datasets
+
+
+ # ------------------------------------------------------------------
+ # 1) Audio helpers
+ # ------------------------------------------------------------------
+ def load_and_standardise(path: str | Path, target_sr: int = 16_000) -> list[float]:
+     """
+     • Loads `path` with torchaudio
+     • Resamples to `target_sr` if necessary
+     • Converts to mono (mean over channels)
+     • Leaves amplitude normalisation to the feature extractor (`do_normalize=True`)
+     • Returns a *Python list* of floats so it is JSON-serialisable
+     """
+     try:
+         torchaudio.set_audio_backend("sox_io")
+     except RuntimeError:
+         raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.")
+
+     array, sampling_rate = torchaudio.load(path)
+
+     if sampling_rate != target_sr:
+         array = T.Resample(sampling_rate, target_sr)(array)
+     array = array.numpy()
+     array = array.mean(axis=0)
+
+     return array.tolist()
+
+ # --------------------------------------------------------------
+ # 2) Streaming readers (JSON array or NDJSON)
+ # --------------------------------------------------------------
+ def iter_entries(json_path: str | Path):
+     """
+     Yield entries from either a single JSON array file or an NDJSON file.
+     The file is read once; if it does not parse as a single JSON value,
+     each non-empty line is parsed as its own JSON object (NDJSON).
+     """
+     p = Path(json_path)
+     txt = p.read_text(encoding="utf-8")
+     try:
+         data = json.loads(txt)
+         if isinstance(data, list):
+             for obj in data:
+                 yield obj
+         else:
+             yield data
+     except json.JSONDecodeError:
+         for ln in txt.splitlines():
+             ln = ln.strip()
+             if ln:
+                 yield json.loads(ln)
+
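A tiny self-contained check of `iter_entries` (hypothetical file and values, mirroring the "path"/"ipa" fields the cache builder below expects): the same function accepts either a JSON array or newline-delimited JSON.

import json, os, tempfile

ndjson = '{"path": "a.wav", "ipa": [[1, -1]]}\n{"path": "b.wav", "ipa": [[0, 1]]}\n'
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(ndjson)

print([e["path"] for e in iter_entries(f.name)])   # ['a.wav', 'b.wav']
os.remove(f.name)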
+
+ # --------------------------------------------------------------
+ # 3) Stage-1: process one source once and cache to disk (Arrow)
+ # --------------------------------------------------------------
+ def preprocess_source_to_cache(
+     json_path: str | Path,
+     processor: Wav2Vec2Processor,
+     cache_root: str | Path,
+     source_tag: str,          # any stable name (e.g. 'en', 'jp', 'doreco-an')
+ ) -> Path:
+     """
+     Stream over entries in json_path, fully decode audio and convert labels to IDs.
+     Save as a Hugging Face dataset to disk (memory-mapped Arrow).
+     Returns the folder path created by `save_to_disk()`.
+     """
+     cache_root = Path(cache_root)
+     cache_root.mkdir(parents=True, exist_ok=True)
+     save_path = cache_root / f"cache_{source_tag}"
+
+     save_path.mkdir(parents=True, exist_ok=True)
+
+     # If the cache already exists, skip reprocessing to save time.
+     if (save_path / "dataset_info.json").exists():
+         print(f"[cache] Using existing cache: {save_path}")
+         return save_path
+     else:
+         if save_path.exists():
+             import shutil; shutil.rmtree(save_path)
+         save_path.mkdir(parents=True, exist_ok=True)
+
+     def row_generator():
+         for obj in iter_entries(json_path):
+             # Expect {"path": "...", "ipa": <matrix or whatever your build used>}
+             ipa_matrix = obj.get("ipa", [])
+             if not ipa_matrix:
+                 continue
+
+             # the source matrix is [segments x 22]; transpose so each row is one head, then stringify
+             transpose = [list(row) for row in zip(*ipa_matrix)]
+             transpose_str = [[str(tok) for tok in head] for head in transpose]
+
+             # Decode audio once
+             audio = load_and_standardise(obj["path"])
+             # Cast to float32 for Arrow efficiency
+             audio = np.asarray(audio, dtype=np.float32)
+
+             # Convert labels to IDs once (keep nested per-head if your collator expects it)
+             label_ids: List[List[int]] = []
+             for head in transpose_str:
+                 with processor.as_target_processor():
+                     ids = processor(head).input_ids
+                 # ids might be [[id]]; unwrap if needed:
+                 ids = [tok[0] if isinstance(tok, list) else tok for tok in ids]
+                 label_ids.append(ids)
+
+             yield {
+                 "input_values": audio,      # variable-length float32
+                 "labels": label_ids,        # list[list[int]]
+                 "source": source_tag,       # keep origin
+             }
+
+     # Features: variable-length floats + nested variable-length ints
+     # (declared for documentation; not currently passed to Dataset.from_list, so types are inferred)
+     features = Features({
+         "input_values": Sequence(Value("float32")),
+         "labels": Sequence(Sequence(Value("int32"))),
+         "source": Value("string"),
+     })
+
+     rows, chunks = [], []
+     for row in row_generator():            # stream rows from the generator above
+         rows.append(row)
+         if len(rows) >= 5_000:             # tune shard size to your RAM
+             chunks.append(Dataset.from_list(rows))
+             rows = []                      # free current chunk
+
+     if rows:                               # tail of the stream
+         chunks.append(Dataset.from_list(rows))
+
+     ds = concatenate_datasets(chunks)      # single Dataset object
+     ds.save_to_disk(save_path.as_posix())  # writes Arrow to local FS
+     print(f"[cache] Wrote {len(ds)} rows → {save_path}")
+     return save_path
+
+
+ # --------------------------------------------------------------
+ # 4) Stage-2: build a weighted dataset from cached sources
+ #    (no re-decoding, no in-RAM duplication)
+ # --------------------------------------------------------------
+ def build_weighted_dataset_from_cache(
+     cache_paths: list[str | Path],
+     percentages: list[float],
+     *,
+     seed: int = 42
+ ) -> Dataset:
+     """
+     For each cached source dataset:
+         pct >= 100  → n_full full copies plus a fractional random subset
+         pct <  100  → fractional random subset only
+     All operations are Arrow-backed (memory-mapped), so no RAM blow-ups.
+     """
+     assert len(cache_paths) == len(percentages)
+     rng = random.Random(seed)
+
+     per_source_weighted = []
+
+     for cache_path, pct in zip(cache_paths, percentages):
+         ds = load_from_disk(str(cache_path))
+         N = len(ds)
+         if N == 0 or pct <= 0:
+             continue
+
+         n_full = int(pct // 100)
+         frac = (pct % 100) / 100.0
+         n_frac = round(N * frac)
+
+         parts = []
+
+         # Full copies: concatenate the same dataset handle n_full times (no decode)
+         if n_full > 0:
+             parts.extend([ds] * n_full)
+
+         # Fractional random subset (no decode)
+         if n_frac > 0:
+             idxs = rng.sample(range(N), n_frac)
+             parts.append(ds.select(idxs))
+
+         if not parts:
+             continue
+
+         ds_weighted = parts[0] if len(parts) == 1 else concatenate_datasets(parts)
+         per_source_weighted.append(ds_weighted)
+         print(f"[weight] {cache_path} → {len(ds_weighted)} rows "
+               f"(full×{n_full} + frac {n_frac}/{N})")
+
+     # Final training set = concatenation of all weighted sources
+     if not per_source_weighted:
+         raise RuntimeError("No data after weighting.")
+     train_ds = per_source_weighted[0] if len(per_source_weighted) == 1 \
+         else concatenate_datasets(per_source_weighted)
+
+     # Optional: shuffle once for training
+     train_ds = train_ds.shuffle(seed=seed)
+     print(f"[train] Total rows: {len(train_ds)}")
+     return train_ds
+
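A worked example of the percentage arithmetic above, with made-up dataset sizes: 250% of a 1,000-row source yields two full passes plus a random half (2,500 rows), while 30% yields 300 rows.

for N, pct in [(1_000, 250), (1_000, 30)]:
    n_full = int(pct // 100)                  # 2 and 0 full copies
    n_frac = round(N * (pct % 100) / 100.0)   # 500 and 300 sampled rows
    print(N, pct, "->", n_full * N + n_frac)  # 2500 and 300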
+
+ vocab_file = "dummy_vocab.json"
+
+ feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
+                                              sampling_rate=16_000,
+                                              padding_value=0.0,
+                                              do_normalize=True,
+                                              return_attention_mask=True)
+
+ tokenizer_ipa = Wav2Vec2CTCTokenizer("./{}".format(vocab_file),
+                                      unk_token="[UNK]",
+                                      pad_token="[PAD]",
+                                      word_delimiter_token="|")
+
+ processor_ipa = Wav2Vec2Processor(feature_extractor=feature_extractor,
+                                   tokenizer=tokenizer_ipa)
+
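The vocabulary file itself is not part of this commit, so the following is only a guess at its shape, inferred from how the rest of the script uses the tokenizer (per-head feature values "-1"/"0"/"1", the word delimiter "|", and the [UNK]/[PAD] specials); the token set and ids are assumptions for illustration only.

import json

example_vocab = {"[PAD]": 0, "[UNK]": 1, "|": 2, "-1": 3, "0": 4, "1": 5}   # hypothetical contents
with open("example_vocab.json", "w") as f:
    json.dump(example_vocab, f)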
+ import numpy as np
+ # The phd_model submodule is removed in this commit and these names are not used below,
+ # so the imports are commented out to keep the app importable.
+ # from phd_model.phonetics.ipa import symbol_to_descriptor, to_symbol
+ # from phd_model.model.wav2vec2 import Wav2Vec2
+ from transformers import Wav2Vec2Processor
+ import torchaudio, torchaudio.transforms as T
+ from torchinfo import summary
+ import torch
+ import re
+
+ ckpt_dir = "anim400k_train_v2"
+
+
+ # Get device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load the fine-tuned checkpoint
+ wav2vec2 = Wav2Vec2ForCTC24Heads.from_pretrained(ckpt_dir)
+ processor = Wav2Vec2Processor.from_pretrained(ckpt_dir)
+ wav2vec2.to(device)
+ wav2vec2.eval()
+
+ # Print model summary for batch_size 1 and a single second of audio samples
+ summary(wav2vec2, input_size=(1, 16_000), depth=8, device=device)
+
+ # Create new random audio (you can load your own audio here to get actual predictions)
+ #rand_audio = np.random.rand(1, 16_000)
+
+ def generate_tensor(audio_path: str):
+
+     #audio_path = "/workspace/F5-TTS/data/marrazki_custom/wavs/segment_3153.wav"
+
+     #rand_audio = load_and_standardise(audio_path)
+     #rand_audio, sr = torchaudio.load(audio_path)
+
+     try:
+         torchaudio.set_audio_backend("sox_io")
+     except RuntimeError:
+         raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.")
+
+     array, sampling_rate = torchaudio.load(audio_path)
+
+     if sampling_rate != 16000:
+         array = T.Resample(sampling_rate, 16000)(array)
+     array = array.numpy()
+     array = array.mean(axis=0, keepdims=True)
+
+     # Create torch tensor, move to device and feed the model
+     array = torch.tensor(
+         array,
+         dtype=torch.float,
+         device=device,
+     )
+
+     print(array)
+     with torch.no_grad():
+         out = wav2vec2(array)
+     logits = out.logits
+
+     # regular expression that finds either the 2-char token "-1"
+     # or any single char in 0, 1, |
+     token_re = re.compile(r"-1|[01\|]")
+
+     batch_tokens = []                # final matrix (B × 24)
+
+     for b in range(logits.size(0)):
+         head_tokens = []             # 24 rows for this utterance
+
+         for h in range(logits.size(1)):
+             # ---------- 1) arg-max & CTC collapse → string ----------
+             ids = logits[b, h].argmax(dim=-1).cpu().tolist()
+
+             #text = processor._decode(
+             #    ids,
+             #)
+             text = tokenizer_ipa._decode(token_ids=ids)
+
+             # ---------- 2) split the string into symbols ----------
+             symbols = token_re.findall(text)   # e.g. ['-1', '1', '-1', '-1', …]
+
+             head_tokens.append(symbols)
+
+         batch_tokens.append(head_tokens)
+
+     batch_data = [[[int(val) for val in row] for row in matrix] for matrix in batch_tokens]
+
+     print(f"batch_data : {batch_data}")
+
+     # Convert to a PyTorch tensor
+     batch_tensor = torch.tensor(batch_data)
+
+     return batch_tensor
+
+
+ """
+ vector2ipa.py
+ =============
+
+ Map articulatory feature vectors (shape ≡ [*, 22]) to IPA symbols.
+
+ * If a row is an **exact** match for a symbol’s feature vector,
+   return that symbol.
+
+ * Otherwise compute the Levenshtein distance between the input
+   vector and every known IPA vector and choose the symbol with
+   the minimum distance.
+
+ Requires: panphon (pip install panphon)
+           numpy   (only for dtype / convenience, but any tensor works)
+
+ Author: <you>
+ """
+
+ import numpy as np
+ from typing import Iterable, List, Sequence, Tuple
+
+ import panphon                        # -- main feature database
+ from panphon.segment import Segment   # convenient Segment wrapper
+
+
+ # --------------------------------------------------------------------
+ # helpers
+ # --------------------------------------------------------------------
+ def _levenshtein(a: Sequence[int], b: Sequence[int]) -> int:
+     """Classic O(m·n) Levenshtein distance for two sequences of ints."""
+     m, n = len(a), len(b)
+     prev = list(range(n + 1))
+     curr = [0] * (n + 1)
+
+     for i in range(1, m + 1):
+         curr[0] = i
+         for j in range(1, n + 1):
+             cost = 0 if a[i - 1] == b[j - 1] else 1
+             curr[j] = min(
+                 curr[j - 1] + 1,      # insertion
+                 prev[j] + 1,          # deletion
+                 prev[j - 1] + cost    # substitution
+             )
+         prev, curr = curr, prev       # reuse buffers
+     return prev[n]
+
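Two quick sanity checks of the helper above on made-up feature vectors: identical vectors are at distance 0, and a single substituted value gives distance 1.

assert _levenshtein([1, -1, 0], [1, -1, 0]) == 0
assert _levenshtein([1, -1, 0], [1, 0, 0]) == 1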
+ def _as_int_vector(raw):
+     """Convert a PanPhon vector (numeric or ±0 string form) to a tuple of ints."""
+     if isinstance(raw[0], int):
+         return tuple(int(x) for x in raw)
+     map_sym = {'+': 1, '-': -1, '0': 0}
+     return tuple(map_sym[x] for x in raw)
+
+
+ def _build_inventory(ft):
+     ipa_syms, ipa_vecs = [], []
+
+     # ❶ Whatever version we’re on, get *something* iterable
+     seg_iter = getattr(ft, "segments", None) or getattr(ft, "_segments", None)
+     if seg_iter is None:
+         raise RuntimeError("Can't locate segment inventory on this PanPhon version.")
+
+     for item in seg_iter:
+         # ❷ Newer PanPhon: item = (symbol:str, Segment)
+         #    Older PanPhon: item = symbol:str
+         symbol = item[0] if isinstance(item, tuple) else item
+
+         # ❸ Grab the canonical 22-feature vector
+         try:
+             raw = ft.segment_to_vector(symbol)        # post-0.22
+         except TypeError:
+             raw = ft.segment_to_vector(symbol, True)  # ≤0.21 fallback
+
+         if raw is None:                               # skip tones, length marks…
+             continue
+         ipa_syms.append(symbol)
+         ipa_vecs.append(_as_int_vector(raw))          # → tuple[int, …]
+
+     return ipa_syms, ipa_vecs
+
+
+ # --------------------------------------------------------------------
+ # public API
+ # --------------------------------------------------------------------
+ def vectors_to_ipa(
+     tensor: Iterable[Sequence[int]],
+     ft: panphon.FeatureTable | None = None,
+ ) -> str:
+     """
+     Parameters
+     ----------
+     tensor
+         Any iterable yielding rows of 22 ints (values −1/0/+1).
+
+         Works with:
+             * list[list[int]]
+             * numpy.ndarray (shape [N,22] or [22])
+             * torch.Tensor (dtype=torch.int8 / int16 / int32)
+             * etc.
+
+     ft
+         Optionally pass in a pre-constructed FeatureTable so you
+         don’t pay the I/O cost repeatedly.
+
+     Returns
+     -------
+     str
+         The best-matching IPA symbol for each input row, joined by spaces.
+     """
+     # 🗄️ Load feature database exactly once
+     ft = ft or panphon.FeatureTable()
+     ipa_syms, ipa_vecs = _build_inventory(ft)
+
+     # ⚡ Small dict for constant-time exact look-ups
+     exact_lookup = {v: s for s, v in zip(ipa_syms, ipa_vecs)}
+
+     results: List[str] = []
+     for row in tensor:
+         vec = tuple(int(x) for x in row)   # normalise dtype
+
+         # 1️⃣ Exact hit?
+         if vec in exact_lookup:
+             results.append(exact_lookup[vec])
+             continue
+
+         # 2️⃣ Nearest neighbour by Levenshtein distance
+         best_sym, best_dist = None, float("inf")
+         for ref_vec, sym in zip(ipa_vecs, ipa_syms):
+             d = _levenshtein(vec, ref_vec)
+             if d < best_dist:
+                 best_dist, best_sym = d, sym
+                 if d == 0:   # early exit
+                     break
+         results.append(f"{best_sym}")
+
+     # Join the symbols into a space-separated string and return it to the caller
+     symbols_str = " ".join(results)
+     #print(symbols_str)
+     return symbols_str
+
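A hedged usage sketch for `vectors_to_ipa`: it needs panphon installed, and the all-zero rows below are made up, so they will usually fall through to the nearest-neighbour path and the printed symbols depend on the installed PanPhon inventory.

ft = panphon.FeatureTable()          # reuse one table to avoid repeated I/O
fake_rows = [[0] * 22, [0] * 22]     # made-up feature rows
print(vectors_to_ipa(fake_rows, ft)) # e.g. two space-separated IPA symbols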
+
+ def transcribe_to_ipa(audio_path):
+     batch_tensor = generate_tensor(audio_path)
+
+     batch_tensor = batch_tensor.squeeze(0)
+
+     symbols = vectors_to_ipa(batch_tensor.t())
+
+     return symbols
+
+ demo = gr.Interface(fn=transcribe_to_ipa, inputs=gr.Audio(type="filepath"), outputs="text")
+ demo.launch(share=True)
phd_model DELETED
@@ -1 +0,0 @@
- Subproject commit dfff4848baf1a6698c245e83f8768a577c353558