Faham committed on
Commit · 5c87ec6 · 1 Parent(s): db77419
UPDATE: added all notebooks used during research and training
- README.md +0 -8
- notebooks/audio_attention_layer_training.ipynb +0 -0
- notebooks/audio_pytorch_ensemble_training.ipynb +0 -0
- notebooks/{audio_sentiment_analysis.ipynb → audio_wav2vec2_training.ipynb} +105 -8
- notebooks/{vision_sentiment_analysis.ipynb → vision_resnet50_training.ipynb} +0 -0
- notebooks/vision_swin_s_training.ipynb +0 -0

README.md
CHANGED
@@ -85,14 +85,6 @@ sentiment-fused/
 └── ui/                # User interface components
 ```
 
-### Directory Explanation
-
-- **`model_weights/`**: Contains the actual trained model files (`.pth` files) downloaded from Google Drive at inference time.
-- **`src/models/`**: Contains the Python code for model loading, inference, and prediction logic
-- **`src/utils/`**: Contains preprocessing utilities for audio, vision, and text data
-- **`src/config/`**: Contains centralized configuration settings for the entire application
-- **`src/ui/`**: Contains Streamlit UI components and styling
-
 ## Key Features
 
 - **Real-time Analysis**: Instant sentiment predictions with confidence scores

notebooks/audio_attention_layer_training.ipynb
ADDED
The diff for this file is too large to render. See raw diff.

notebooks/audio_pytorch_ensemble_training.ipynb
ADDED
The diff for this file is too large to render. See raw diff.

notebooks/{audio_sentiment_analysis.ipynb → audio_wav2vec2_training.ipynb}
RENAMED
@@ -368,8 +368,28 @@
 "from sklearn.utils.class_weight import compute_class_weight\n",
 "\n",
 "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
-"print(f\"\\nUsing device: {device}\")\n",
+"print(f\"\\nUsing device: {device}\")"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "1854c1a5"
+},
+"source": [
+"## Data Loading and Preparation\n",
 "\n",
+"This section downloads the RAVDESS and CREMA-D datasets using `kagglehub`. It then processes the metadata to create a DataFrame with file paths and sentiment labels, using a sentiment mapping to group related emotions."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "4ce1824e"
+},
+"outputs": [],
+"source": [
 "# --- Download & Process Datasets ---\n",
 "RAVDESS_PATH = kagglehub.dataset_download(\"uwrfkaggler/ravdess-emotional-speech-audio\")\n",
 "CREMA_D_PATH = kagglehub.dataset_download(\"ejlok1/cremad\")\n",
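
The added markdown cell above summarizes the data-loading step, but the mapping from raw emotion labels to sentiment classes is only described, not shown, in this hunk. Below is a minimal sketch of that pattern for RAVDESS, assuming the standard RAVDESS filename convention (the third dash-separated field is the emotion code); the emotion-to-sentiment grouping and the helper name are illustrative, not the notebook's exact code.

```python
# Illustrative sketch (not the notebook's exact code): build a labeled
# DataFrame from RAVDESS, assuming the standard filename convention where
# the third dash-separated field is the emotion code.
import os
import pandas as pd

# Hypothetical emotion-code -> sentiment grouping; the notebook's actual
# mapping may differ.
RAVDESS_EMOTION_TO_SENTIMENT = {
    "01": "neutral", "02": "neutral",    # neutral, calm
    "03": "positive", "08": "positive",  # happy, surprised
    "04": "negative", "05": "negative",  # sad, angry
    "06": "negative", "07": "negative",  # fearful, disgust
}

def build_ravdess_df(root_dir: str) -> pd.DataFrame:
    rows = []
    for dirpath, _, filenames in os.walk(root_dir):
        for name in filenames:
            if not name.endswith(".wav"):
                continue
            parts = name.split("-")
            if len(parts) < 3:
                continue
            sentiment = RAVDESS_EMOTION_TO_SENTIMENT.get(parts[2])
            if sentiment:
                rows.append({"filepath": os.path.join(dirpath, name),
                             "sentiment": sentiment})
    return pd.DataFrame(rows)
```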

@@ -393,8 +413,28 @@
 " if sentiment: crema_data.append({\"filepath\": os.path.join(crema_audio_path, filename), \"sentiment\": sentiment})\n",
 "crema_df = pd.DataFrame(crema_data)\n",
 "\n",
-"combined_df = pd.concat([ravdess_df, crema_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)\n",
+"combined_df = pd.concat([ravdess_df, crema_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "86de5576"
+},
+"source": [
+"## Wav2Vec2 Preparation and Dataset Class\n",
 "\n",
+"This section loads the pre-trained Wav2Vec 2.0 feature extractor and defines the `AudioDataset` class. This class handles loading the audio files, resampling them to the target sampling rate, and processing them using the Wav2Vec 2.0 feature extractor. It also includes a `collate_fn` to handle potential errors during audio processing."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "7222e5b9"
+},
+"outputs": [],
+"source": [
 "# =============================================================================\n",
 "# STEP 1: WAV2VEC2 PREPARATION\n",
 "# =============================================================================\n",
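
The next hunk only shows the tail of the `collate_fn` (the `default_collate` fall-through) and the DataLoader wiring, while the added markdown says the function handles audio-processing errors. A minimal sketch of that error-tolerant collate pattern, assuming failed loads are represented as `None` items; only the `default_collate` call appears verbatim in the diff.

```python
# Sketch of an error-tolerant collate function: drop samples that failed to
# load (returned None) before falling through to default_collate.
import torch

def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if len(batch) == 0:
        return None  # caller should skip empty batches
    return torch.utils.data.dataloader.default_collate(batch)
```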

@@ -427,8 +467,28 @@
 " return torch.utils.data.dataloader.default_collate(batch)\n",
 "train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)\n",
 "val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)\n",
-"test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)\n",
+"test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "31cb70d3"
+},
+"source": [
+"## Model and Training Setup\n",
 "\n",
+"This section loads the pre-trained Wav2Vec 2.0 model for audio classification and sets up the loss function and optimizer. It also calculates class weights to handle the imbalanced dataset."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "326b7435"
+},
+"outputs": [],
+"source": [
 "# =============================================================================\n",
 "# STEP 2: MODEL & TRAINING SETUP\n",
 "# =============================================================================\n",
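
For readers skimming the diff, here is a self-contained sketch of the class-weighted loss setup used in the next hunk, with a toy label array standing in for the notebook's `X_train_df['label']`.

```python
# Class-weighted cross-entropy: rare classes get larger weights so their
# errors cost more during training.
import numpy as np
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

labels = np.array([0, 0, 0, 1, 2, 2])  # imbalanced toy labels

class_weights_np = compute_class_weight(
    "balanced", classes=np.unique(labels), y=labels
)
class_weights = torch.tensor(class_weights_np, dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=class_weights)

logits = torch.randn(6, 3)  # (batch, num_classes)
loss = criterion(logits, torch.tensor(labels))
```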

@@ -436,8 +496,28 @@
 "model = AutoModelForAudioClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_CLASSES).to(device)\n",
 "class_weights_np = compute_class_weight('balanced', classes=np.unique(X_train_df['label']), y=X_train_df['label'])\n",
 "class_weights = torch.tensor(class_weights_np, dtype=torch.float32).to(device)\n",
-"criterion = nn.CrossEntropyLoss(weight=class_weights)\n",
+"criterion = nn.CrossEntropyLoss(weight=class_weights)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "8aaf7e60"
+},
+"source": [
+"## Two-Stage Fine-Tuning\n",
 "\n",
+"This section implements a two-stage fine-tuning process for the Wav2Vec 2.0 model. In Stage 1, only the classification head is trained with the base model frozen. In Stage 2, all layers are unfrozen and the entire model is fine-tuned with a lower learning rate and a learning rate scheduler."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "6bbccd3c"
+},
+"outputs": [],
+"source": [
 "# =============================================================================\n",
 "# STEP 3: TWO-STAGE FINE-TUNING\n",
 "# =============================================================================\n",
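
The added markdown cell describes the two-stage schedule, but the training loop itself falls outside the rendered hunks. A hedged sketch of how such a schedule is commonly wired up, assuming the classifier exposes its encoder as `model.wav2vec2` (as `Wav2Vec2ForSequenceClassification` does); the checkpoint name, learning rates, and scheduler choice are placeholders rather than the notebook's values.

```python
# Two-stage fine-tuning sketch: stage 1 trains only the classification head,
# stage 2 unfreezes the encoder and fine-tunes end to end at a lower LR.
import torch
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=3  # placeholder checkpoint/labels
)

# Stage 1: freeze the pre-trained encoder, train only the new head.
for param in model.wav2vec2.parameters():
    param.requires_grad = False
head_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(head_params, lr=1e-3)
# ... run the usual train/validate loop for a few epochs ...

# Stage 2: unfreeze everything, lower the learning rate, add a scheduler.
for param in model.parameters():
    param.requires_grad = True
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
# ... continue training; step the scheduler on validation loss ...
```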

@@ -508,8 +588,28 @@
 "\n",
 " if avg_val_loss < best_val_loss:\n",
 " best_val_loss = avg_val_loss\n",
-" torch.save(model.state_dict(), 'best_wav2vec2_model_2stage.pth')\n",
+" torch.save(model.state_dict(), 'best_wav2vec2_model_2stage.pth')"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "74a029a7"
+},
+"source": [
+"## Model Evaluation\n",
 "\n",
+"This section evaluates the final fine-tuned Wav2Vec 2.0 model on the hold-out test set. It loads the best model state dictionary, performs inference, and then displays the classification report and confusion matrix to assess the model's performance."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "2a156c15"
+},
+"outputs": [],
+"source": [
 "# =============================================================================\n",
 "# STEP 4: EVALUATE THE FINAL MODEL\n",
 "# =============================================================================\n",
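
The evaluation cell's body is not part of the rendered hunks. A rough sketch of the step the added markdown describes, assuming batches are dicts with `input_values` and `labels` keys and that `model`, `test_loader`, and `device` come from earlier cells; the class names passed to `classification_report` are placeholders.

```python
# Reload the best checkpoint, run inference on the test set, report metrics.
import torch
from sklearn.metrics import classification_report, confusion_matrix

model.load_state_dict(
    torch.load("best_wav2vec2_model_2stage.pth", map_location=device)
)
model.eval()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        if batch is None:  # skip batches the collate_fn dropped entirely
            continue
        inputs = batch["input_values"].to(device)   # assumed key name
        labels = batch["labels"].to(device)         # assumed key name
        logits = model(input_values=inputs).logits
        all_preds.extend(logits.argmax(dim=-1).cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

print(classification_report(all_labels, all_preds,
                            target_names=["negative", "neutral", "positive"]))
print(confusion_matrix(all_labels, all_preds))
```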

@@ -540,9 +640,6 @@
 "metadata": {
 "accelerator": "GPU",
 "colab": {
-"collapsed_sections": [
-"bubcKFNzLDh_"
-],
 "gpuType": "T4",
 "provenance": []
 },

notebooks/{vision_sentiment_analysis.ipynb → vision_resnet50_training.ipynb}
RENAMED
File without changes

notebooks/vision_swin_s_training.ipynb
ADDED
The diff for this file is too large to render. See raw diff.