| import logging
|
| import warnings
|
| import pandas as pd
|
| import numpy as np
|
| import json
|
| import time
|
| from tqdm import tqdm
|
| import os
|
| from datetime import datetime as _dt, timezone as _tz
|
|
|
|
|
| from sklearn.exceptions import ConvergenceWarning
|
| from sklearn.mixture import GaussianMixture
|
| from sklearn.preprocessing import StandardScaler
|
| from sklearn.linear_model import LassoCV
|
| from sklearn.svm import SVC
|
| from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier
|
| from sklearn.pipeline import Pipeline
|
| from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
|
| from sklearn.metrics import (
|
| accuracy_score, precision_score, recall_score,
|
| f1_score, balanced_accuracy_score, matthews_corrcoef
|
| )
|
| from joblib import Memory, dump
|
|
|
|
|
|
|
|
|
# --- Run-time setup -------------------------------------------------------
# All log records go to a single file in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='nested_lodo_groups.log',
)

# Silence the two warning categories for the whole process.
for _muted_category in (UserWarning, ConvergenceWarning):
    warnings.filterwarnings('ignore', category=_muted_category)

# Pre-create one output directory per signature group and scenario number.
for _group_name in ('GBM', 'LM22'):
    for _scenario_no in (1, 2, 3):
        os.makedirs(f'models_{_group_name}/scenario_{_scenario_no}', exist_ok=True)

# On-disk joblib cache handed to the sklearn pipelines built further down.
memory = Memory(location='cache_dir', verbose=0)
|
|
|
|
|
| import numpy as _np
|
|
|
def _convert_obj(o):
    """Recursively convert numpy types/arrays to native Python objects for JSON dumping.

    Parameters
    ----------
    o : object
        Any value; containers (dict, list, tuple) are walked recursively.

    Returns
    -------
    object
        * anything exposing ``tolist()`` (numpy scalars and arrays, among
          others) is replaced by its ``tolist()`` result, which is then
          converted again so that object-dtype arrays do not leak numpy
          scalars or nested containers;
        * dicts keep their structure, with numpy-scalar keys replaced by
          ``key.item()`` and values converted;
        * lists and tuples become lists of converted items;
        * everything else is returned unchanged.

    If ``tolist()`` raises, ``str(o)`` is returned instead (best effort).
    """
    if hasattr(o, 'tolist') and not isinstance(o, (dict, list, str, bytes)):
        try:
            plain = o.tolist()
        except Exception:
            return str(o)
        # Object-dtype arrays hand their elements back untouched, so the
        # result may still contain numpy values; the identity check guards
        # against a tolist() that returns the very same object.
        return plain if plain is o else _convert_obj(plain)

    if isinstance(o, dict):
        # json.dump rejects numpy integer/bool keys, so unwrap those too.
        return {
            (k.item() if isinstance(k, _np.generic) else k): _convert_obj(v)
            for k, v in o.items()
        }

    if isinstance(o, (list, tuple)):
        return [_convert_obj(v) for v in o]

    return o
|
|
|
def _cv_results_to_serializable(cv_dict):
    """Convert sklearn cv_results_ dict values (numpy arrays) into lists where needed.

    Parameters
    ----------
    cv_dict : dict
        Mapping such as ``RandomizedSearchCV.cv_results_``.

    Returns
    -------
    dict
        Same keys. Values that expose ``tolist()`` are turned into lists and
        then passed through ``_convert_obj`` so that elements of object-dtype
        arrays are converted as well; a value whose ``tolist()`` raises is
        stored as ``str(value)``. All other values go through
        ``_convert_obj`` directly.
    """
    out = {}
    for k, v in cv_dict.items():
        if hasattr(v, 'tolist'):
            try:
                plain = v.tolist()
            except Exception:
                out[k] = str(v)
                continue
            # tolist() on an object-dtype array returns the stored objects
            # as-is, so walk the resulting list once more.
            out[k] = _convert_obj(plain)
        else:
            out[k] = _convert_obj(v)
    return out
|
|
|
|
|
|
|
|
|
def select_features(X, y, alphas=(0.1, 0.01), cv=5, max_iter=10000, n_jobs=-1, random_state=42):
    """Pick feature indices using LASSO, trying each penalty in turn.

    For every value in ``alphas`` (in the given order) a ``LassoCV`` model
    restricted to that single alpha is fitted on ``X`` / ``y``. The indices
    of the non-zero coefficients of the first fit that keeps at least one
    feature are returned.

    Parameters
    ----------
    X, y : array-like
        Training matrix and target passed straight to ``LassoCV.fit``.
    alphas : iterable of float
        Penalties to try, in order.
    cv, max_iter, n_jobs, random_state
        Forwarded unchanged to ``LassoCV``.

    Returns
    -------
    numpy.ndarray
        Positions of the columns of ``X`` with a non-zero coefficient.

    Raises
    ------
    ValueError
        If no alpha leaves any non-zero coefficient (including the case of
        an empty ``alphas``).
    """
    shared_settings = dict(cv=cv, max_iter=max_iter, n_jobs=n_jobs, random_state=random_state)

    for penalty in alphas:
        model = LassoCV(alphas=[penalty], **shared_settings)
        model.fit(X, y)

        kept = np.flatnonzero(model.coef_ != 0)
        if kept.size > 0:
            return kept

    raise ValueError(f"No features selected at alphas {alphas}")
|
|
|
|
|
|
|
|
|
# --- Input file locations ---------------------------------------------------
# Every path lives under one local data root; the sub-folders below are the
# three places the inputs are read from.
_DATA_ROOT = "C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm"
_RADIOMICS_DIR = f"{_DATA_ROOT}/Radiomics"
_IMMUNE_TRAIN_DIR = f"{_DATA_ROOT}/Genome/Heldout"
_IMMUNE_HELDOUT_DIR = f"{_DATA_ROOT}/Genome/Testing"

# Radiomics tables are identical for both signature groups:
# scenario id -> (training file, held-out file).
_RADIOMICS_FILES = {
    1: ("neuro_combat_radiomic_CGGA_Rem_CP_TC.csv", "Radiomics_LOOCV_test_Ivy.csv"),
    2: ("neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv", "Radiomics_LOOCV_test_TCGA.csv"),
    3: ("neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv", "Radiomics_LOOCV_test_CPTAC.csv"),
}


def _build_scenarios(immune_files):
    """Expand ``{scenario: (train_immune, heldout_immune)}`` into full path dicts.

    Both immune entries are paths relative to the training / held-out immune
    folders. The radiomics entries are taken from ``_RADIOMICS_FILES``.
    """
    table = {}
    for scenario_no, (train_immune, heldout_immune) in immune_files.items():
        train_radiomics, heldout_radiomics = _RADIOMICS_FILES[scenario_no]
        table[scenario_no] = {
            'train_radiomics': f"{_RADIOMICS_DIR}/{train_radiomics}",
            'train_immune': f"{_IMMUNE_TRAIN_DIR}/{train_immune}",
            'heldout_radiomics': f"{_RADIOMICS_DIR}/{heldout_radiomics}",
            'heldout_immune': f"{_IMMUNE_HELDOUT_DIR}/{heldout_immune}",
        }
    return table


scenarios_LM22 = _build_scenarios({
    1: ("heldout_Ivy/Cbx_LOOCV_heldout_Ivy_Lm22/CIBERSORTx_Job49_Results.csv",
        "IvyGAP/Test_Ivy_LM22/CIBERSORTx_Job55_Results.csv"),
    2: ("heldout_TCGA/Cbx_heldoutTCGA_Lm22/CIBERSORTx_Job47_Results.csv",
        "TCGA/Cbx_TCGA_Test_LM22/CIBERSORTx_Job53_Results.csv"),
    3: ("heldout_CPTAC/CBx_LOOCV_heldout_CPTAC_LM22/CIBERSORTx_Job51_Results.csv",
        "CPTAC/Test_CPTAC_LM22/CIBERSORTx_Job57_Results.csv"),
})

scenarios_GBM = _build_scenarios({
    1: ("heldout_Ivy/Cbx_LOOCV_heldout_Ivy_GBM/CIBERSORTx_Job50_Results.csv",
        "IvyGAP/Test_Ivy_GBM/CIBERSORTx_Job56_Results.csv"),
    2: ("heldout_TCGA/Cbx_LOOCV_TCGA_heldout_GBM/CIBERSORTx_Job48_Results.csv",
        "TCGA/TCGA_test_GBM/CIBERSORTx_Job54_Results.csv"),
    3: ("heldout_CPTAC/Cbx_LOOCV_heldout_CPTAC_GBM/CIBERSORTx_Job52_Results.csv",
        "CPTAC/Test_CPTAC_GBM/CIBERSORTx_Job58_Results.csv"),
})

# Signature group name -> its scenario table; iterated in this order below.
signature_groups = dict(LM22=scenarios_LM22, GBM=scenarios_GBM)
|
|
|
|
|
|
|
|
|
# Search space for the stand-alone SVM pipeline; keys address the 'clf' step.
param_dist_svm = dict(
    clf__C=[1, 10],
    clf__gamma=[0.01, 0.1],
    clf__kernel=['rbf'],
)

# Search space for the voting ensemble; keys address the 'ensemble' step and,
# below it, the 'svm' / 'rf' / 'gb' members.
param_dist_ensemble = dict(
    ensemble__svm__classifier__C=[1],
    ensemble__svm__classifier__kernel=['rbf'],
    ensemble__rf__n_estimators=[100, 200],
    ensemble__rf__max_depth=[None],
    ensemble__gb__max_iter=[100],
    ensemble__gb__learning_rate=[0.1],
)
|
|
|
|
|
|
|
|
|
def metrics(y_true, y_pred):
    """Score predictions against reference labels.

    Returns a dict with accuracy, precision, recall, F1, balanced accuracy
    and Matthews correlation. ``zero_division=1`` is passed to precision,
    recall and F1, so those scores are reported as 1 when their denominator
    is zero.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=1),
        'Recall': recall_score(y_true, y_pred, zero_division=1),
        'F1 Score': f1_score(y_true, y_pred, zero_division=1),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }


def _write_stamped_json(path, payload):
    """Write ``payload`` to ``path`` as JSON, prefixed with 'saved_at' and 'version'.

    Both stamps are derived from one UTC timestamp so they always agree.
    The parent directory is created when missing.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    now = _dt.now(_tz.utc)
    record = {'saved_at': now.isoformat(), 'version': now.strftime('%Y%m%d_%H%M%S')}
    record.update(payload)
    with open(path, 'w') as _f:
        json.dump(record, _f, indent=2)


# One pass per signature group; each group writes its own three summary files.
for sig_name, scenarios in signature_groups.items():
    all_results = {}
    all_features = {}
    all_cv = {}

    for scen_id, paths in scenarios.items():
        logging.info(f"[{sig_name}] Starting {scen_id}")
        t0 = time.time()

        # Training table: radiomics and immune rows joined on the index.
        rad_tr = pd.read_csv(paths['train_radiomics'], index_col=0)
        imm_tr = pd.read_csv(paths['train_immune'], index_col=0)
        df_tr = pd.merge(rad_tr, imm_tr, left_index=True, right_index=True, how='inner')

        # Held-out table, built the same way.
        rad_ho = pd.read_csv(paths['heldout_radiomics'], index_col=0)
        imm_ho = pd.read_csv(paths['heldout_immune'], index_col=0)
        df_ho = pd.merge(rad_ho, imm_ho, left_index=True, right_index=True, how='inner')

        scen_results = {}
        scen_features = {}
        scen_cv = {}
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Only immune columns present in both cohorts can serve as targets.
        immune_cols = imm_tr.columns.intersection(imm_ho.columns)
        if immune_cols.empty:
            raise ValueError(f"{sig_name}:{scen_id} - no matching immune features between train and held-out")
        logging.info(f"{sig_name}:{scen_id} - {len(immune_cols)} immune features: {immune_cols.tolist()}")

        # Feature indices chosen on the training matrix are applied to the
        # held-out matrix by position, so both matrices must be built from
        # the same columns in the same (training) order.
        ho_columns = set(df_ho.columns)
        shared_cols = [c for c in df_tr.columns if c in ho_columns]
        if len(shared_cols) != len(df_tr.columns) or len(shared_cols) != len(df_ho.columns):
            logging.warning(
                f"{sig_name}:{scen_id} - train/held-out columns differ; "
                f"using the {len(shared_cols)} shared columns "
                f"(train={len(df_tr.columns)}, held-out={len(df_ho.columns)})"
            )

        for col in tqdm(immune_cols, desc=f"{sig_name}:{scen_id}"):
            try:
                # Two-component GMM on the single immune column gives the
                # binary target for both cohorts.
                gmm = GaussianMixture(n_components=2, random_state=42)
                y_tr = gmm.fit_predict(df_tr[[col]].values)
                if len(np.unique(y_tr)) < 2:
                    logging.warning(f"{sig_name}:{scen_id}, col {col}: GMM produced a single class; skipped")
                    continue
                y_ho = gmm.predict(df_ho[[col]].values)

                # After this flip, label 1 always denotes the component with
                # the lower mean.
                # NOTE(review): confirm that the low-mean group is the
                # intended positive class; the GMM saved below still emits
                # its own, unflipped labels.
                m0, m1 = gmm.means_.flatten()
                if m0 < m1:
                    y_tr = 1 - y_tr
                    y_ho = 1 - y_ho

                artifact_stem = f'models_{sig_name}/scenario_{scen_id}/{sig_name}_scen{scen_id}_{col}'

                gmm_model_path = f'{artifact_stem}_gmm_model.joblib'
                dump(gmm, gmm_model_path)
                logging.info(f"Saved GMM model to {gmm_model_path}")
                logging.info(f"GMM means for {sig_name}:{scen_id}, col {col}: {gmm.means_.flatten().tolist()}")

                # Predictors: every shared column except the target itself.
                # NOTE(review): this keeps the remaining immune columns next
                # to the radiomics ones; confirm that is intended.
                feat_names = [c for c in shared_cols if c != col]
                if not feat_names:
                    raise ValueError("no predictor columns shared between train and held-out")
                X_tr = df_tr[feat_names].values
                X_ho = df_ho[feat_names].values
                sel = select_features(X_tr, y_tr)
                X_tr_sel, X_ho_sel = X_tr[:, sel], X_ho[:, sel]
                sel_names = [feat_names[i] for i in sel]

                _write_stamped_json(
                    f'{artifact_stem}_selected_features.json',
                    {'selected_features': sel_names},
                )

                # ---- stand-alone SVM ----
                pipe_svm = Pipeline([
                    ('scaler', StandardScaler()),
                    ('clf', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=memory)
                search_svm = RandomizedSearchCV(
                    pipe_svm, param_dist_svm, n_iter=5,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=-1, refit=True, error_score='raise'
                )
                search_svm.fit(X_tr_sel, y_tr)
                y_pred_svm = search_svm.predict(X_ho_sel)
                cv_svm = _cv_results_to_serializable(search_svm.cv_results_)

                svm_model_path = f'{artifact_stem}_svm_model.joblib'
                dump(search_svm.best_estimator_, svm_model_path)
                logging.info(f"Saved SVM model to {svm_model_path}")
                logging.info(f"SVM best params for {sig_name}:{scen_id}, col {col}: {search_svm.best_params_}")

                _write_stamped_json(
                    f'{artifact_stem}_svm_params.json',
                    {'best_params': _convert_obj(search_svm.best_params_)},
                )
                _write_stamped_json(f'{artifact_stem}_svm_cv.json', {'cv_results': cv_svm})

                # ---- soft-voting ensemble (SVM + random forest + gradient boosting) ----
                base_pipe = Pipeline([
                    ('scaler', StandardScaler()),
                    ('classifier', SVC(class_weight='balanced', probability=True, random_state=42))
                ], memory=memory)
                ensemble = VotingClassifier([
                    ('svm', base_pipe),
                    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
                    ('gb', HistGradientBoostingClassifier(random_state=42))
                ], voting='soft', weights=[1, 1, 1], n_jobs=-1)
                pipe_ens = Pipeline([
                    ('scaler', StandardScaler()),
                    ('ensemble', ensemble)
                ], memory=memory)
                search_ens = RandomizedSearchCV(
                    pipe_ens, param_dist_ensemble, n_iter=3,
                    cv=inner_cv, scoring='balanced_accuracy',
                    n_jobs=-1, refit=True, error_score='raise'
                )
                search_ens.fit(X_tr_sel, y_tr)
                y_pred_ens = search_ens.predict(X_ho_sel)
                cv_ens = _cv_results_to_serializable(search_ens.cv_results_)

                ens_model_path = f'{artifact_stem}_ens_model.joblib'
                dump(search_ens.best_estimator_, ens_model_path)
                logging.info(f"Saved Ensemble model to {ens_model_path}")
                logging.info(f"Ensemble best params for {sig_name}:{scen_id}, col {col}: {search_ens.best_params_}")

                _write_stamped_json(
                    f'{artifact_stem}_ens_params.json',
                    {'best_params': _convert_obj(search_ens.best_params_)},
                )
                _write_stamped_json(f'{artifact_stem}_ens_cv.json', {'cv_results': cv_ens})

                # Held-out scores are recorded only once every step succeeded.
                scen_results[col] = {'SVM': metrics(y_ho, y_pred_svm), 'Ensemble': metrics(y_ho, y_pred_ens)}
                scen_features[col] = sel_names
                scen_cv[col] = {'svm_cv': cv_svm, 'ensemble_cv': cv_ens}

            except Exception as e:
                # Best effort per target column: record the failure, with its
                # traceback, and carry on with the next column.
                logging.exception(f"{sig_name}:{scen_id}, col {col}: {e}")
                print(f"[ERROR] {sig_name}:{scen_id}, column {col}: {e}")

        all_results[scen_id] = scen_results
        all_features[scen_id] = scen_features
        all_cv[scen_id] = scen_cv
        logging.info(f"[{sig_name}] {scen_id} done in {time.time()-t0:.1f}s")

    # Per-group summaries across all scenarios.
    with open(f'nested_results111_{sig_name}.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    with open(f'nested_features111_{sig_name}.json', 'w') as f:
        json.dump(all_features, f, indent=2)
    with open(f'nested_cv111_{sig_name}.json', 'w') as f:
        json.dump(all_cv, f, indent=2)
    print(f"✅ {sig_name} group complete: scenarios={list(all_results.keys())}")

print("All signature groups processed.")
|
|
|