PRECISE_GBM / Scenario_heldout_final_PRECISE.py

Upload 8 files

e386fee verified 5 months ago

18.4 kB

	import logging
	import warnings
	import pandas as pd
	import numpy as np
	import json
	import time
	from tqdm import tqdm
	import os
	from datetime import datetime as _dt, timezone as _tz


	from sklearn.exceptions import ConvergenceWarning
	from sklearn.mixture import GaussianMixture
	from sklearn.preprocessing import StandardScaler
	from sklearn.linear_model import LassoCV
	from sklearn.svm import SVC
	from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
	from sklearn.metrics import (
	accuracy_score, precision_score, recall_score,
	f1_score, balanced_accuracy_score, matthews_corrcoef
	)
	from joblib import Memory, dump

	# -------------------------
	# Logging & warnings
	# -------------------------
	# Send INFO-and-above records to a run log file with timestamped lines.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    filename='nested_lodo_groups.log',
	)

	# Silence the two warning categories the searches would otherwise repeat.
	for _ignored_category in (UserWarning, ConvergenceWarning):
	    warnings.filterwarnings('ignore', category=_ignored_category)

	# Create the per-signature / per-scenario model directories up front so
	# later model dumps can assume they exist.
	for _signature_dir in ('GBM', 'LM22'):
	    for _scenario_no in (1, 2, 3):
	        os.makedirs(f'models_{_signature_dir}/scenario_{_scenario_no}', exist_ok=True)

	# -------------------------
	# Caching for pipelines
	# -------------------------
	# On-disk joblib cache handed to the sklearn Pipelines built further down.
	memory = Memory(location='cache_dir', verbose=0)

	# Helper: convert numpy scalars/arrays and dicts into JSON-serializable Python types
	import numpy as _np

	def _convert_obj(o):
	    """Recursively convert numpy types/arrays to native Python objects for JSON dumping.

	    Args:
	        o: Any object. Dicts, lists and tuples are walked recursively.

	    Returns:
	        A structure in which numpy scalars have become native Python
	        scalars, anything else exposing ``tolist()`` (e.g. numpy arrays)
	        has become the result of that call, and tuples have become lists.
	        Objects that need no conversion are returned unchanged. If
	        ``tolist()`` raises, ``str(o)`` is returned as a best-effort
	        fallback.
	    """
	    scalar_types = (_np.integer, _np.floating, _np.bool_)

	    # dict -> convert values AND numpy-scalar keys; json.dump rejects keys
	    # such as np.int64 because they are not instances of Python int.
	    if isinstance(o, dict):
	        return {
	            (k.item() if isinstance(k, scalar_types) else k): _convert_obj(v)
	            for k, v in o.items()
	        }
	    # list/tuple -> convert items (tuples come back as lists, as in JSON)
	    if isinstance(o, (list, tuple)):
	        return [_convert_obj(v) for v in o]
	    # plain text / bytes never need conversion
	    if isinstance(o, (str, bytes)):
	        return o
	    # numpy scalar -> python native; checked before the generic ``tolist``
	    # test because numpy scalars expose ``tolist`` as well
	    if isinstance(o, scalar_types):
	        return o.item()
	    # numpy arrays (and other array-likes) -> lists
	    if hasattr(o, 'tolist'):
	        try:
	            return o.tolist()
	        except Exception:
	            return str(o)
	    # otherwise return as-is
	    return o

	def _cv_results_to_serializable(cv_dict):
	    """Return a copy of an sklearn ``cv_results_`` mapping that ``json`` can dump.

	    Values exposing ``tolist()`` are replaced by the result of that call
	    (or by ``str(value)`` if the call raises); every other value is passed
	    through ``_convert_obj``. Keys are kept unchanged.
	    """
	    def _plain(value):
	        # Anything without ``tolist`` is delegated to the generic converter.
	        if not hasattr(value, 'tolist'):
	            return _convert_obj(value)
	        try:
	            return value.tolist()
	        except Exception:
	            return str(value)

	    return {name: _plain(value) for name, value in cv_dict.items()}

	# -------------------------
	# Utility: two-step Lasso selection
	# -------------------------
	def select_features(X, y, alphas=(0.1, 0.01), cv=5, max_iter=10000, n_jobs=-1, random_state=42):
	    """Pick feature columns with a Lasso fit, trying each alpha in order.

	    For every value in ``alphas`` a ``LassoCV`` restricted to that single
	    alpha is fitted on ``(X, y)``; the first fit that leaves at least one
	    non-zero coefficient decides the selection.

	    Args:
	        X: Feature matrix passed to ``LassoCV.fit``.
	        y: Target vector passed to ``LassoCV.fit``.
	        alphas: Regularisation strengths to try, in order.
	        cv, max_iter, n_jobs, random_state: Forwarded to ``LassoCV``.

	    Returns:
	        1-D array with the indices of the non-zero coefficients.

	    Raises:
	        ValueError: If no alpha leaves any non-zero coefficient.
	    """
	    shared_kwargs = dict(cv=cv, max_iter=max_iter, n_jobs=n_jobs, random_state=random_state)
	    for penalty in alphas:
	        model = LassoCV(alphas=[penalty], **shared_kwargs)
	        model.fit(X, y)
	        # flat indices of the coefficients the penalty did not shrink to zero
	        kept = np.flatnonzero(model.coef_)
	        if kept.size:
	            return kept
	    raise ValueError(f"No features selected at alphas {alphas}")

	# -------------------------
	# Scenario definitions (input CSV paths) for the two signature groups
	# -------------------------
	# Root folder of the input data. Defaults to the original author's location;
	# set the PRECISE_GBM_DATA_ROOT environment variable to run elsewhere.
	DATA_ROOT = (
	    os.environ.get('PRECISE_GBM_DATA_ROOT')
	    or r"C:/Users/pg22/Downloads/PRECISE-GBM/LOOCV_withoutHarm"
	)

	# scenario id -> (training radiomics file, held-out radiomics file).
	# The radiomics inputs are the same for both signature groups.
	_RADIOMICS_FILES = {
	    1: ("neuro_combat_radiomic_CGGA_Rem_CP_TC.csv", "Radiomics_LOOCV_test_Ivy.csv"),
	    2: ("neuro_combat_radiomic_CGGA_Rem_CP_ivy.csv", "Radiomics_LOOCV_test_TCGA.csv"),
	    3: ("neuro_combat_radiomic_CGGA_Rem_TC_ivy.csv", "Radiomics_LOOCV_test_CPTAC.csv"),
	}

	# signature -> scenario id -> (training immune file, held-out immune file),
	# both relative to the "Genome" folder under DATA_ROOT.
	_IMMUNE_FILES = {
	    'LM22': {
	        1: ("Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_Lm22/CIBERSORTx_Job49_Results.csv",
	            "Testing/IvyGAP/Test_Ivy_LM22/CIBERSORTx_Job55_Results.csv"),
	        2: ("Heldout/heldout_TCGA/Cbx_heldoutTCGA_Lm22/CIBERSORTx_Job47_Results.csv",
	            "Testing/TCGA/Cbx_TCGA_Test_LM22/CIBERSORTx_Job53_Results.csv"),
	        3: ("Heldout/heldout_CPTAC/CBx_LOOCV_heldout_CPTAC_LM22/CIBERSORTx_Job51_Results.csv",
	            "Testing/CPTAC/Test_CPTAC_LM22/CIBERSORTx_Job57_Results.csv"),
	    },
	    'GBM': {
	        1: ("Heldout/heldout_Ivy/Cbx_LOOCV_heldout_Ivy_GBM/CIBERSORTx_Job50_Results.csv",
	            "Testing/IvyGAP/Test_Ivy_GBM/CIBERSORTx_Job56_Results.csv"),
	        2: ("Heldout/heldout_TCGA/Cbx_LOOCV_TCGA_heldout_GBM/CIBERSORTx_Job48_Results.csv",
	            "Testing/TCGA/TCGA_test_GBM/CIBERSORTx_Job54_Results.csv"),
	        3: ("Heldout/heldout_CPTAC/Cbx_LOOCV_heldout_CPTAC_GBM/CIBERSORTx_Job52_Results.csv",
	            "Testing/CPTAC/Test_CPTAC_GBM/CIBERSORTx_Job58_Results.csv"),
	    },
	}


	def _build_scenarios(signature):
	    """Return ``{scenario_id: {input name: path}}`` for one signature group."""
	    built = {}
	    for scenario_no, (rad_train, rad_heldout) in _RADIOMICS_FILES.items():
	        imm_train, imm_heldout = _IMMUNE_FILES[signature][scenario_no]
	        built[scenario_no] = {
	            'train_radiomics': f"{DATA_ROOT}/Radiomics/{rad_train}",
	            'train_immune': f"{DATA_ROOT}/Genome/{imm_train}",
	            'heldout_radiomics': f"{DATA_ROOT}/Radiomics/{rad_heldout}",
	            'heldout_immune': f"{DATA_ROOT}/Genome/{imm_heldout}",
	        }
	    return built


	# Scenario definitions_LM22
	scenarios_LM22 = _build_scenarios('LM22')
	# Scenario definitions_GBM
	scenarios_GBM = _build_scenarios('GBM')

	signature_groups = {
	    'LM22': scenarios_LM22,
	    'GBM': scenarios_GBM
	}

	# -------------------------
	# Hyperparameter grids
	# -------------------------
	# Search space for the stand-alone SVM pipeline; the ``clf__`` prefix
	# addresses the pipeline step named 'clf'.
	param_dist_svm = dict(
	    clf__C=[1, 10],
	    clf__gamma=[0.01, 0.1],
	    clf__kernel=['rbf'],
	)

	# Search space for the voting ensemble; keys follow the nesting
	# <pipeline step>__<voting member>__<parameter>.
	param_dist_ensemble = dict(
	    ensemble__svm__classifier__C=[1],
	    ensemble__svm__classifier__kernel=['rbf'],
	    ensemble__rf__n_estimators=[100, 200],
	    ensemble__rf__max_depth=[None],
	    ensemble__gb__max_iter=[100],
	    ensemble__gb__learning_rate=[0.1],
	)

	# -------------------------
	# Helpers for the per-target training loop
	# -------------------------
	def _utc_stamp():
	    """Return ``saved_at`` / ``version`` metadata taken from one UTC clock read."""
	    now = _dt.now(_tz.utc)
	    return {'saved_at': now.isoformat(), 'version': now.strftime('%Y%m%d_%H%M%S')}


	def _write_json(path, payload):
	    """Dump *payload* as indented JSON to *path*, creating its folder if needed."""
	    parent = os.path.dirname(path)
	    if parent:
	        os.makedirs(parent, exist_ok=True)
	    with open(path, 'w', encoding='utf-8') as fh:
	        json.dump(payload, fh, indent=2)


	def _classification_metrics(y_true, y_pred):
	    """Return the metric dict stored for one model's held-out predictions."""
	    return {
	        'Accuracy': accuracy_score(y_true, y_pred),
	        'Precision': precision_score(y_true, y_pred, zero_division=1),
	        'Recall': recall_score(y_true, y_pred, zero_division=1),
	        'F1 Score': f1_score(y_true, y_pred, zero_division=1),
	        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
	        'MCC': matthews_corrcoef(y_true, y_pred)
	    }


	def _save_search_artifacts(search, prefix, tag, label, context):
	    """Persist a fitted search and return its JSON-ready ``cv_results_``.

	    Writes ``<prefix>_<tag>_model.joblib`` (the refit best estimator),
	    ``<prefix>_<tag>_params.json`` and ``<prefix>_<tag>_cv.json``.
	    *label* and *context* are only used in the log messages.
	    """
	    model_path = f'{prefix}_{tag}_model.joblib'
	    dump(search.best_estimator_, model_path)
	    logging.info(f"Saved {label} model to {model_path}")
	    logging.info(f"{label} best params for {context}: {search.best_params_}")

	    # Best params and CV table are stored for reproducibility / retraining.
	    _write_json(
	        f'{prefix}_{tag}_params.json',
	        {**_utc_stamp(), 'best_params': _convert_obj(search.best_params_)}
	    )
	    cv_results = _cv_results_to_serializable(search.cv_results_)
	    _write_json(f'{prefix}_{tag}_cv.json', {**_utc_stamp(), 'cv_results': cv_results})
	    return cv_results


	# -------------------------
	# Process each signature group
	# -------------------------
	for sig_name, scenarios in signature_groups.items():
	    all_results = {}
	    all_features = {}
	    all_cv = {}

	    for scen_id, paths in scenarios.items():
	        logging.info(f"[{sig_name}] Starting {scen_id}")
	        t0 = time.time()

	        # Load & align training data (inner join on the CSV index column)
	        rad_tr = pd.read_csv(paths['train_radiomics'], index_col=0)
	        imm_tr = pd.read_csv(paths['train_immune'], index_col=0)
	        df_tr = pd.merge(rad_tr, imm_tr, left_index=True, right_index=True, how='inner')

	        # Load & align held-out data
	        rad_ho = pd.read_csv(paths['heldout_radiomics'], index_col=0)
	        imm_ho = pd.read_csv(paths['heldout_immune'], index_col=0)
	        df_ho = pd.merge(rad_ho, imm_ho, left_index=True, right_index=True, how='inner')

	        scen_results = {}
	        scen_features = {}
	        scen_cv = {}
	        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

	        # Determine immune feature columns (may differ by signature)
	        immune_cols = imm_tr.columns.intersection(imm_ho.columns)
	        if immune_cols.empty:
	            raise ValueError(f"{sig_name}:{scen_id} - no matching immune features between train and held-out")
	        logging.info(f"{sig_name}:{scen_id} - {len(immune_cols)} immune features: {immune_cols.tolist()}")

	        # Predictors must exist on BOTH sides and be addressed by name: the
	        # indices chosen on the training matrix are later applied to the
	        # held-out matrix, so both must share one column order.
	        heldout_columns = set(df_ho.columns)
	        shared_cols = [c for c in df_tr.columns if c in heldout_columns]
	        train_only = [c for c in df_tr.columns if c not in heldout_columns]
	        if train_only:
	            logging.warning(
	                f"{sig_name}:{scen_id} - ignoring {len(train_only)} training columns "
	                f"missing from held-out data: {train_only}"
	            )

	        model_dir = f'models_{sig_name}/scenario_{scen_id}'
	        os.makedirs(model_dir, exist_ok=True)

	        for col in tqdm(immune_cols, desc=f"{sig_name}:{scen_id}"):
	            context = f"{sig_name}:{scen_id}, col {col}"
	            prefix = f'{model_dir}/{sig_name}_scen{scen_id}_{col}'
	            try:
	                # GMM labeling on train
	                gmm = GaussianMixture(n_components=2, random_state=42)
	                y_tr = gmm.fit_predict(df_tr[[col]].values)
	                if len(np.unique(y_tr)) < 2:
	                    logging.warning(f"{context}: GMM assigned a single class on train; target skipped")
	                    continue
	                y_ho = gmm.predict(df_ho[[col]].values)
	                # ensure label 1 = higher mean: labels are component indices,
	                # so they only need swapping when component 0 has the larger
	                # mean. (The previous `m0 < m1` test swapped in exactly the
	                # opposite case.) The saved GMM still emits raw component
	                # indices; consumers must apply the same rule.
	                m0, m1 = gmm.means_.flatten()
	                if m0 > m1:
	                    y_tr = 1 - y_tr
	                    y_ho = 1 - y_ho
	                # save gmm model
	                gmm_model_path = f'{prefix}_gmm_model.joblib'
	                dump(gmm, gmm_model_path)
	                logging.info(f"Saved GMM model to {gmm_model_path}")
	                logging.info(f"GMM means for {sig_name}:{scen_id}, col {col}: {gmm.means_.flatten().tolist()}")

	                # Feature selection on every shared column except the target
	                feat_names = [c for c in shared_cols if c != col]
	                X_tr = df_tr[feat_names].values
	                X_ho = df_ho[feat_names].values
	                sel = select_features(X_tr, y_tr)
	                X_tr_sel, X_ho_sel = X_tr[:, sel], X_ho[:, sel]
	                sel_names = [feat_names[i] for i in sel]

	                # Save selected feature names for this model so retraining can reuse them
	                _write_json(
	                    f'{prefix}_selected_features.json',
	                    {**_utc_stamp(), 'selected_features': sel_names}
	                )

	                # SVM: hyperparameter search on train, evaluation on held-out
	                pipe_svm = Pipeline([
	                    ('scaler', StandardScaler()),
	                    ('clf', SVC(class_weight='balanced', probability=True, random_state=42))
	                ], memory=memory)
	                search_svm = RandomizedSearchCV(
	                    pipe_svm, param_dist_svm, n_iter=5,
	                    cv=inner_cv, scoring='balanced_accuracy',
	                    n_jobs=-1, refit=True, error_score='raise'
	                )
	                search_svm.fit(X_tr_sel, y_tr)
	                y_pred_svm = search_svm.predict(X_ho_sel)
	                cv_svm = _save_search_artifacts(search_svm, prefix, 'svm', 'SVM', context)

	                # Ensemble: soft-voting SVM + random forest + gradient boosting
	                base_pipe = Pipeline([
	                    ('scaler', StandardScaler()),
	                    ('classifier', SVC(class_weight='balanced', probability=True, random_state=42))
	                ], memory=memory)
	                ensemble = VotingClassifier([
	                    ('svm', base_pipe),
	                    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
	                    ('gb', HistGradientBoostingClassifier(random_state=42))
	                ], voting='soft', weights=[1, 1, 1], n_jobs=-1)
	                pipe_ens = Pipeline([
	                    ('scaler', StandardScaler()),
	                    ('ensemble', ensemble)
	                ], memory=memory)
	                search_ens = RandomizedSearchCV(
	                    pipe_ens, param_dist_ensemble, n_iter=3,
	                    cv=inner_cv, scoring='balanced_accuracy',
	                    n_jobs=-1, refit=True, error_score='raise'
	                )
	                search_ens.fit(X_tr_sel, y_tr)
	                y_pred_ens = search_ens.predict(X_ho_sel)
	                cv_ens = _save_search_artifacts(search_ens, prefix, 'ens', 'Ensemble', context)

	                # Metrics on the held-out cohort
	                scen_results[col] = {
	                    'SVM': _classification_metrics(y_ho, y_pred_svm),
	                    'Ensemble': _classification_metrics(y_ho, y_pred_ens)
	                }
	                scen_features[col] = sel_names
	                scen_cv[col] = {'svm_cv': cv_svm, 'ensemble_cv': cv_ens}

	            except Exception as e:
	                # Best effort: one failing target must not abort the run, but
	                # keep the traceback in the log so the failure can be diagnosed.
	                logging.exception(f"{sig_name}:{scen_id}, col {col}: {e}")
	                print(f"[ERROR] {sig_name}:{scen_id}, column {col}: {e}")

	        # Save for this scenario
	        all_results[scen_id] = scen_results
	        all_features[scen_id] = scen_features
	        all_cv[scen_id] = scen_cv
	        logging.info(f"[{sig_name}] {scen_id} done in {time.time()-t0:.1f}s")

	    # Write group-level JSONs
	    _write_json(f'nested_results111_{sig_name}.json', all_results)
	    _write_json(f'nested_features111_{sig_name}.json', all_features)
	    _write_json(f'nested_cv111_{sig_name}.json', all_cv)
	    print(f"✅ {sig_name} group complete: scenarios={list(all_results.keys())}")

	print("All signature groups processed.")