import os import pandas as pd import numpy as np from sklearn.model_selection import KFold from sklearn.metrics import mean_squared_error, r2_score from scipy.stats import pearsonr, ttest_ind from catboost import CatBoostRegressor # Load dataset, this should be specified for which model will be trained(eg., embedding only or including physical terms) data = pd.read_csv("embeddings/ESM2_interaction.csv") # Fill missing feature strings (Features are chosen based on what kind of mdoel will be trained. # Ligand and Receptor Features are ESM2 embeddings and Physical Features are PyRosetta Features for col in ["Ligand Features", "Receptor Features", "Physical Features"]: data[col] = data[col].fillna("") # Parse comma-separated floats for col in ["Ligand Features", "Receptor Features", "Physical Features"]: data[col] = data[col].apply( lambda s: [float(x) for x in str(s).split(",") if x.strip()] ) # Build feature arrays X_ligand = np.vstack(data["Ligand Features"].values) X_receptor = np.vstack(data["Receptor Features"].values) # optional: X_physical = np.vstack(data["Physical Features"].values) # Convert KD(M) into log10 scale raw_y = data["KD(M)"].values y = np.log10(raw_y) # assumes all KD values are positive records = [] # Repeat 5×5-fold CV, with and without physical features for repeat in range(1, 6): kf = KFold(n_splits=5, shuffle=True, random_state=repeat) for include_phys in (False, True): X_base = np.hstack([X_ligand, X_receptor]) X_full = np.hstack([X_base, X_physical]) X_data = X_full if include_phys else X_base for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1): X_train, X_test = X_data[train_idx], X_data[test_idx] y_train, y_test = y[train_idx], y[test_idx] # Initialize with your chosen hyperparameters and GPU support model = CatBoostRegressor( iterations=2000, learning_rate=0.08, depth=4, verbose=500, task_type="GPU", devices="0" ) # Train and time this fold model.fit(X_train, y_train) preds = model.predict(X_test) rmse = np.sqrt(mean_squared_error(y_test, preds)) r2 = r2_score(y_test, preds) pcc = pearsonr(y_test, preds)[0] records.append({ "repeat": repeat, "fold": fold_idx, "with_physical": include_phys, "pearson_r": pcc, "r2": r2, "rmse": rmse }) # Aggregate metrics metrics_df = pd.DataFrame(records) # Save to CSV out_dir = "metrics" os.makedirs(out_dir, exist_ok=True) csv_path = os.path.join(out_dir, "InteractionMetrics.csv") metrics_df.to_csv(csv_path, index=False) print(f"All metrics saved to {csv_path}") # Conduct independent t tests for each metric results = {} for metric in ["pearson_r", "r2", "rmse"]: grp_with = metrics_df.loc[metrics_df.with_physical, metric] grp_without = metrics_df.loc[~metrics_df.with_physical, metric] t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False) results[metric] = (t_stat, p_val) print("\nT test results comparing with vs without physical features:") for m, (t_stat, p_val) in results.items(): print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")