BindPred / train.py
hbp5181's picture
Update train.py
4f89f53 verified
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, ttest_ind
from catboost import CatBoostRegressor
# Load dataset, this should be specified for which model will be trained(eg., embedding only or including physical terms)
data = pd.read_csv("embeddings/ESM2_interaction.csv")
# Fill missing feature strings (Features are chosen based on what kind of mdoel will be trained.
# Ligand and Receptor Features are ESM2 embeddings and Physical Features are PyRosetta Features
for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
data[col] = data[col].fillna("")
# Parse comma-separated floats
for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
data[col] = data[col].apply(
lambda s: [float(x) for x in str(s).split(",") if x.strip()]
)
# Build feature arrays
X_ligand = np.vstack(data["Ligand Features"].values)
X_receptor = np.vstack(data["Receptor Features"].values)
# optional: X_physical = np.vstack(data["Physical Features"].values)
# Convert KD(M) into log10 scale
raw_y = data["KD(M)"].values
y = np.log10(raw_y) # assumes all KD values are positive
records = []
# Repeat 5×5-fold CV, with and without physical features
for repeat in range(1, 6):
kf = KFold(n_splits=5, shuffle=True, random_state=repeat)
for include_phys in (False, True):
X_base = np.hstack([X_ligand, X_receptor])
X_full = np.hstack([X_base, X_physical])
X_data = X_full if include_phys else X_base
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
X_train, X_test = X_data[train_idx], X_data[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
# Initialize with your chosen hyperparameters and GPU support
model = CatBoostRegressor(
iterations=2000,
learning_rate=0.08,
depth=4,
verbose=500,
task_type="GPU",
devices="0"
)
# Train and time this fold
model.fit(X_train, y_train)
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)
pcc = pearsonr(y_test, preds)[0]
records.append({
"repeat": repeat,
"fold": fold_idx,
"with_physical": include_phys,
"pearson_r": pcc,
"r2": r2,
"rmse": rmse
})
# Aggregate metrics
metrics_df = pd.DataFrame(records)
# Save to CSV
out_dir = "metrics"
os.makedirs(out_dir, exist_ok=True)
csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
metrics_df.to_csv(csv_path, index=False)
print(f"All metrics saved to {csv_path}")
# Conduct independent t tests for each metric
results = {}
for metric in ["pearson_r", "r2", "rmse"]:
grp_with = metrics_df.loc[metrics_df.with_physical, metric]
grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
results[metric] = (t_stat, p_val)
print("\nT test results comparing with vs without physical features:")
for m, (t_stat, p_val) in results.items():
print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")