hbp5181
/

BindPred

Tabular Regression

Model card Files Files and versions

BindPred / train.py

hbp5181's picture

Update train.py

4f89f53 verified 4 months ago

history blame contribute delete

3.49 kB

	import os
	import pandas as pd
	import numpy as np
	from sklearn.model_selection import KFold
	from sklearn.metrics import mean_squared_error, r2_score
	from scipy.stats import pearsonr, ttest_ind
	from catboost import CatBoostRegressor

	# Load the interaction table. Point this path at the CSV that matches the
	# model variant being trained (e.g. embeddings only, or embeddings plus
	# physical terms).
	data = pd.read_csv("embeddings/ESM2_interaction.csv")

	# Every feature column stores one vector per row as a comma-separated string.
	# Per the dataset notes, the ligand/receptor columns are ESM2 embeddings and
	# the physical column holds PyRosetta features.
	feature_columns = ("Ligand Features", "Receptor Features", "Physical Features")


	def _to_float_list(cell):
	    """Split a comma-separated cell into floats, skipping blank tokens."""
	    return [float(token) for token in str(cell).split(",") if token.strip()]


	for column in feature_columns:
	    # Missing cells are replaced by "" first, so they parse to an empty list.
	    data[column] = data[column].fillna("").apply(_to_float_list)

	# Build feature arrays: each parsed per-row vector becomes one matrix row.
	# np.vstack needs every row to have the same length, so rows whose vectors
	# differ in length (e.g. a missing entry among populated ones) raise ValueError.
	X_ligand = np.vstack(data["Ligand Features"].values)
	X_receptor = np.vstack(data["Receptor Features"].values)
	# The cross-validation code below stacks X_physical onto the embeddings
	# unconditionally, so it must be defined here; leaving this line commented
	# out makes the script stop with a NameError.
	X_physical = np.vstack(data["Physical Features"].values)

	# Convert KD(M) into log10 scale
	raw_y = data["KD(M)"].values
	# log10 yields -inf for 0 and NaN for negative or missing values, which would
	# silently corrupt the regression target, so fail early with a clear message.
	if not np.all(raw_y > 0):
	    raise ValueError(
	        "KD(M) must be strictly positive and non-missing to take log10"
	    )
	y = np.log10(raw_y)

	# One dict per (repeat, condition, fold) is appended here.
	records = []

	# Both design matrices are the same on every iteration, so build them once
	# instead of re-stacking inside the loops.
	# NOTE: X_physical (the stacked "Physical Features" matrix) must be defined
	# before this point, otherwise the next lines raise NameError.
	X_base = np.hstack([X_ligand, X_receptor])
	X_full = np.hstack([X_base, X_physical])

	# Fixed hyperparameters shared by every fold; training runs on GPU device "0".
	catboost_params = {
	    "iterations": 2000,
	    "learning_rate": 0.08,
	    "depth": 4,
	    "verbose": 500,
	    "task_type": "GPU",
	    "devices": "0",
	}

	# Repeat 5×5-fold CV, with and without physical features
	for repeat in range(1, 6):
	    # Seeding with the repeat index gives a different shuffle per repeat.
	    # The same splitter is reused for both conditions below.
	    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)

	    for include_phys in (False, True):
	        X_data = X_full if include_phys else X_base

	        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
	            X_train, X_test = X_data[train_idx], X_data[test_idx]
	            y_train, y_test = y[train_idx], y[test_idx]

	            # A fresh model per fold so no state carries over between folds.
	            model = CatBoostRegressor(**catboost_params)
	            model.fit(X_train, y_train)

	            # Score the held-out fold (targets are on the log10 scale).
	            preds = model.predict(X_test)
	            rmse = np.sqrt(mean_squared_error(y_test, preds))
	            r2 = r2_score(y_test, preds)
	            pcc = pearsonr(y_test, preds)[0]  # [0] is the correlation coefficient

	            records.append({
	                "repeat": repeat,
	                "fold": fold_idx,
	                "with_physical": include_phys,
	                "pearson_r": pcc,
	                "r2": r2,
	                "rmse": rmse,
	            })

	# Collect the per-fold result dicts into a single table (one row per record).
	metrics_df = pd.DataFrame(records)

	# Write the table under ./metrics, creating the folder on first use.
	out_dir = "metrics"
	csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
	os.makedirs(out_dir, exist_ok=True)  # no error if the folder already exists
	metrics_df.to_csv(csv_path, index=False)  # index=False: omit the row index
	print(f"All metrics saved to {csv_path}")

	# Compare the two conditions metric by metric with Welch's t-test
	# (equal_var=False, i.e. the group variances are not assumed equal).
	# NOTE(review): both conditions appear to be scored on the same CV folds, so a
	# paired test may be more appropriate than an independent one - confirm.
	has_phys = metrics_df.with_physical
	results = {}
	for metric_name in ("pearson_r", "r2", "rmse"):
	    scores = metrics_df[metric_name]
	    statistic, p_value = ttest_ind(
	        scores[has_phys], scores[~has_phys], equal_var=False
	    )
	    results[metric_name] = (statistic, p_value)

	# Report the t statistic and p-value per metric, rounded to three decimals.
	print("\nT test results comparing with vs without physical features:")
	for m, (t_stat, p_val) in results.items():
	    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")