import os

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from scipy.stats import pearsonr, ttest_ind
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

# Load the precomputed ESM2 interaction embeddings.
data = pd.read_csv("embeddings/ESM2_interaction.csv")

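# Optional sanity check (a minimal sketch): fail early if the expected columns are
# missing. The column names are assumed from their use further down in this script.
expected_cols = {"Ligand Features", "Receptor Features", "Physical Features", "KD(M)"}
missing = expected_cols - set(data.columns)
if missing:
    raise KeyError(f"Missing expected columns: {sorted(missing)}")
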
# Parse the comma-separated feature strings into lists of floats
# (missing cells become empty lists).
for col in ["Ligand Features", "Receptor Features", "Physical Features"]:
    data[col] = (
        data[col]
        .fillna("")
        .apply(lambda s: [float(x) for x in str(s).split(",") if x.strip()])
    )

# Stack the per-row feature lists into 2-D arrays.
X_ligand = np.vstack(data["Ligand Features"].values)
X_receptor = np.vstack(data["Receptor Features"].values)
X_physical = np.vstack(data["Physical Features"].values)

# Feature matrices with and without the physical descriptors.
X_base = np.hstack([X_ligand, X_receptor])
X_full = np.hstack([X_base, X_physical])

# Target: log10-transformed dissociation constant KD (M).
raw_y = data["KD(M)"].values
y = np.log10(raw_y)

records = []

# 5 repeats of 5-fold cross-validation, run with and without the physical features.
for repeat in range(1, 6):
    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)

    for include_phys in (False, True):
        X_data = X_full if include_phys else X_base

        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
            X_train, X_test = X_data[train_idx], X_data[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # CatBoost regressor. task_type="GPU" with devices="0" assumes a CUDA GPU
            # is available; remove both arguments to train on CPU instead.
            model = CatBoostRegressor(
                iterations=2000,
                learning_rate=0.08,
                depth=4,
                verbose=500,
                task_type="GPU",
                devices="0",
            )

            model.fit(X_train, y_train)

            # Evaluate on the held-out fold.
            preds = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, preds))
            r2 = r2_score(y_test, preds)
            pcc = pearsonr(y_test, preds)[0]

            records.append({
                "repeat": repeat,
                "fold": fold_idx,
                "with_physical": include_phys,
                "pearson_r": pcc,
                "r2": r2,
                "rmse": rmse,
            })

metrics_df = pd.DataFrame(records)

# Save the per-fold metrics to CSV.
out_dir = "metrics"
os.makedirs(out_dir, exist_ok=True)
csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
metrics_df.to_csv(csv_path, index=False)
print(f"All metrics saved to {csv_path}")

# Welch's t-test comparing each metric with vs. without the physical features.
results = {}
for metric in ["pearson_r", "r2", "rmse"]:
    grp_with = metrics_df.loc[metrics_df.with_physical, metric]
    grp_without = metrics_df.loc[~metrics_df.with_physical, metric]
    t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
    results[metric] = (t_stat, p_val)

print("\nT-test results comparing with vs. without physical features:")
for m, (t_stat, p_val) in results.items():
    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")
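
# Optional sketch: per-condition summary of the CV metrics (mean/std over all
# repeats and folds), using only the metrics_df built above.
summary = metrics_df.groupby("with_physical")[["pearson_r", "r2", "rmse"]].agg(["mean", "std"])
print("\nSummary by condition (mean/std across repeats and folds):")
print(summary)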