# cpr/scripts/get_probs.py
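"""Assign hit probabilities to similarity scores.

Given a table of hits with a "D_score" column, the script either (a) looks up
precomputed per-bin probabilities and interpolates between the bins that bracket
each score, or (b) calibrates probabilities against held-out data with a
simplified Venn-Abers (isotonic-regression) predictor.
"""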
import numpy as np
import pandas as pd
import argparse
from protein_conformal.util import *
def main(args):
df = pd.read_csv(args.input)
    if args.precomputed:
        # NOTE: if probabilities are precomputed, we can just load and assign them. This
        # involves the following steps:
        # 1. Load the precomputed per-bin probabilities
        # 2. For each hit, find the similarity-score bins that bracket its score
        # 3. Assign the hit a probability interpolated from the bracketing bins
        df_probs = pd.read_csv(args.precomputed_path)
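        # Expected layout of the precomputed table (one row per similarity bin): columns
        # "similarity", "prob_exact_p0", "prob_exact_p1" and, when --partial is used,
        # "prob_partial_p0", "prob_partial_p1". The lookup below assumes rows are sorted
        # by "similarity" in ascending order.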
prob_exact_lst, prob_partial_lst = [], []
for d in df["D_score"]:
# Check if there are any rows where similarity <= d
if len(df_probs[df_probs["similarity"] <= d]) > 0:
lower_bin = df_probs[df_probs["similarity"] <= d].iloc[-1]
else:
# If d is smaller than all similarities, use the smallest bin
lower_bin = df_probs.iloc[0]
# Check if there are any rows where similarity >= d
if len(df_probs[df_probs["similarity"] >= d]) > 0:
upper_bin = df_probs[df_probs["similarity"] >= d].iloc[0]
else:
# If d is larger than all similarities, use the largest bin
upper_bin = df_probs.iloc[-1]
# Get probabilities for lower bin, upper bin (columns "prob_exact_p0", "prob_exact_p1")
p_0_lower = lower_bin["prob_exact_p0"]
p_1_lower = lower_bin["prob_exact_p1"]
p_0_upper = upper_bin["prob_exact_p0"]
p_1_upper = upper_bin["prob_exact_p1"]
            # Coarse interpolation: take the midpoint of the smallest and largest of the
            # four bracketing-bin probabilities
prob_exact = np.mean([
min(p_0_lower, p_1_lower, p_0_upper, p_1_upper),
max(p_0_lower, p_1_lower, p_0_upper, p_1_upper)
])
prob_exact_lst.append(prob_exact)
if args.partial:
p_0_lower = lower_bin["prob_partial_p0"]
p_1_lower = lower_bin["prob_partial_p1"]
p_0_upper = upper_bin["prob_partial_p0"]
p_1_upper = upper_bin["prob_partial_p1"]
prob_partial = np.mean([
min(p_0_lower, p_1_lower, p_0_upper, p_1_upper),
max(p_0_lower, p_1_lower, p_0_upper, p_1_upper)
])
prob_partial_lst.append(prob_partial)
df["prob_exact"] = prob_exact_lst
if args.partial:
df["prob_partial"] = prob_partial_lst
else:
# Get a probability for each hit based on the distance using Venn-Abers / isotonic regression
# Load calibration data
data = np.load(
args.cal_data,
allow_pickle=True,
)
print("loading calibration data")
n_calib = args.n_calib
np.random.shuffle(data)
cal_data = data[:n_calib]
X_cal, y_cal = get_sims_labels(cal_data, partial=False)
X_cal = X_cal.flatten()
y_cal = y_cal.flatten()
print("getting exact probabilities")
p_s = []
for d in df["D_score"]:
p_0, p_1 = simplifed_venn_abers_prediction(X_cal, y_cal, d)
p_s.append([p_0, p_1])
p_s = np.array(p_s)
        df["prob_exact"] = np.mean(p_s, axis=1)  # midpoint of each (p_0, p_1) pair
if args.partial:
            # TODO: reloading may not be strictly necessary, but we noticed that shuffling
            # in place could mess up the originally loaded data, so the calibration file is
            # reloaded here before reshuffling
data = np.load(
args.cal_data,
allow_pickle=True,
)
print("loading calibration data")
np.random.shuffle(data)
cal_data = data[:n_calib]
X_cal, y_cal = get_sims_labels(cal_data, partial=True)
X_cal = X_cal.flatten()
y_cal = y_cal.flatten()
print("getting partial probabilities")
p_s = []
for d in df["D_score"]:
p_0, p_1 = simplifed_venn_abers_prediction(X_cal, y_cal, d)
p_s.append([p_0, p_1])
p_s = np.array(p_s)
            df["prob_partial"] = np.mean(p_s, axis=1)  # midpoint of each (p_0, p_1) pair
print("saving df new probabilities")
df.to_csv(
args.output,
index=False,
)
def parse_args():
    parser = argparse.ArgumentParser(description="Get probabilities for similarity scores")
parser.add_argument(
"--precomputed",
action='store_true',
default=False,
help="Use precomputed probabilities on similarity scores",
)
parser.add_argument(
"--precomputed_path",
type=str,
default="",
help="Path to precomputed probabilities. This will have probabilities for both partial and exact hits.",
)
parser.add_argument(
"--input",
type=str,
default="/groups/doudna/projects/ronb/conformal_backup/results_no_probs.csv",
help="Input tabular data with similarity scores and metadata.",
)
parser.add_argument(
"--output",
type=str,
default="/groups/doudna/projects/ronb/conformal_backup/results_with_probs.csv",
help="Output file for the results",
)
parser.add_argument(
"--partial",
action='store_true',
default=False,
help="Return probability of partial hits given similarity scores",
)
# parser.add_argument(
# "--alpha", type=float, default=0.1, help="Alpha value for the algorithm"
# )
# parser.add_argument(
# "--num_trials", type=int, default=100, help="Number of trials to run"
# )
parser.add_argument(
"--n_calib", type=int, default=100, help="Number of calibration data points"
)
parser.add_argument(
"--cal_data", type=str, default="/groups/doudna/projects/ronb/conformal_backup/protein-conformal/data/pfam_new_proteins.npy", help="Path to calibration data"
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)
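
# Example invocations (file names below are placeholders, not verified paths):
#   python scripts/get_probs.py --input hits_no_probs.csv --cal_data pfam_new_proteins.npy \
#       --n_calib 1000 --partial --output hits_with_probs.csv
#   python scripts/get_probs.py --input hits_no_probs.csv --precomputed \
#       --precomputed_path precomputed_bin_probs.csv --output hits_with_probs.csv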