"""Assign calibrated probabilities to similarity-score hits.

Probabilities are either interpolated from precomputed per-bin values or fit
on the fly with simplified Venn-Abers prediction over calibration data.
"""
import argparse

import numpy as np
import pandas as pd

from protein_conformal.util import *


def main(args):
    df = pd.read_csv(args.input)
    if args.precomputed:
        # NOTE: probabilities were precomputed per similarity-score bin:
        #   1. Load the precomputed probabilities.
        #   2. For each hit, find the bins that bracket its similarity score.
        #   3. Assign the hit a probability interpolated from those bins.
        df_probs = pd.read_csv(args.precomputed_path)
        prob_exact_lst, prob_partial_lst = [], []
        for d in df["D_score"]:
            # Nearest bin at or below d; if d is smaller than every binned
            # similarity, fall back to the smallest bin.
            below = df_probs[df_probs["similarity"] <= d]
            lower_bin = below.iloc[-1] if len(below) > 0 else df_probs.iloc[0]
            # Nearest bin at or above d; if d is larger than every binned
            # similarity, fall back to the largest bin.
            above = df_probs[df_probs["similarity"] >= d]
            upper_bin = above.iloc[0] if len(above) > 0 else df_probs.iloc[-1]
            # Each bin stores a Venn-Abers probability pair
            # ("prob_exact_p0", "prob_exact_p1").
            p_0_lower = lower_bin["prob_exact_p0"]
            p_1_lower = lower_bin["prob_exact_p1"]
            p_0_upper = upper_bin["prob_exact_p0"]
            p_1_upper = upper_bin["prob_exact_p1"]
            # Collapse the four bracketing values into one probability by
            # averaging their extremes.
            prob_exact = np.mean([
                min(p_0_lower, p_1_lower, p_0_upper, p_1_upper),
                max(p_0_lower, p_1_lower, p_0_upper, p_1_upper),
            ])
            prob_exact_lst.append(prob_exact)
            if args.partial:
                p_0_lower = lower_bin["prob_partial_p0"]
                p_1_lower = lower_bin["prob_partial_p1"]
                p_0_upper = upper_bin["prob_partial_p0"]
                p_1_upper = upper_bin["prob_partial_p1"]
                prob_partial = np.mean([
                    min(p_0_lower, p_1_lower, p_0_upper, p_1_upper),
                    max(p_0_lower, p_1_lower, p_0_upper, p_1_upper),
                ])
                prob_partial_lst.append(prob_partial)
        df["prob_exact"] = prob_exact_lst
        if args.partial:
            df["prob_partial"] = prob_partial_lst
    else:
        # Fit a probability for each hit from its similarity score using
        # simplified Venn-Abers prediction (isotonic regression).
        print("loading calibration data")
        data = np.load(args.cal_data, allow_pickle=True)
        n_calib = args.n_calib
        np.random.shuffle(data)
        cal_data = data[:n_calib]
        X_cal, y_cal = get_sims_labels(cal_data, partial=False)
        X_cal = X_cal.flatten()
        y_cal = y_cal.flatten()
        print("getting exact probabilities")
        p_s = []
        for d in df["D_score"]:
            p_0, p_1 = simplifed_venn_abers_prediction(X_cal, y_cal, d)
            p_s.append([p_0, p_1])
        p_s = np.array(p_s)
        # Width of each Venn-Abers interval; kept as a diagnostic, currently unused.
        abs_p = [np.abs(p[0] - p[1]) for p in p_s]
        df["prob_exact"] = np.mean(p_s, axis=1)
        if args.partial:
            # TODO: reloading may be unnecessary, but in-place shuffling was
            # observed to corrupt the original array, so reload to be safe.
            print("loading calibration data")
            data = np.load(args.cal_data, allow_pickle=True)
            np.random.shuffle(data)
            cal_data = data[:n_calib]
            X_cal, y_cal = get_sims_labels(cal_data, partial=True)
            X_cal = X_cal.flatten()
            y_cal = y_cal.flatten()
            print("getting partial probabilities")
            p_s = []
            for d in df["D_score"]:
                p_0, p_1 = simplifed_venn_abers_prediction(X_cal, y_cal, d)
                p_s.append([p_0, p_1])
            p_s = np.array(p_s)
            abs_p = [np.abs(p[0] - p[1]) for p in p_s]
            df["prob_partial"] = np.mean(p_s, axis=1)

    print("saving df with new probabilities")
    df.to_csv(args.output, index=False)
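
# `simplifed_venn_abers_prediction` is imported from protein_conformal.util and its
# body is not shown here. The sketch below is a hypothetical stand-in illustrating
# the usual "simplified" Venn-Abers procedure it presumably follows: fit isotonic
# regression on the calibration pairs twice, once with the test score provisionally
# labeled 0 and once labeled 1, then read off the two calibrated probabilities at
# that score. The helper name and the use of scikit-learn are assumptions, not part
# of this repository.
from sklearn.isotonic import IsotonicRegression


def _venn_abers_sketch(X_cal, y_cal, s):
    """Hypothetical illustration only; the real implementation is imported above."""
    probs = []
    for label in (0.0, 1.0):
        X = np.append(X_cal, s)  # calibration scores plus the test score
        y = np.append(y_cal, label)  # test point labeled 0 on one pass, 1 on the other
        iso = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
        iso.fit(X, y)
        probs.append(float(iso.predict([s])[0]))
    return probs[0], probs[1]  # (p_0, p_1) bracket the calibrated probability
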
probabilities for similarity scores") parser.add_argument( "--precomputed", action='store_true', default=False, help="Use precomputed probabilities on similarity scores", ) parser.add_argument( "--precomputed_path", type=str, default="", help="Path to precomputed probabilities. This will have probabilities for both partial and exact hits.", ) parser.add_argument( "--input", type=str, default="/groups/doudna/projects/ronb/conformal_backup/results_no_probs.csv", help="Input tabular data with similarity scores and metadata.", ) parser.add_argument( "--output", type=str, default="/groups/doudna/projects/ronb/conformal_backup/results_with_probs.csv", help="Output file for the results", ) parser.add_argument( "--partial", action='store_true', default=False, help="Return probability of partial hits given similarity scores", ) # parser.add_argument( # "--alpha", type=float, default=0.1, help="Alpha value for the algorithm" # ) # parser.add_argument( # "--num_trials", type=int, default=100, help="Number of trials to run" # ) parser.add_argument( "--n_calib", type=int, default=100, help="Number of calibration data points" ) parser.add_argument( "--cal_data", type=str, default="/groups/doudna/projects/ronb/conformal_backup/protein-conformal/data/pfam_new_proteins.npy", help="Path to calibration data" ) return parser.parse_args() if __name__ == "__main__": args = parse_args() main(args)