-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels_HIV_2007.py
205 lines (192 loc) · 9.65 KB
/
models_HIV_2007.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import pandas
import numpy
from three_way_epistasis import get_next_ordering, ordering_to_fitness, epistasis_positive, epistasis_negative
__author__ = '@gavruskin'
# Gives the probabilities output[i][j] = p_{i, j}, where {i, j} \subset {1, ..., 8} given trial lists using
# p_{i, j} = P(W_i < W_j) = \sum_x P(W_j = x) * P(W_i < x):
def ranking_probabilities(fit_data_list):
output = [[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0]]
for j in range(8):
for i in range(8):
if i != j:
probability_j = 1 / float(len(fit_data_list[j])) # That's P(W_j = x), which doesn't depend on x.
# Now find probability_i = \sum_x P(W_i < x):
count = 0
for x in range(len(fit_data_list[j])): # This is \sum_x
for r in range(len(fit_data_list[i])):
if fit_data_list[i][r] < fit_data_list[j][x]:
count += 1
probability_i = count / float(len(fit_data_list[i]))
# Multiply the running probability by p_{i, j}:
output[i][j] = probability_j * probability_i
return output
# The comparison (competition experiment) model.
# Returns the probability of epistasis given the trial data.
def epistasis_probability_from_comparisons(fit_data_list, threshold_prob):
# Compute probabilities P(W_i < W_j) = p_{i,j}:
p_ij = ranking_probabilities(fit_data_list)
# Loop through all rankings:
ordering = [1, 1, 1, 1, 1, 1, 1, 1]
ranking = [1, 2, 3, 4, 5, 6, 7, 8]
positives = {1, 5, 6, 7}
negatives = {4, 3, 2, 8}
repetitions = [1, 1, 1, 1, 1, 1, 1, 1]
positive_epi_prob = 0
negative_epi_prob = 0
total_prob_mass = 0
positive_rankings = []
negative_rankings = []
non_informative_rankings = []
# Compute the probability of the ranking as \Pi_{i, j} p_{i, j}.
rank_prob = 1
for j in range(len(ranking)):
for i in range(j):
rank_prob *= p_ij[ranking[i] - 1][ranking[j] - 1]
total_prob_mass += rank_prob
if epistasis_positive(ranking, positives, negatives, repetitions):
positive_epi_prob += rank_prob
if rank_prob > threshold_prob:
positive_rankings.append([ranking, rank_prob])
elif epistasis_negative(ranking, positives, negatives, repetitions):
negative_epi_prob += rank_prob
if rank_prob > threshold_prob:
negative_rankings.append([ranking, rank_prob])
elif rank_prob > threshold_prob:
non_informative_rankings.append([ranking, rank_prob])
while ordering != [8, 7, 6, 5, 4, 3, 2, 1]:
ordering = get_next_ordering(ordering)
ranking = ordering_to_fitness(ordering)
rank_prob = 1
for j in range(len(ranking)):
for i in range(j):
rank_prob *= p_ij[ranking[i] - 1][ranking[j] - 1]
total_prob_mass += rank_prob
if epistasis_positive(ranking, positives, negatives, repetitions):
positive_epi_prob += rank_prob
if rank_prob > threshold_prob:
positive_rankings.append([ranking, rank_prob])
elif epistasis_negative(ranking, positives, negatives, repetitions):
negative_epi_prob += rank_prob
if rank_prob > threshold_prob:
negative_rankings.append([ranking, rank_prob])
elif rank_prob > threshold_prob:
non_informative_rankings.append([ranking, rank_prob])
positive_epi_prob /= total_prob_mass
negative_epi_prob /= total_prob_mass
# print "Probability of positive epistasis: " + str(positive_epi_prob)
# print "Probability of negative epistasis: " + str(negative_epi_prob)
top_pos_epi_prob = 0
for i in range(len(positive_rankings)):
top_pos_epi_prob += positive_rankings[i][1]
top_neg_epi_prob = 0
for i in range(len(negative_rankings)):
top_neg_epi_prob += negative_rankings[i][1]
top_non_info_prob = 0
for i in range(len(non_informative_rankings)):
top_non_info_prob += non_informative_rankings[i][1]
print("Threshold probability: " + str(threshold_prob))
print("Top rankings with positive epistasis: " + str(positive_rankings))
print(str(len(positive_rankings)) + " in total")
print("With total probability: " + str(top_pos_epi_prob))
print("Top rankings with negative epistasis: " + str(negative_rankings))
print(str(len(negative_rankings)) + " in total")
print("With total probability: " + str(top_neg_epi_prob))
print("Top non-informative rankings: " + str(non_informative_rankings))
print(str(len(non_informative_rankings)) + " in total")
print("With total probability: " + str(top_non_info_prob) + "\n")
return [[positive_epi_prob, negative_epi_prob],
[positive_rankings, negative_rankings, non_informative_rankings]]
# Returns the probability of positive and negative epistasis from the fitness measurements.
def epistasis_from_values(fit_data_list):
epi_pos = epi_neg = total_count = 0
for w0 in range(len(fit_data_list[0])):
for w1 in range(len(fit_data_list[1])):
for w2 in range(len(fit_data_list[2])):
for w3 in range(len(fit_data_list[3])):
for w4 in range(len(fit_data_list[4])):
for w5 in range(len(fit_data_list[5])):
for w6 in range(len(fit_data_list[6])):
for w7 in range(len(fit_data_list[7])):
total_count += 1
e = (fit_data_list[0][w0] + fit_data_list[4][w4] + fit_data_list[5][w5] +
fit_data_list[6][w6]) - \
(fit_data_list[1][w1] + fit_data_list[2][w2] + fit_data_list[3][w3] +
fit_data_list[7][w7])
if e > 0:
epi_pos += 1
elif e < 0:
epi_neg += 1
epi_pos_prob = epi_pos / float(total_count)
epi_neg_prob = epi_neg / float(total_count)
return [epi_pos_prob, epi_neg_prob, epi_pos, epi_neg, total_count]
# Returns k closest entries to the mean for each component of fit_data_list:
def closest_to_mean(fit_data_list, k, mean_type="mean"):
if mean_type == "mean":
means = [numpy.mean(fit_data_list[j]) for j in range(len(fit_data_list))]
elif mean_type == "median":
means = [numpy.median(fit_data_list[j]) for j in range(len(fit_data_list))]
fit_data_list_copy = []
for r in range(len(fit_data_list)):
copy = [fit_data_list[r][l] for l in range(len(fit_data_list[r]))]
fit_data_list_copy.append(copy)
output = []
for r in range(k):
output_r = []
for l in range(len(means)):
close_to_mean_index = (numpy.abs(fit_data_list_copy[l] - means[l])).argmin()
output_r.append(fit_data_list_copy[l][close_to_mean_index])
del fit_data_list_copy[l][close_to_mean_index]
output.append(output_r)
return output
# Returns the list of lists of fitness values for the eight genotypes corresponding to the hard-coded mutations:
def datafile_hiv_process():
data_file = "2007_HIV_data.csv"
mutations = [["L", "M"], # mutations: L to M, M to V, t to Y
["M", "V"],
["t", "Y"]]
sites = [88, 244, 275] # sites: PRO L90M, RT M184V, RT T215Y
sites = [0] + [1] + sites # This is specific to the data file. Column 0 contains fitness, column 1 names.
values = pandas.read_csv(data_file, usecols=sites)
values.iloc[:, 0] = numpy.log10(values.iloc[:, 0]) # log10 scale of fitness values.
size = len(values.iloc[:, 1])
f000 = []
f001 = []
f010 = []
f100 = []
f011 = []
f101 = []
f110 = []
f111 = []
for m in range(size):
if (values.iloc[m, 2] == mutations[0][0]) & (values.iloc[m, 3] == mutations[1][0]) & \
(values.iloc[m, 4] == mutations[2][0]):
f000.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][0]) & (values.iloc[m, 3] == mutations[1][0]) & \
(values.iloc[m, 4] == mutations[2][1]):
f001.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][0]) & (values.iloc[m, 3] == mutations[1][1]) & \
(values.iloc[m, 4] == mutations[2][0]):
f010.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][1]) & (values.iloc[m, 3] == mutations[1][0]) & \
(values.iloc[m, 4] == mutations[2][0]):
f100.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][0]) & (values.iloc[m, 3] == mutations[1][1]) & \
(values.iloc[m, 4] == mutations[2][1]):
f011.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][1]) & (values.iloc[m, 3] == mutations[1][0]) & \
(values.iloc[m, 4] == mutations[2][1]):
f101.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][1]) & (values.iloc[m, 3] == mutations[1][1]) & \
(values.iloc[m, 4] == mutations[2][0]):
f110.append(values.iloc[m, 0])
elif (values.iloc[m, 2] == mutations[0][1]) & (values.iloc[m, 3] == mutations[1][1]) & \
(values.iloc[m, 4] == mutations[2][1]):
f111.append(values.iloc[m, 0])
return [f000, f001, f010, f100, f011, f101, f110, f111]