-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandom_walk.py
366 lines (314 loc) · 11.6 KB
/
random_walk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
import networkx as nx
import argparse
from pathlib import Path
def parse_arguments():
"""
Process command line arguments.
@return arguments
"""
parser = argparse.ArgumentParser(
description="Random-Walk-with-Restart Path Reconstruction"
)
# add arguments
parser.add_argument(
"--edges_file", type=Path, required=True, help="Path to the edges file"
)
parser.add_argument(
"--prizes_file", type=Path, required=True, help="Path to the prizes file"
)
parser.add_argument(
"--single_source",
type=str,
required=False,
default="1",
help="1 for single-sourced RWR and 0 for source-target RWR (default: 1)",
)
parser.add_argument(
"--damping_factor",
type=float,
required=False,
default=0.85,
help="Select a damping factor between 0 and 1 for the random walk with restart (default: 0.85)",
)
parser.add_argument(
"--selection_function",
type=str,
required=False,
default="min",
help="Select a function to use (min for minimum/sum for sum/avg for average/max for maximum)",
)
parser.add_argument(
"--w",
type=float,
required=False,
default=0.01,
help="Select a lower bound between 0 and 1 for the edge weight (default: 0.01)",
)
parser.add_argument(
"--threshold",
type=float,
required=False,
default=0.0001,
help="Select a threshold value between 0 and 1 for the construction reference (default: 0.001)",
)
parser.add_argument(
"--output_file", type=Path, required=True, help="Path to the output files"
)
return parser.parse_args()
# Utility functions
def generate_nodes_and_edges(edges_file: Path, w: float) -> tuple:
"""
This function is for extracting the nodes and edges from the path to the edges file
"""
nodes = set()
edges = []
total_edges_weight = 0
number_of_edges = 0
with edges_file.open() as edges_f:
for i, line in enumerate(edges_f):
# if the first line is the title, skip it
if i == 0:
continue
line = line.strip()
endpoints = line.split("\t")
if len(endpoints) != 3:
raise ValueError(
f"Edge {line} does not contain 2 nodes separated by a tab and a weight"
)
nodes.add(endpoints[0])
nodes.add(endpoints[1])
total_edges_weight += float(endpoints[2])
number_of_edges += 1
# add the edge to the list of edges
# (node1, node2, weight)
if float(endpoints[2]) > w:
edges.append((endpoints[0], endpoints[1], endpoints[2]))
avg_weight = total_edges_weight / number_of_edges
print(f"Average weight of edges in the network: {avg_weight}")
return nodes, edges
def generate_nodes(nodes_file: Path, node_type: str) -> list:
"""
This function is for generating the nodes from the path to the source/target file
"""
nodes = []
with nodes_file.open() as nodes_f:
for i, line in enumerate(nodes_f):
# if the first line is the title, skip it
if i == 0:
continue
line = line.strip()
endpoints = line.split("\t")
if len(endpoints) != 3:
raise ValueError(f"Node {line} does not contain 1 node, a prize and a type")
if endpoints[2] != node_type:
continue
nodes.append((endpoints[0], endpoints[1]))
return nodes
def generate_graph(nodes: set, edges: list) -> nx.DiGraph:
"""
This function is for generating the graph from the input files (edges/sources/targets)
"""
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_weighted_edges_from(edges)
return G
def generate_personalization_vector(nodes: list) -> dict:
"""
This function is for generating the personalization vector from the source/target file
"""
personalization_vector = {}
# assigning value 1 to the source and target nodes to pass it to the random walk function
for i in nodes:
personalization_vector[i[0]] = float(i[1])
return personalization_vector
def pathway_construction(
G: nx.DiGraph, final_pr: dict, alpha: float, source_node: list, target_node: list
):
"""
Return a subnetwork of G where for each edge of it, both ends are in the subset of interest.
"""
linker_nodes = set()
filtered_nodes = set()
for key in final_pr:
if final_pr[key] > alpha:
linker_nodes.add(key)
source_set = set([i[0] for i in source_node])
target_set = set([i[0] for i in target_node])
filtered_nodes = set(source_set).union(set(target_set)).union(set(linker_nodes))
edgelist = set()
for u in G.edges().keys():
if u[0] in filtered_nodes and u[1] in filtered_nodes:
edgelist.add(u)
return edgelist, filtered_nodes
def generate_output(
G: nx.DiGraph,
final_pr: dict,
threshold: float,
source_node: list,
output_file: Path,
pr: dict,
target_node: list = [],
r_pr: dict = {},
):
"""
This function is for calculating the edge flux and writing the results to the output file.
"""
edgelist, nodelist = pathway_construction(G, final_pr, threshold, source_node, target_node)
edge_sum = {}
for node in G.nodes():
temp = 0
for i in G.out_edges(node):
temp += float(G[node][i[1]]["weight"])
edge_sum[node] = temp
edge_flux = {}
# calculate the edge flux
for edge in G.edges():
edge_flux[edge] = (
[pr[edge[0]] * float(G[edge[0]][edge[1]]["weight"]) / edge_sum[edge[0]], str(edge in edgelist)]
)
sorted_edge_flux = sorted(edge_flux.items(), key=lambda x: x[1], reverse=True)
sorted_final_pr = sorted(final_pr.items(), key=lambda x: x[1], reverse=True)
set_of_nodes = set()
with output_file.open("w") as output_file_f:
output_file_f.write("Node1\tNode2\tEdge Flux\tWeight\tInNetwork\tType\n")
for i in sorted_edge_flux:
output_file_f.write(f"{i[0][0]}\t{i[0][1]}\t{i[1][0]}\t{G[i[0][0]][i[0][1]]['weight']}\t{i[1][1]}\t1\n")
for i in sorted_final_pr:
if r_pr != {}:
output_file_f.write(f"{i[0]}\t{pr[i[0]]}\t{r_pr[i[0]]}\t{i[1]}\t{str(i[0] in nodelist)}\t2\n")
else:
output_file_f.write(f"{i[0]}\t{pr[i[0]]}\tNan\t{i[1]}\t{str(i[0] in nodelist)}\t2\n")
for edge in edgelist:
set_of_nodes.add(edge[0])
set_of_nodes.add(edge[1])
output_file_f.write(
f"{edge[0]}\t{edge[1]}\tNan\tNan\tNan\t3\n"
)
print(f"The number of nodes in the original network is {len(G.nodes())}")
print(f"The number of edges in the original network is {len(G.edges())}")
print(f"The number of nodes in the extracted pathway is {len(set_of_nodes)}")
print(f"The number of edges in the extracted pathway is {len(edgelist)}")
print("Output file generated")
# main algorithm
def random_walk(
edges_file: Path,
prizes_file: Path,
output_file: Path,
single_source: str,
damping_factor: float,
w: float,
selection_function: str,
threshold: float,
):
"""
This function is the main algorithm for random-walk-with-restart path reconstruction.
"""
if not edges_file.exists():
raise OSError(f"Edges file {str(edges_file)} does not exist")
if not prizes_file.exists():
raise OSError(f"Sources file {str(prizes_file)} does not exist")
if single_source != "1" and single_source != "0":
raise ValueError(f"Single source should be either 1 or 0")
if output_file.exists():
print(f"Output files {str(output_file)} (nodes) will be overwritten")
# Create the parent directories for the output file if needed
output_file.parent.mkdir(parents=True, exist_ok=True)
if single_source == "1":
# print a warning here.
print(
"Single source mode (The targets file and the selection function will be ignored)"
)
else:
print("Source-Target mode")
# check if the damping factor is between 0 and 1
if damping_factor < 0 or damping_factor > 1:
raise ValueError(f"Damping factor should be between 0 and 1")
else:
print(f"Damping factor is {damping_factor}")
# check if the selection function is either min, max, sum or avg
if (
selection_function != "min"
and selection_function != "sum"
and selection_function != "max"
and selection_function != "avg"
):
raise ValueError(f"Selection function should be either min, max, sum or avg")
else:
print(f"Selection function is {selection_function}")
if w < 0 or w > 1:
raise ValueError(f"Weight should be between 0 and 1")
else:
print(f"Lower bound for edge weight is {w}")
# check if the threshold is between 0 and 1
if threshold < 0 or threshold > 1:
raise ValueError(f"Threshold should be between 0 and 1")
else:
print(f"Threshold is {threshold}")
# Read the list of sources
nodes, edges = generate_nodes_and_edges(edges_file, w)
G = generate_graph(nodes, edges)
source_node = generate_nodes(prizes_file, 'source')
pr = nx.pagerank(
G,
alpha=damping_factor,
personalization=generate_personalization_vector(source_node),
)
target_node = []
r_pr = {}
if single_source == "0":
# Create the reverse graph
R = G.reverse()
target_node = generate_nodes(prizes_file, 'target')
# Running pagerank on the reverse graph with T as the personalization vector
r_pr = nx.pagerank(
R,
alpha=damping_factor,
personalization=generate_personalization_vector(target_node),
)
final_pr = {}
if single_source == "0":
# Combine the two pageranks with the selection function
if selection_function == "min":
for i in pr:
final_pr[i] = min(pr[i], r_pr[i])
elif selection_function == "sum":
for i in pr:
final_pr[i] = pr[i] + r_pr[i]
elif selection_function == "avg":
for i in pr:
final_pr[i] = (pr[i] + r_pr[i]) / 2
elif selection_function == "max":
for i in pr:
final_pr[i] = max(pr[i], r_pr[i])
else:
final_pr = pr
if single_source == "0":
# Output the results
generate_output(
G, final_pr, threshold, source_node, output_file, pr, target_node, r_pr
)
else:
generate_output(G, final_pr, threshold, source_node, output_file, pr)
def main():
"""
Parse arguments and run pathway reconstruction
"""
args = parse_arguments()
random_walk(
args.edges_file,
args.prizes_file,
args.output_file,
args.single_source,
args.damping_factor,
args.w,
args.selection_function,
args.threshold,
)
if __name__ == "__main__":
main()
"""
test:
python random_walk.py --edges_file input/edges1.txt --prizes_file input/prizes1.txt --output_file output/output1.txt
python random_walk.py --edges_file input/edges1.txt --prizes_file input/prizes1.txt --single_source 0 --damping_factor 0.85 --selection_function min --w 0.000 --threshold 0.01 --output_file output/output1.txt
python random_walk.py --edges_file input/edges1.txt --prizes_file input/prizes1.txt --single_source 1 --damping_factor 0.85 --selection_function min --w 0.000 --threshold 0.01 --output_file output/output1.txt
"""