forked from jdaries/de_id
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathgetBinSizes.py
80 lines (68 loc) · 2.24 KB
/
getBinSizes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os, pickle, sys, csv
def print_bin_size(bin_map):
    """
    Print and return one [span, size] pair per distinct span found in bin_map.

    For every entry of bin_map, element 0 is taken as the span and element 2 as the value
    reported for it (presumably the bin size). When several entries share a span, the value
    from the first one encountered is kept.
    :param bin_map: mapping whose values are indexable with at least three elements
    :return: list of [span, value] lists, ordered by sorted span
    """
    size_by_span = {}
    for entry in bin_map.values():
        span = entry[0]
        # Only the first entry seen for a span contributes its value.
        if span not in size_by_span:
            size_by_span[span] = entry[2]

    ordered = []
    for span in sorted(size_by_span):
        size = size_by_span[span]
        print (span, size)
        ordered.append([span, size])
    return ordered
def build_bin_size_list(bin_map):
    """
    Build a list of [span, size] pairs, one per distinct span found in bin_map.

    Keys are visited in sorted order, except that an empty-string key, when present, is
    always visited first. For each key, element 0 of its entry is the span and element 2
    is the value reported for it (presumably the bin size); a span is emitted only the
    first time it is seen.
    :param bin_map: mapping whose values are indexable with at least three elements;
        may be empty
    :return: list of [span, value] lists in key-visit order
    """
    # The blank key is pulled out before sorting so it can lead the list and so that
    # it never has to be compared against non-string keys (a TypeError on Python 3).
    blank_first = [''] if '' in bin_map else []
    # sorted() accepts any iterable; a dict view has no .sort() method on Python 3.
    ordered_keys = blank_first + sorted(k for k in bin_map if k != '')

    bin_list = []
    seen_spans = set()
    for key in ordered_keys:
        entry = bin_map[key]
        span = entry[0]
        if span not in seen_spans:
            seen_spans.add(span)
            bin_list.append([span, entry[2]])
    return bin_list
def shorten_name(fname):
    """
    Extract the quasi-identifier name embedded in a bin file name.

    The name is expected to look like ``f_<identifier>_<anything else>``: the first two
    characters are skipped and everything up to the next underscore is returned. The one
    exception is a name beginning with ``yob``, which has no ``f_`` prefix and is returned
    as ``'yob'``. If no underscore follows the two-character prefix, the result is the
    empty string.
    :param fname: the file name from which to extract the quasi-identifier name
    :return: the name of the quasi-identifier
    """
    if fname.startswith('yob'):
        return 'yob'
    # Drop the two-character prefix, then cut at the first underscore that follows it.
    identifier, underscore, _rest = fname[2:].partition('_')
    if not underscore:
        return ''
    return identifier
def store_bin_size(bin_name, store_file, bin_list):
    """
    Write one named section of bin data through a row writer.

    A single-cell row holding bin_name is written first, followed by each element of
    bin_list as its own row.
    :param bin_name: label written alone on the first row of the section
    :param store_file: object exposing writerow(row), such as a csv writer
    :param bin_list: iterable of rows to write after the label row
    :return: None
    """
    emit_row = store_file.writerow
    emit_row([bin_name])
    for row in bin_list:
        emit_row(row)
if __name__ == '__main__':
    # Script entry point: write the bin sizes of every pickled bin map to one CSV file.
    # argv[1] is the output CSV; any further arguments are the bin files to read.
    if len(sys.argv) < 2:
        print ('Usage: python getBinSizes.py outFile.csv {binfiles}')
        sys.exit(1)
    out_file_name = sys.argv[1]

    # With no bin files named on the command line, scan the current directory instead.
    if len(sys.argv) < 3:
        f_list = os.listdir('.')
    else:
        f_list = sys.argv[2:]

    # Both files are opened with `with` so they are closed even if unpickling or
    # writing raises part-way through.
    # NOTE(review): on Python 3 the csv docs recommend opening the file with newline=''
    # to avoid blank rows on Windows; the mode is left as 'w' here to stay compatible
    # with either interpreter.
    with open(out_file_name, 'w') as out_f:
        out_c = csv.writer(out_f)
        for f in f_list:
            # Only names containing '.pkl' are treated as pickled bin maps.
            if '.pkl' not in f:
                continue
            # NOTE: pickle.load can execute arbitrary code; only run this on trusted files.
            with open(f, 'rb') as f_in:
                bin_map = pickle.load(f_in)
            s_list = build_bin_size_list(bin_map)
            # Each section is labelled with the full file name; shorten_name(f) is
            # available if a shorter label is wanted.
            store_bin_size(f, out_c, s_list)