-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsae_interp.py
197 lines (184 loc) · 9.8 KB
/
sae_interp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
from ioi_utils import *
from sae_variants import VanillaAutoEncoder, GatedAutoEncoder, AttributionAutoEncoder
from circuit_utils import FEATURE_SIZES
################################################################################
### some utils for faster interp computations
################################################################################
def get_feature_mask(feature: Tuple[str,...],
                     prompt_feature_idxs: Dict[str, List[Tuple[int,...]]],
                     ) -> Tensor:
    """
    Build a 0/1 mask marking which attribute value(s) each example carries.

    Args:
    - feature: tuple of attribute names; each name is looked up in
      `FEATURE_SIZES` to size the corresponding mask dimension. The special
      value `('name_anywhere',)` marks both the `('io',)` and the `('s',)`
      index of every example in a single mask.
    - prompt_feature_idxs: maps attribute tuples to per-example indices. The
      code indexes the values with `[:, 0]` / `[:, 1]`, so they are presumably
      2-D arrays/tensors of shape (num_examples, len(feature)) rather than the
      lists the annotation suggests -- TODO confirm against the caller.

    Returns:
    - a tensor of shape (num_examples, *feature_shape), moved to the CUDA
      device, with ones at the indexed positions and zeros elsewhere.

    Raises:
    - NotImplementedError for attribute tuples that do not have exactly one or
      two components.
    """
    if feature == ('name_anywhere',):
        assert ('io',) in prompt_feature_idxs
        assert ('s',) in prompt_feature_idxs
        io_idxs = prompt_feature_idxs[('io',)]
        s_idxs = prompt_feature_idxs[('s',)]
        n_rows = io_idxs.shape[0]
        rows = range(n_rows)
        mask = torch.zeros(n_rows, FEATURE_SIZES[feature[0]]).cuda()
        # both names of an example are switched on in the same mask
        mask[rows, io_idxs[:, 0]] = 1
        mask[rows, s_idxs[:, 0]] = 1
        # NOTE(review): `sess` is not defined in this file (presumably it comes
        # from the `ioi_utils` star import) and this looks like a leftover
        # debugging hook -- confirm and remove if so.
        sess.d()
        return mask

    idxs = prompt_feature_idxs[feature]
    dims = tuple(FEATURE_SIZES[name] for name in feature)
    n_rows = len(idxs)
    mask = torch.zeros(n_rows, *dims).cuda()
    arity = len(feature)
    if arity not in (1, 2):
        raise NotImplementedError
    # index with range(n_rows) along the first dimension and with one column of
    # `idxs` per remaining dimension
    position = (range(n_rows), *(idxs[:, d] for d in range(arity)))
    mask[position] = 1
    return mask
def get_feature_scores(activation_pattern: Tensor, feature_mask: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Precision, recall and F1 of every hidden unit against every attribute value.

    Args:
    - activation_pattern: tensor of shape (batch, hidden); presumably a 0/1
      indicator of which hidden units fire on which example -- TODO confirm.
    - feature_mask: tensor of shape (batch, *feature_shape); presumably a 0/1
      indicator such as the one produced by `get_feature_mask`.

    Returns:
    - (precision, recall, f1), each of shape (hidden, *feature_shape).

    No epsilon is added to any denominator, so entries whose denominator is
    zero come out as NaN/inf; callers are expected to handle those.
    """
    # co-occurrence totals between hidden units and attribute values
    overlap = einsum("batch hidden, batch ... -> hidden ...", activation_pattern, feature_mask)
    value_totals = feature_mask.sum(dim=0)
    # append one singleton axis per attribute dimension so that the per-unit
    # totals broadcast against `overlap`
    n_trailing = feature_mask.dim() - 1
    unit_totals = activation_pattern.sum(dim=0)[(..., *([None] * n_trailing))]
    precision = overlap / unit_totals
    recall = overlap / value_totals
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1
def multidim_argmax(t: Tensor, ) -> Tuple[Tuple[Tensor, ...], Tensor]:
    """
    Argmax over all dimensions but the first, treating NaN as -inf.

    Args:
    - t: tensor of shape (n, d_1, ..., d_m) with m >= 1. It is not modified.

    Returns:
    - a tuple of m integer tensors of shape (n,): the coordinates of the
      maximum of each row along d_1, ..., d_m
    - a tensor of shape (n,) with the maximum values (taken after NaNs have
      been replaced, so an all-NaN row yields -inf)

    Raises:
    - NotImplementedError if `t` has fewer than two dimensions.
    """
    if t.dim() < 2:
        raise NotImplementedError
    scores = t.clone()
    # set the nan values to -inf so they can never win the argmax
    scores[torch.isnan(scores)] = float('-inf')
    # `reshape` rather than `view`: `clone` keeps the strides of its input, so
    # a permuted/non-contiguous `t` would make `view` raise
    flat = scores.reshape(scores.shape[0], -1)
    flat_argmax = flat.argmax(dim=1)
    best_values = flat.gather(1, flat_argmax.unsqueeze(1)).squeeze(1)
    # unravel the flat index into one coordinate per trailing dimension,
    # peeling off the fastest-varying (last) dimension first
    coords = []
    remainder = flat_argmax
    for size in reversed(t.shape[1:]):
        coords.append(remainder % size)
        remainder = remainder // size
    return tuple(reversed(coords)), best_values
def multidim_topk(t: Tensor, k: int) -> Tuple[Tensor, ...]:
    """
    Top-k over all dimensions but the first, treating NaN as -inf.

    Args:
    - t: tensor of shape (n, d_1, ..., d_m) with m >= 1. It is not modified.
    - k: number of entries to select per row.

    Returns:
    - a tuple of m integer tensors of shape (n, k): the coordinates along
      d_1, ..., d_m of the k largest entries of each row, in the order
      produced by `torch.topk`.

    Raises:
    - NotImplementedError if `t` has fewer than two dimensions.
    """
    if t.dim() < 2:
        raise NotImplementedError
    scores = t.clone()
    # set the nan values to -inf so they rank last
    scores[torch.isnan(scores)] = float('-inf')
    # `reshape` rather than `view`: `clone` keeps the strides of its input, so
    # a permuted/non-contiguous `t` would make `view` raise
    flat_topk = scores.reshape(scores.shape[0], -1).topk(k, dim=1).indices
    # unravel the flat indices into one coordinate tensor per trailing
    # dimension, peeling off the fastest-varying (last) dimension first
    coords = []
    remainder = flat_topk
    for size in reversed(t.shape[1:]):
        coords.append(remainder % size)
        remainder = remainder // size
    return tuple(reversed(coords))
################################################################################
### main ops
################################################################################
@op(__allow_side_effects__=True) # again, because of non-deterministic nn.Module hashes
def get_high_f1_features(
    encoder: Union[VanillaAutoEncoder, GatedAutoEncoder, AttributionAutoEncoder],
    attributes: List[Tuple[str,...]],
    prompt_feature_idxs: Any,
    A_normalized: Tensor, # must be normalized for the encoder
    topk: int,
    ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]:
    """
    Given some attributes, find the top features wrt these attributes based on
    the F1 score; return the features together with their scores.

    Args:
    - encoder: autoencoder whose `get_activation_pattern(A=...)` output is
      cast to float and scored against each attribute mask
    - attributes: attribute tuples, each passed to `get_feature_mask`
    - prompt_feature_idxs: per-example attribute indices, forwarded unchanged
      to `get_feature_mask`
    - A_normalized: activations fed to the encoder (must already be normalized
      for it)
    - topk: number of features to keep per attribute value

    Returns:
    - {attr: tensor of shape (*attr_shape, topk) of the top features (ordered)}
    - {attr: tensor of shape (*attr_shape, topk) of the top F1 scores (same order)}

    Features whose F1 is NaN are ranked as -inf, so they are selected only
    when fewer than `topk` features have a defined score.
    """
    activation_pattern = encoder.get_activation_pattern(A=A_normalized).float()
    features = {}
    top_f1_scores = {}
    for attr in attributes:
        mask = get_feature_mask(attr, prompt_feature_idxs=prompt_feature_idxs) # (num_examples, *attr_shape)
        f1 = get_feature_scores(activation_pattern, mask)[2] # (num_features, *attr_shape)
        # NaN scores must never outrank real ones
        f1 = f1.masked_fill(torch.isnan(f1), float('-inf'))
        best = torch.topk(f1, k=topk, dim=0) # (topk, *attr_shape)
        # Move the topk axis last. `.T` did this only for 2-D tensors: on
        # higher-rank tensors it reverses *all* axes (and is deprecated),
        # which breaks the documented (*attr_shape, topk) layout.
        features[attr] = best.indices.movedim(0, -1)
        top_f1_scores[attr] = best.values.movedim(0, -1)
    return features, top_f1_scores
def _autointerp_group_f1(activation_pattern: Tensor, mask: Tensor, topk_indices: Tuple[Tensor, ...]) -> Tensor:
    """F1 of each hidden unit against the accumulated masks of its top-1, top-2, ..., top-k attribute values."""
    if len(topk_indices) == 0:
        raise NotImplementedError
    # (batch, *attr_shape) indexed with index tensors of shape (hidden, k)
    # -> (batch, hidden, k); the cumulative sum along k makes entry j the sum
    # of the masks of the j+1 best attribute values of that hidden unit
    subgroup_mask = torch.cumsum(mask[(slice(None), *topk_indices)], dim=2)
    group_intersections = einsum("batch hidden, batch hidden group_size -> hidden group_size", activation_pattern, subgroup_mask)
    recall = group_intersections / subgroup_mask.sum(dim=0)
    precision = group_intersections / activation_pattern.sum(dim=0).unsqueeze(-1)
    return 2 * precision * recall / (precision + recall + 1e-8)


def _autointerp_merge_batches(index_dicts: List[Any], score_dicts: List[Any], keys: List[Tuple[str,...]]) -> Tuple[Any, Any]:
    """Concatenate per-batch index tuples and score tensors along the hidden-unit axis."""
    merged_indices = {}
    merged_scores = {}
    for f in keys:
        arity = len(index_dicts[0][f])
        merged_indices[f] = tuple(
            torch.cat([d[f][i] for d in index_dicts], dim=0) for i in range(arity)
        )
        # NOTE(review): the scores are wrapped in a tuple holding one copy of
        # the same concatenated tensor per index component. This mirrors the
        # historical batched output and is kept for backward compatibility,
        # even though the unbatched path returns bare tensors.
        merged_scores[f] = tuple(
            torch.cat([d[f] for d in score_dicts], dim=0) for _ in range(arity)
        )
    return merged_indices, merged_scores


@op(__allow_side_effects__=True) # again, because of non-deterministic nn.Module hashes
def autointerp_fast(
    encoder: Union[VanillaAutoEncoder, GatedAutoEncoder, AttributionAutoEncoder],
    features: List[Tuple[str,...]],
    prompt_feature_idxs: Any,
    A_normalized: Tensor,
    features_to_group: List[Tuple[str,...]],
    feature_batch_size: Optional[int] = None,
    max_group_size: int = 10,
    ) -> Tuple[Any, Any, Any, Any]:
    """
    Score features according to the F1 score, with greedy search over subsets of
    up to `max_group_size` attribute values.

    Args:
    - encoder: autoencoder whose `get_activation_pattern(A=...)` output is
      cast to float and scored
    - features: attribute tuples to score; each is passed to `get_feature_mask`
    - prompt_feature_idxs: per-example attribute indices, forwarded unchanged
      to `get_feature_mask`
    - A_normalized: activations fed to the encoder
    - features_to_group: attribute tuples for which group scores are also
      computed; every entry must also appear in `features`
    - feature_batch_size: if given, the hidden units are processed in slices
      of this size and the per-slice results are concatenated
    - max_group_size: number of top attribute values considered per hidden unit

    Returns (each a dict keyed by attribute tuple):
    - top_features: tuple of index tensors (one per attribute dimension) with
      the best attribute value of every hidden unit
    - top_scores: the corresponding best F1 scores
    - group_features: tuple of index tensors with the `max_group_size` best
      attribute values of every hidden unit
    - group_scores: F1 of every hidden unit against its top-1, top-2, ...
      attribute values taken together

    When `feature_batch_size` is None the score entries are bare tensors; when
    it is set they are tuples of tensors (see `_autointerp_merge_batches`).
    """
    full_pattern = encoder.get_activation_pattern(A=A_normalized).float()
    n_features = full_pattern.shape[1]
    masks = {f: get_feature_mask(f, prompt_feature_idxs=prompt_feature_idxs) for f in features} # f -> (num_examples, *feature_shape)
    if feature_batch_size is None:
        pattern_batches = [full_pattern]
    else:
        pattern_batches = [full_pattern[:, i:i+feature_batch_size] for i in range(0, n_features, feature_batch_size)]

    top_features_list = []
    top_scores_list = []
    group_features_list = []
    group_scores_list = []
    for pattern_batch in pattern_batches:
        scores = {f: get_feature_scores(pattern_batch, masks[f]) for f in features}
        # collect the best features according to F1 score
        best = {f: multidim_argmax(scores[f][2]) for f in features}
        top_features_list.append({f: v[0] for f, v in best.items()})
        top_scores_list.append({f: v[1] for f, v in best.items()})
        # now, for the features where we look for groups, compute the scores for the
        # top 1, 2, ..., `max_group_size` elements
        group_features = {}
        group_scores = {}
        for f in features_to_group:
            topk_indices = multidim_topk(scores[f][2], max_group_size)
            group_features[f] = topk_indices
            group_scores[f] = _autointerp_group_f1(pattern_batch, masks[f], topk_indices)
        group_features_list.append(group_features)
        group_scores_list.append(group_scores)

    if feature_batch_size is None:
        return top_features_list[0], top_scores_list[0], group_features_list[0], group_scores_list[0]

    # build fresh dicts instead of overwriting the last batch's entries in place
    top_features, top_scores = _autointerp_merge_batches(top_features_list, top_scores_list, features)
    group_features, group_scores = _autointerp_merge_batches(group_features_list, group_scores_list, features_to_group)
    return top_features, top_scores, group_features, group_scores