From 123af2c9de35832890fdf046dd7c6bacb5aa8d8a Mon Sep 17 00:00:00 2001 From: Farhan Ahmed Date: Tue, 14 Nov 2023 12:04:50 -0800 Subject: [PATCH 1/3] flatten activations for poisoning defenses Signed-off-by: Farhan Ahmed --- art/defences/detector/poison/activation_defence.py | 4 +++- art/defences/detector/poison/spectral_signature_defense.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/art/defences/detector/poison/activation_defence.py b/art/defences/detector/poison/activation_defence.py index 9b53235bf4..45b09d0e4d 100644 --- a/art/defences/detector/poison/activation_defence.py +++ b/art/defences/detector/poison/activation_defence.py @@ -695,7 +695,9 @@ def _get_activations(self, x_train: Optional[np.ndarray] = None) -> np.ndarray: # wrong way to get activations activations = self.classifier.predict(self.x_train) if isinstance(activations, np.ndarray): - nodes_last_layer = np.shape(activations)[1] + # flatten activations across batch + activations = np.reshape(activations, (activations.shape[0], -1)) + nodes_last_layer = activations.shape[1] else: raise ValueError("activations is None or tensor.") diff --git a/art/defences/detector/poison/spectral_signature_defense.py b/art/defences/detector/poison/spectral_signature_defense.py index 69109f2d61..8fd44a3200 100644 --- a/art/defences/detector/poison/spectral_signature_defense.py +++ b/art/defences/detector/poison/spectral_signature_defense.py @@ -121,6 +121,8 @@ def detect_poison(self, **kwargs) -> Tuple[dict, List[int]]: raise ValueError("Wrong type detected.") if features_x_poisoned is not None: + # flatten activations across batch + features_x_poisoned = np.reshape(features_x_poisoned, (features_x_poisoned.shape[0], -1)) features_split = segment_by_class(features_x_poisoned, self.y_train, self.classifier.nb_classes) else: raise ValueError("Activation are `None`.") From 4db76267ccac573c81086e8b53accd55379606db Mon Sep 17 00:00:00 2001 From: Farhan Ahmed Date: Tue, 14 Nov 2023 12:05:12 -0800 Subject: [PATCH 2/3] remove huggingface estimator activation hack Signed-off-by: Farhan Ahmed --- art/estimators/classification/hugging_face.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/art/estimators/classification/hugging_face.py b/art/estimators/classification/hugging_face.py index 33a9ce18e0..d029df1b35 100644 --- a/art/estimators/classification/hugging_face.py +++ b/art/estimators/classification/hugging_face.py @@ -318,11 +318,7 @@ def get_activations( # type: ignore def get_feature(name): # the hook signature def hook(model, input, output): # pylint: disable=W0622,W0613 - # TODO: this is using the input, rather than the output, to circumvent the fact - # TODO: that flatten is not a layer in pytorch, and the activation defence expects - # TODO: a flattened input. A better option is to refactor the activation defence - # TODO: to not crash if non 2D inputs are provided. - self._features[name] = input + self._features[name] = output return hook From d345786e297347ecc161ca176bc969aee9a2bc03 Mon Sep 17 00:00:00 2001 From: Farhan Ahmed Date: Thu, 7 Dec 2023 16:21:38 -0800 Subject: [PATCH 3/3] Revert "remove huggingface estimator activation hack" This reverts commit 4db76267ccac573c81086e8b53accd55379606db. Signed-off-by: Farhan Ahmed --- art/estimators/classification/hugging_face.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/art/estimators/classification/hugging_face.py b/art/estimators/classification/hugging_face.py index d029df1b35..33a9ce18e0 100644 --- a/art/estimators/classification/hugging_face.py +++ b/art/estimators/classification/hugging_face.py @@ -318,7 +318,11 @@ def get_activations( # type: ignore def get_feature(name): # the hook signature def hook(model, input, output): # pylint: disable=W0622,W0613 - self._features[name] = output + # TODO: this is using the input, rather than the output, to circumvent the fact + # TODO: that flatten is not a layer in pytorch, and the activation defence expects + # TODO: a flattened input. A better option is to refactor the activation defence + # TODO: to not crash if non 2D inputs are provided. + self._features[name] = input return hook