test(e2e): add in chaos E2E test suite (#61)
* add in chaos E2E test suite

* update imports

* change to corev1beta1

---------

Co-authored-by: Charlie McBride <[email protected]>
Co-authored-by: Rachel Gregory <[email protected]>
3 people authored Mar 25, 2024
1 parent 2ba7dc3 commit 64e5128
Showing 2 changed files with 229 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/e2e-matrix.yaml
@@ -43,7 +43,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        suite: [Nonbehavioral, Utilization, GPU, Drift, Integration, NodeClaim]
+        suite: [Nonbehavioral, Utilization, GPU, Drift, Integration, NodeClaim, Chaos]
     permissions:
       contents: read
       id-token: write
228 changes: 228 additions & 0 deletions test/suites/chaos/suite_test.go
@@ -0,0 +1,228 @@
/*
Portions Copyright (c) Microsoft Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package chaos

import (
	"context"
	"fmt"
	"sync/atomic"
	"testing"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	"github.com/samber/lo"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
	controllerruntime "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	"github.com/Azure/karpenter-provider-azure/test/pkg/debug"
	"github.com/Azure/karpenter-provider-azure/test/pkg/environment/azure"
	corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"
	"sigs.k8s.io/karpenter/pkg/test"
	nodeutils "sigs.k8s.io/karpenter/pkg/utils/node"
)

var env *azure.Environment

func TestChaos(t *testing.T) {
	RegisterFailHandler(Fail)
	BeforeSuite(func() {
		env = azure.NewEnvironment(t)
	})
	RunSpecs(t, "Chaos")
}

var _ = BeforeEach(func() { env.BeforeEach() })
var _ = AfterEach(func() { env.Cleanup() })
var _ = AfterEach(func() { env.AfterEach() })

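// Each chaos scenario below races Karpenter against a hostile controller: the taint adder
// applies a NoExecute taint to every test node as soon as it appears, evicting the workload
// and forcing reprovisioning, while the assertion verifies that the disruption policy under
// test keeps the total node count bounded instead of scaling up indefinitely.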
var _ = Describe("Chaos", func() {
	Describe("Runaway Scale-Up", func() {
		It("should not produce a runaway scale-up when consolidation is enabled", Label(debug.NoWatch), Label(debug.NoEvents), func() {
			ctx, cancel := context.WithCancel(env.Context)
			defer cancel()

			nodeClass := env.DefaultAKSNodeClass()
			nodePool := env.DefaultNodePool(nodeClass)
			test.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
				Key:      corev1beta1.CapacityTypeLabelKey,
				Operator: v1.NodeSelectorOpIn,
				Values:   []string{corev1beta1.CapacityTypeSpot},
			})
			nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenUnderutilized
			nodePool.Spec.Disruption.ConsolidateAfter = nil

			numPods := 1
			dep := test.Deployment(test.DeploymentOptions{
				Replicas: int32(numPods),
				PodOptions: test.PodOptions{
					Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6",
					ObjectMeta: metav1.ObjectMeta{
						Labels: map[string]string{"app": "my-app"},
					},
					TerminationGracePeriodSeconds: lo.ToPtr[int64](0),
				},
			})
			// Start a controller that adds taints to nodes after creation
			Expect(startTaintAdder(ctx, env.Config)).To(Succeed())
			startNodeCountMonitor(ctx, env.Client)

			// Create a deployment with a single pod
			env.ExpectCreated(nodeClass, nodePool, dep)

			// Expect that we never get over a high number of nodes
			Consistently(func(g Gomega) {
				list := &v1.NodeList{}
				g.Expect(env.Client.List(env.Context, list, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
				g.Expect(len(list.Items)).To(BeNumerically("<", 35))
			}, time.Minute*5).Should(Succeed())
		})
		It("should not produce a runaway scale-up when ttlSecondsAfterEmpty is enabled", Label(debug.NoWatch), Label(debug.NoEvents), func() {
			ctx, cancel := context.WithCancel(env.Context)
			defer cancel()

			nodeClass := env.DefaultAKSNodeClass()
			nodePool := env.DefaultNodePool(nodeClass)
			test.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
				Key:      corev1beta1.CapacityTypeLabelKey,
				Operator: v1.NodeSelectorOpIn,
				Values:   []string{corev1beta1.CapacityTypeSpot},
			})
			nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenEmpty
			nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{Duration: lo.ToPtr(30 * time.Second)}

			numPods := 1
			dep := test.Deployment(test.DeploymentOptions{
				Replicas: int32(numPods),
				PodOptions: test.PodOptions{
					Image: "mcr.microsoft.com/oss/kubernetes/pause:3.6",
					ObjectMeta: metav1.ObjectMeta{
						Labels: map[string]string{"app": "my-app"},
					},
					TerminationGracePeriodSeconds: lo.ToPtr[int64](0),
				},
			})
			// Start a controller that adds taints to nodes after creation
			Expect(startTaintAdder(ctx, env.Config)).To(Succeed())
			startNodeCountMonitor(ctx, env.Client)

			// Create a deployment with a single pod
			env.ExpectCreated(nodeClass, nodePool, dep)

			// Expect that we never get over a high number of nodes
			Consistently(func(g Gomega) {
				list := &v1.NodeList{}
				g.Expect(env.Client.List(env.Context, list, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
				g.Expect(len(list.Items)).To(BeNumerically("<", 35))
			}, time.Minute*5).Should(Succeed())
		})
	})
})

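// taintAdder is a small controller-runtime reconciler used only by this suite: it adds a
// NoExecute taint to every node carrying the test discovery label, evicting the test pods
// and simulating the churn that could otherwise trigger a runaway scale-up.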
type taintAdder struct {
	kubeClient client.Client
}

func (t *taintAdder) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	node := &v1.Node{}
	if err := t.kubeClient.Get(ctx, req.NamespacedName, node); err != nil {
		return reconcile.Result{}, client.IgnoreNotFound(err)
	}
	mergeFrom := client.MergeFrom(node.DeepCopy())
	taint := v1.Taint{
		Key:    "test",
		Value:  "true",
		Effect: v1.TaintEffectNoExecute,
	}
	if !lo.Contains(node.Spec.Taints, taint) {
		node.Spec.Taints = append(node.Spec.Taints, taint)
		if err := t.kubeClient.Patch(ctx, node, mergeFrom); err != nil {
			return reconcile.Result{}, err
		}
	}
	return reconcile.Result{}, nil
}

func (t *taintAdder) Builder(mgr manager.Manager) *controllerruntime.Builder {
	return controllerruntime.NewControllerManagedBy(mgr).
		For(&v1.Node{}).
		WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool {
			node := obj.(*v1.Node)
			if _, ok := node.Labels[test.DiscoveryLabel]; !ok {
				return false
			}
			return true
		}))
}

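// startTaintAdder wires the taintAdder reconciler into a controller-runtime manager and
// runs it in the background for the lifetime of the test context.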
func startTaintAdder(ctx context.Context, config *rest.Config) error {
	mgr, err := controllerruntime.NewManager(config, controllerruntime.Options{})
	if err != nil {
		return err
	}
	adder := &taintAdder{kubeClient: mgr.GetClient()}
	if err = adder.Builder(mgr).Complete(adder); err != nil {
		return err
	}
	go func() {
		Expect(mgr.Start(ctx)).To(Succeed())
	}()
	return nil
}

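// startNodeCountMonitor counts node add/delete events for NodePool-labeled nodes via a
// shared informer and periodically logs current, ready, created, and deleted node counts
// so a runaway scale-up is visible in the test output.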
func startNodeCountMonitor(ctx context.Context, kubeClient client.Client) {
	createdNodes := atomic.Int64{}
	deletedNodes := atomic.Int64{}

	factory := informers.NewSharedInformerFactoryWithOptions(env.KubeClient, time.Second*30,
		informers.WithTweakListOptions(func(l *metav1.ListOptions) { l.LabelSelector = corev1beta1.NodePoolLabelKey }))
	nodeInformer := factory.Core().V1().Nodes().Informer()
	_, err := nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(_ interface{}) {
			createdNodes.Add(1)
		},
		DeleteFunc: func(_ interface{}) {
			deletedNodes.Add(1)
		},
	})
	Expect(err).ToNot(HaveOccurred())
	factory.Start(ctx.Done())
	go func() {
		for {
			list := &v1.NodeList{}
			if err := kubeClient.List(ctx, list, client.HasLabels{test.DiscoveryLabel}); err == nil {
				readyCount := lo.CountBy(list.Items, func(n v1.Node) bool {
					return nodeutils.GetCondition(&n, v1.NodeReady).Status == v1.ConditionTrue
				})
				fmt.Printf("[NODE COUNT] CURRENT: %d | READY: %d | CREATED: %d | DELETED: %d\n", len(list.Items), readyCount, createdNodes.Load(), deletedNodes.Load())
			}
			select {
			case <-ctx.Done():
				return
			case <-time.After(time.Second * 5):
			}
		}
	}()
}
