diff --git a/.github/workflows/approval-comment.yaml b/.github/workflows/approval-comment.yaml index 20c680785..f78d62b56 100644 --- a/.github/workflows/approval-comment.yaml +++ b/.github/workflows/approval-comment.yaml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true disable-sudo: true @@ -30,7 +30,7 @@ jobs: mkdir -p /tmp/artifacts { echo ${{ github.event.pull_request.number }}; echo ${{ github.event.review.commit_id }}; } >> /tmp/artifacts/metadata.txt cat /tmp/artifacts/metadata.txt - - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: artifacts path: /tmp/artifacts diff --git a/.github/workflows/build-publish-mcr.yml b/.github/workflows/build-publish-mcr.yml index 61d29a087..1aaf5f7ff 100644 --- a/.github/workflows/build-publish-mcr.yml +++ b/.github/workflows/build-publish-mcr.yml @@ -23,7 +23,7 @@ jobs: labels: [self-hosted, "1ES.Pool=${{ vars.RELEASE_1ES_POOL }}"] steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: egress-policy: audit diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index b63ae1b5f..b9b4336da 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -19,7 +19,7 @@ jobs: K8S_VERSION: ${{ matrix.k8sVersion }} steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true egress-policy: block diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 63326b666..755af2b50 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true egress-policy: block diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 98dc425a3..991223df3 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -26,7 +26,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true egress-policy: block @@ -46,8 +46,8 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: ./.github/actions/install-deps - run: make vulncheck - - uses: github/codeql-action/init@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + - uses: github/codeql-action/init@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: languages: ${{ matrix.language }} - - uses: github/codeql-action/autobuild@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 - - uses: github/codeql-action/analyze@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + - uses: github/codeql-action/autobuild@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 + - uses: github/codeql-action/analyze@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 diff --git a/.github/workflows/deflake.yml b/.github/workflows/deflake.yml index f97928dba..fc2e09084 100644 --- a/.github/workflows/deflake.yml +++ b/.github/workflows/deflake.yml @@ -14,7 +14,7 @@ jobs: statuses: write steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true egress-policy: block diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index e97e8c207..6f49fc692 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true disable-sudo: true diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index a854583e9..fd43bb889 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -29,7 +29,7 @@ jobs: E2E_HASH: ${{ steps.generate-e2e-run-hash.outputs.E2E_HASH }} steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true disable-sudo: true diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 0cfdac413..bbff09ad8 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -45,7 +45,7 @@ jobs: AZURE_SUBSCRIPTION_ID: ${{ secrets.E2E_SUBSCRIPTION_ID }} steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true egress-policy: block diff --git a/.github/workflows/release-trigger.yaml b/.github/workflows/release-trigger.yaml index 6b227b4f5..1ee240e36 100644 --- a/.github/workflows/release-trigger.yaml +++ b/.github/workflows/release-trigger.yaml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-telemetry: true disable-sudo: true diff --git a/.github/workflows/resolve-args.yaml b/.github/workflows/resolve-args.yaml index 8588f8e32..d992176d6 100644 --- a/.github/workflows/resolve-args.yaml +++ b/.github/workflows/resolve-args.yaml @@ -16,7 +16,7 @@ jobs: steps: # Download the artifact and resolve the GIT_REF - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-sudo: true disable-telemetry: true diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 9439f8681..0171bc9af 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -31,7 +31,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: disable-sudo: true disable-telemetry: true @@ -82,7 +82,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: SARIF file path: results.sarif @@ -90,6 +90,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 + uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1 with: sarif_file: results.sarif diff --git a/charts/karpenter-crd/templates/karpenter.azure.com_aksnodeclasses.yaml b/charts/karpenter-crd/templates/karpenter.azure.com_aksnodeclasses.yaml index db6f5f00a..6311bd8bf 100644 --- a/charts/karpenter-crd/templates/karpenter.azure.com_aksnodeclasses.yaml +++ b/charts/karpenter-crd/templates/karpenter.azure.com_aksnodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.0 + controller-gen.kubebuilder.io/version: v0.17.1 name: aksnodeclasses.karpenter.azure.com spec: group: karpenter.azure.com diff --git a/karpenter-values-template.yaml b/karpenter-values-template.yaml index dcbbeddfc..c5124877c 100644 --- a/karpenter-values-template.yaml +++ b/karpenter-values-template.yaml @@ -2,8 +2,8 @@ replicas: 1 # for better debugging experience controller: env: - - name: LEADER_ELECT # disable leader election for better debugging / troubleshooting experience - value: "false" + - name: DISABLE_LEADER_ELECTION # disable leader election for better debugging / troubleshooting experience + value: "true" # disable HTTP/2 to reduce ARM throttling on large-scale tests; # with this in place write (and read) QPS can be increased too #- name: GODEBUG @@ -37,12 +37,12 @@ controller: value: "" - name: AZURE_NODE_RESOURCE_GROUP value: ${AZURE_RESOURCE_GROUP_MC} - - # managed karpenter settings + + # managed karpenter settings - name: USE_SIG value: "false" - name: SIG_SUBSCRIPTION_ID - value: "" + value: "" serviceAccount: name: ${KARPENTER_SERVICE_ACCOUNT_NAME} annotations: diff --git a/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml b/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml index db6f5f00a..6311bd8bf 100644 --- a/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml +++ b/pkg/apis/crds/karpenter.azure.com_aksnodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.0 + controller-gen.kubebuilder.io/version: v0.17.1 name: aksnodeclasses.karpenter.azure.com spec: group: karpenter.azure.com diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index b9c9764b3..6ffaee0fb 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -328,11 +328,9 @@ func (c *CloudProvider) instanceToNodeClaim(ctx context.Context, vm *armcompute. nodeClaim.Status.Allocatable = lo.PickBy(instanceType.Allocatable(), func(_ v1.ResourceName, v resource.Quantity) bool { return !resources.IsZero(v) }) } - // TODO: review logic for determining zone (AWS uses Zone from subnet resolved and aviailable from NodeClass conditions ...) - if zoneID, err := instance.GetZoneID(vm); err != nil { + if zone, err := utils.GetZone(vm); err != nil { logging.FromContext(ctx).Warnf("Failed to get zone for VM %s, %v", *vm.Name, err) } else { - zone := makeZone(*vm.Location, zoneID) // aks-node-validating-webhook protects v1.LabelTopologyZone, will be set elsewhere, so we use a different label labels[v1alpha2.AlternativeLabelTopologyZone] = zone } @@ -369,14 +367,6 @@ func GenerateNodeClaimName(vmName string) string { return strings.TrimLeft("aks-", vmName) } -// makeZone returns the zone value in format of -. -func makeZone(location string, zoneID string) string { - if zoneID == "" { - return "" - } - return fmt.Sprintf("%s-%s", strings.ToLower(location), zoneID) -} - // newTerminatingNodeClassError returns a NotFound error for handling by func newTerminatingNodeClassError(name string) *errors.StatusError { qualifiedResource := schema.GroupResource{Group: apis.Group, Resource: "aksnodeclasses"} diff --git a/pkg/fake/types.go b/pkg/fake/types.go index a76b22cf2..b2f427ce5 100644 --- a/pkg/fake/types.go +++ b/pkg/fake/types.go @@ -45,13 +45,12 @@ func (m *MockedFunction[I, O]) Reset() { } func (m *MockedFunction[I, O]) Invoke(input *I, defaultTransformer func(*I) (O, error)) (O, error) { + m.CalledWithInput.Add(input) err := m.Error.Get() if err != nil { m.failedCalls.Add(1) return *new(O), err } - m.CalledWithInput.Add(input) - if !m.Output.IsNil() { m.successfulCalls.Add(1) return *m.Output.Clone(), nil @@ -94,6 +93,8 @@ func (m *MockedLRO[I, O]) Reset() { } func (m *MockedLRO[I, O]) Invoke(input *I, defaultTransformer func(*I) (*O, error)) (*runtime.Poller[O], error) { + m.CalledWithInput.Add(input) + if err := m.BeginError.Get(); err != nil { m.failedCalls.Add(1) return nil, err @@ -103,8 +104,6 @@ func (m *MockedLRO[I, O]) Invoke(input *I, defaultTransformer func(*I) (*O, erro return newMockPoller[O](nil, err) } - m.CalledWithInput.Add(input) - if !m.Output.IsNil() { m.successfulCalls.Add(1) return newMockPoller(m.Output.Clone(), nil) diff --git a/pkg/providers/instance/armutils.go b/pkg/providers/instance/armutils.go index 11bd6adac..dcd899e5c 100644 --- a/pkg/providers/instance/armutils.go +++ b/pkg/providers/instance/armutils.go @@ -118,7 +118,8 @@ func deleteNicIfExists(ctx context.Context, client NetworkInterfacesAPI, rg, nic func deleteVirtualMachineIfExists(ctx context.Context, client VirtualMachinesAPI, rg, vmName string) error { _, err := client.Get(ctx, rg, vmName, nil) if err != nil { - if sdkerrors.IsNotFoundErr(err) { + azErr := sdkerrors.IsResponseError(err) + if azErr != nil && (azErr.ErrorCode == "NotFound" || azErr.ErrorCode == "ResourceNotFound") { return nil } return err diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 0149c875e..975eb2a92 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -37,6 +37,7 @@ import ( "github.com/Azure/karpenter-provider-azure/pkg/providers/instancetype" "github.com/Azure/karpenter-provider-azure/pkg/providers/launchtemplate" "github.com/Azure/karpenter-provider-azure/pkg/providers/loadbalancer" + "github.com/Azure/karpenter-provider-azure/pkg/utils" corecloudprovider "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/scheduling" @@ -140,7 +141,7 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1alpha2.AKSNod } return nil, err } - zone, err := GetZoneID(vm) + zone, err := utils.GetZone(vm) if err != nil { logging.FromContext(ctx).Error(err) } @@ -163,7 +164,8 @@ func (p *DefaultProvider) Get(ctx context.Context, vmName string) (*armcompute.V var err error if vm, err = p.azClient.virtualMachinesClient.Get(ctx, p.resourceGroup, vmName, nil); err != nil { - if sdkerrors.IsNotFoundErr(err) { + azErr := sdkerrors.IsResponseError(err) + if azErr != nil && (azErr.ErrorCode == "NotFound" || azErr.ErrorCode == "ResourceNotFound") { return nil, corecloudprovider.NewNodeClaimNotFoundError(err) } return nil, fmt.Errorf("failed to get VM instance, %w", err) @@ -374,7 +376,7 @@ func newVMObject( CapacityTypeToPriority[capacityType]), ), }, - Zones: lo.Ternary(len(zone) > 0, []*string{&zone}, []*string{}), + Zones: utils.MakeVMZone(zone), Tags: launchTemplate.Tags, } setVMPropertiesOSDiskType(vm.Properties, launchTemplate.StorageProfile) @@ -627,11 +629,6 @@ func (p *DefaultProvider) pickSkuSizePriorityAndZone(ctx context.Context, nodeCl }) zonesWithPriority := lo.Map(priorityOfferings, func(o corecloudprovider.Offering, _ int) string { return getOfferingZone(o) }) if zone, ok := sets.New(zonesWithPriority...).PopAny(); ok { - if len(zone) > 0 { - // Zones in zonal Offerings have - format; the zone returned from here will be used for VM instantiation, - // which expects just the zone number, without region - zone = string(zone[len(zone)-1]) - } return instanceType, priority, zone } return nil, "", "" @@ -751,26 +748,6 @@ func (p *DefaultProvider) getCSExtension(cse string, isWindows bool) *armcompute } } -// GetZoneID returns the zone ID for the given virtual machine, or an empty string if there is no zone specified -func GetZoneID(vm *armcompute.VirtualMachine) (string, error) { - if vm == nil { - return "", fmt.Errorf("cannot pass in a nil virtual machine") - } - if vm.Name == nil { - return "", fmt.Errorf("virtual machine is missing name") - } - if vm.Zones == nil { - return "", nil - } - if len(vm.Zones) == 1 { - return *(vm.Zones)[0], nil - } - if len(vm.Zones) > 1 { - return "", fmt.Errorf("virtual machine %v has multiple zones", *vm.Name) - } - return "", nil -} - func GetListQueryBuilder(rg string) *kql.Builder { return kql.New(`Resources`). AddLiteral(` | where type == "microsoft.compute/virtualmachines"`). diff --git a/pkg/providers/instance/instance_test.go b/pkg/providers/instance/instance_test.go index 67ee915f3..06710aa79 100644 --- a/pkg/providers/instance/instance_test.go +++ b/pkg/providers/instance/instance_test.go @@ -20,7 +20,6 @@ import ( "context" "testing" - "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute" "github.com/Azure/karpenter-provider-azure/pkg/cache" "github.com/stretchr/testify/assert" @@ -79,7 +78,7 @@ func TestGetPriorityCapacityAndInstanceType(t *testing.T) { }, nodeClaim: &karpv1.NodeClaim{}, expectedInstanceType: "Standard_D2s_v3", - expectedZone: "2", + expectedZone: "westus-2", expectedPriority: karpv1.CapacityTypeOnDemand, }, } @@ -101,60 +100,6 @@ func TestGetPriorityCapacityAndInstanceType(t *testing.T) { } } -func TestGetZone(t *testing.T) { - testVMName := "silly-armcompute" - tc := []struct { - testName string - input *armcompute.VirtualMachine - expectedZone string - expectedError string - }{ - { - testName: "missing name", - input: &armcompute.VirtualMachine{ - Name: nil, - }, - expectedError: "virtual machine is missing name", - }, - { - testName: "invalid virtual machine struct", - input: nil, - expectedError: "cannot pass in a nil virtual machine", - }, - { - testName: "invalid zones field in virtual machine struct", - input: &armcompute.VirtualMachine{ - Name: &testVMName, - }, - expectedError: "virtual machine silly-armcompute zones are nil", - }, - { - testName: "happy case", - input: &armcompute.VirtualMachine{ - Name: &testVMName, - Zones: []*string{to.Ptr("poland-central")}, - }, - expectedZone: "poland-central", - }, - { - testName: "emptyZones", - input: &armcompute.VirtualMachine{ - Name: &testVMName, - Zones: []*string{}, - }, - expectedError: "virtual machine silly-armcompute does not have any zones specified", - }, - } - - for _, c := range tc { - zone, err := GetZoneID(c.input) - assert.Equal(t, c.expectedZone, zone, c.testName) - if err != nil { - assert.Equal(t, c.expectedError, err.Error(), c.testName) - } - } -} - // Currently tested: ID, Name, Tags, Zones // TODO: Add the below attributes for Properties if needed: // Priority, InstanceView.HyperVGeneration, TimeCreated diff --git a/pkg/providers/instancetype/instancetypes.go b/pkg/providers/instancetype/instancetypes.go index 946db59e8..4ed007112 100644 --- a/pkg/providers/instancetype/instancetypes.go +++ b/pkg/providers/instancetype/instancetypes.go @@ -172,7 +172,7 @@ func instanceTypeZones(sku *skewer.SKU, region string) sets.Set[string] { skuZones := lo.Keys(sku.AvailabilityZones(region)) if hasZonalSupport(region) && len(skuZones) > 0 { return sets.New(lo.Map(skuZones, func(zone string, _ int) string { - return fmt.Sprintf("%s-%s", region, zone) + return utils.MakeZone(region, zone) })...) } return sets.New("") // empty string means non-zonal offering diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index ae1f0d515..972be555f 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -76,6 +76,8 @@ var coreProvisioner, coreProvisionerNonZonal *provisioning.Provisioner var cluster, clusterNonZonal *state.Cluster var cloudProvider, cloudProviderNonZonal *cloudprovider.CloudProvider +var fakeZone1 = utils.MakeZone(fake.Region, "1") + func TestAzure(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) @@ -589,8 +591,8 @@ var _ = Describe("InstanceType Provider", func() { Context("Unavailable Offerings", func() { It("should not allocate a vm in a zone marked as unavailable", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fmt.Sprintf("%s-1", fake.Region), karpv1.CapacityTypeSpot) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fmt.Sprintf("%s-1", fake.Region), karpv1.CapacityTypeOnDemand) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fakeZone1, karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fakeZone1, karpv1.CapacityTypeOnDemand) coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ NodeSelectorRequirement: v1.NodeSelectorRequirement{ Key: v1.LabelInstanceTypeStable, @@ -599,19 +601,38 @@ var _ = Describe("InstanceType Provider", func() { }}) ExpectApplied(ctx, env.Client, nodePool, nodeClass) - // Try this 100 times to make sure we don't get a node in eastus-1, - // we pick from 3 zones so the likelihood of this test passing by chance is 1/3^100 - for i := 0; i < 100; i++ { - pod := coretest.UnschedulablePod() - ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) - ExpectScheduled(ctx, env.Client, pod) - nodes := &v1.NodeList{} - Expect(env.Client.List(ctx, nodes)).To(Succeed()) - for _, node := range nodes.Items { - Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region))) - Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2")) - } - } + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1alpha2.AlternativeLabelTopologyZone]).ToNot(Equal(fakeZone1)) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) + }) + It("should handle ZonalAllocationFailed on creating the VM", func() { + azureEnv.VirtualMachinesAPI.VirtualMachinesBehavior.VirtualMachineCreateOrUpdateBehavior.Error.Set( + &azcore.ResponseError{ErrorCode: sdkerrors.ZoneAllocationFailed}, + ) + coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_D2_v2"}, + }}) + + ExpectApplied(ctx, env.Client, nodePool, nodeClass) + pod := coretest.UnschedulablePod() + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + ExpectNotScheduled(ctx, env.Client, pod) + + By("marking whatever zone was picked as unavailable - for both spot and on-demand") + zone, err := utils.GetZone(&azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM) + Expect(err).ToNot(HaveOccurred()) + Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable("Standard_D2_v2", zone, karpv1.CapacityTypeSpot)).To(BeTrue()) + Expect(azureEnv.UnavailableOfferingsCache.IsUnavailable("Standard_D2_v2", zone, karpv1.CapacityTypeOnDemand)).To(BeTrue()) + + By("successfully scheduling in a different zone on retry") + ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) + node := ExpectScheduled(ctx, env.Client, pod) + Expect(node.Labels[v1alpha2.AlternativeLabelTopologyZone]).ToNot(Equal(zone)) }) DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) { @@ -641,8 +662,8 @@ var _ = Describe("InstanceType Provider", func() { ) It("should launch instances in a different zone than preferred", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fmt.Sprintf("%s-1", fake.Region), karpv1.CapacityTypeOnDemand) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fmt.Sprintf("%s-1", fake.Region), karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fakeZone1, karpv1.CapacityTypeOnDemand) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "ZonalAllocationFailure", "Standard_D2_v2", fakeZone1, karpv1.CapacityTypeSpot) ExpectApplied(ctx, env.Client, nodeClass, nodePool) pod := coretest.UnschedulablePod(coretest.PodOptions{ @@ -651,18 +672,18 @@ var _ = Describe("InstanceType Provider", func() { pod.Spec.Affinity = &v1.Affinity{NodeAffinity: &v1.NodeAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ { Weight: 1, Preference: v1.NodeSelectorTerm{MatchExpressions: []v1.NodeSelectorRequirement{ - {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{fmt.Sprintf("%s-1", fake.Region)}}, + {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{fakeZone1}}, }}, }, }}} ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) node := ExpectScheduled(ctx, env.Client, pod) - Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region))) - Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2")) + Expect(node.Labels[v1alpha2.AlternativeLabelTopologyZone]).ToNot(Equal(fakeZone1)) + Expect(node.Labels[v1.LabelInstanceTypeStable]).To(Equal("Standard_D2_v2")) }) It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() { - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", "Standard_F16s_v2", fmt.Sprintf("%s-1", fake.Region), karpv1.CapacityTypeOnDemand) - azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", "Standard_F16s_v2", fmt.Sprintf("%s-1", fake.Region), karpv1.CapacityTypeSpot) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", "Standard_F16s_v2", fakeZone1, karpv1.CapacityTypeOnDemand) + azureEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "SubscriptionQuotaReached", "Standard_F16s_v2", fakeZone1, karpv1.CapacityTypeSpot) coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ NodeSelectorRequirement: v1.NodeSelectorRequirement{ Key: v1.LabelInstanceTypeStable, @@ -676,7 +697,7 @@ var _ = Describe("InstanceType Provider", func() { Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, }, NodeSelector: map[string]string{ - v1.LabelTopologyZone: fmt.Sprintf("%s-1", fake.Region), + v1.LabelTopologyZone: fakeZone1, }, })) } @@ -731,8 +752,8 @@ var _ = Describe("InstanceType Provider", func() { pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) ExpectNotScheduled(ctx, env.Client, pod) - for _, zone := range []string{"1", "2", "3"} { - ExpectUnavailable(azureEnv, sku, zone, capacityType) + for _, zoneID := range []string{"1", "2", "3"} { + ExpectUnavailable(azureEnv, sku, utils.MakeZone(fake.Region, zoneID), capacityType) } } @@ -793,7 +814,7 @@ var _ = Describe("InstanceType Provider", func() { // Well known v1.LabelTopologyRegion: fake.Region, karpv1.NodePoolLabelKey: nodePool.Name, - v1.LabelTopologyZone: fmt.Sprintf("%s-1", fake.Region), + v1.LabelTopologyZone: fakeZone1, v1.LabelInstanceTypeStable: "Standard_NC24ads_A100_v4", v1.LabelOSStable: "linux", v1.LabelArchStable: "amd64", @@ -814,11 +835,11 @@ var _ = Describe("InstanceType Provider", func() { v1alpha2.LabelSKUAccelerator: "A100", // Deprecated Labels v1.LabelFailureDomainBetaRegion: fake.Region, - v1.LabelFailureDomainBetaZone: fmt.Sprintf("%s-1", fake.Region), + v1.LabelFailureDomainBetaZone: fakeZone1, "beta.kubernetes.io/arch": "amd64", "beta.kubernetes.io/os": "linux", v1.LabelInstanceType: "Standard_NC24ads_A100_v4", - "topology.disk.csi.azure.com/zone": fmt.Sprintf("%s-1", fake.Region), + "topology.disk.csi.azure.com/zone": fakeZone1, v1.LabelWindowsBuild: "window", // Cluster Label v1alpha2.AKSLabelCluster: "test-cluster", diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go index f16b7fcfe..5184d2d27 100644 --- a/pkg/test/expectations/expectations.go +++ b/pkg/test/expectations/expectations.go @@ -24,13 +24,12 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/Azure/karpenter-provider-azure/pkg/fake" "github.com/Azure/karpenter-provider-azure/pkg/test" ) func ExpectUnavailable(env *test.Environment, instanceType string, zone string, capacityType string) { GinkgoHelper() - Expect(env.UnavailableOfferingsCache.IsUnavailable(instanceType, fmt.Sprintf("%s-%s", fake.Region, zone), capacityType)).To(BeTrue()) + Expect(env.UnavailableOfferingsCache.IsUnavailable(instanceType, zone, capacityType)).To(BeTrue()) } func ExpectKubeletFlags(env *test.Environment, customData string, expectedFlags map[string]string) { diff --git a/pkg/utils/zone.go b/pkg/utils/zone.go new file mode 100644 index 000000000..a1efe304e --- /dev/null +++ b/pkg/utils/zone.go @@ -0,0 +1,61 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "fmt" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute" +) + +// MakeZone returns the zone value in format of -. +func MakeZone(location string, zoneID string) string { + if zoneID == "" { + return "" + } + return fmt.Sprintf("%s-%s", strings.ToLower(location), zoneID) +} + +// VM Zones field expects just the zone number, without region +func MakeVMZone(zone string) []*string { + if zone == "" { + return []*string{} + } + zoneNum := zone[len(zone)-1:] + return []*string{&zoneNum} +} + +// GetZone returns the zone for the given virtual machine, or an empty string if there is no zone specified +func GetZone(vm *armcompute.VirtualMachine) (string, error) { + if vm == nil { + return "", fmt.Errorf("cannot pass in a nil virtual machine") + } + if vm.Zones == nil { + return "", nil + } + if len(vm.Zones) == 1 { + if vm.Location == nil { + return "", fmt.Errorf("virtual machine is missing location") + } + return MakeZone(*vm.Location, *(vm.Zones)[0]), nil + } + if len(vm.Zones) > 1 { + return "", fmt.Errorf("virtual machine has multiple zones") + } + return "", nil +} diff --git a/pkg/utils/zone_test.go b/pkg/utils/zone_test.go new file mode 100644 index 000000000..ff9d9c353 --- /dev/null +++ b/pkg/utils/zone_test.go @@ -0,0 +1,86 @@ +/* +Portions Copyright (c) Microsoft Corporation. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils_test + +import ( + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute" + "github.com/Azure/karpenter-provider-azure/pkg/utils" + "github.com/stretchr/testify/assert" +) + +func TestGetZone(t *testing.T) { + tc := []struct { + testName string + input *armcompute.VirtualMachine + expectedZone string + expectedError string + }{ + { + testName: "invalid virtual machine struct", + input: nil, + expectedError: "cannot pass in a nil virtual machine", + }, + { + testName: "happy case", + input: &armcompute.VirtualMachine{ + Location: to.Ptr("region"), + Zones: []*string{to.Ptr("1")}, + }, + expectedZone: "region-1", + }, + { + testName: "missing Location", + input: &armcompute.VirtualMachine{ + Zones: []*string{to.Ptr("1")}, + }, + expectedError: "virtual machine is missing location", + }, + { + testName: "multiple zones", + input: &armcompute.VirtualMachine{ + Zones: []*string{to.Ptr("1"), to.Ptr("2")}, + }, + expectedError: "virtual machine has multiple zones", + }, + { + testName: "empty Zones", + input: &armcompute.VirtualMachine{ + Zones: []*string{}, + }, + expectedZone: "", + }, + { + testName: "nil Zones", + input: &armcompute.VirtualMachine{}, + expectedZone: "", + }, + } + + for _, c := range tc { + zone, err := utils.GetZone(c.input) + assert.Equal(t, c.expectedZone, zone, c.testName) + if err == nil && c.expectedError != "" { + assert.Fail(t, "expected error but got nil", c.testName) + } + if err != nil { + assert.Equal(t, c.expectedError, err.Error(), c.testName) + } + } +} diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index af3876eee..75274e0e8 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -366,7 +366,12 @@ func (env *Environment) EventuallyExpectKarpenterRestarted() { GinkgoHelper() By("rolling out the new karpenter deployment") env.EventuallyExpectRollout("karpenter", "kube-system") - env.ExpectKarpenterLeaseOwnerChanged() + + if !lo.ContainsBy(env.ExpectSettings(), func(v corev1.EnvVar) bool { + return v.Name == "DISABLE_LEADER_ELECTION" && v.Value == "true" + }) { + env.ExpectKarpenterLeaseOwnerChanged() + } } func (env *Environment) ExpectKarpenterLeaseOwnerChanged() { diff --git a/test/suites/drift/suite_test.go b/test/suites/drift/suite_test.go index b2d06e2c4..4d7fce780 100644 --- a/test/suites/drift/suite_test.go +++ b/test/suites/drift/suite_test.go @@ -61,8 +61,6 @@ var _ = Describe("Drift", func() { var pod *corev1.Pod BeforeEach(func() { - env.ExpectSettingsOverridden(corev1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"}) - coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ NodeSelectorRequirement: corev1.NodeSelectorRequirement{ Key: corev1.LabelInstanceTypeStable,