From 61d82d318ebac0193d2eda6d1131389ee8137274 Mon Sep 17 00:00:00 2001 From: Alex Leites <18728999+tallaxes@users.noreply.github.com> Date: Tue, 14 Nov 2023 09:59:32 -0800 Subject: [PATCH] fix: remove confidential VM SKUs (DC,EC) from consideration (#23) --- hack/codegen.sh | 2 +- pkg/fake/zz_generated.sku.go | 45 ++++++++++++- pkg/providers/instancetype/instancetypes.go | 70 +++++++++++++-------- pkg/providers/instancetype/suite_test.go | 34 ++++++---- 4 files changed, 110 insertions(+), 41 deletions(-) diff --git a/hack/codegen.sh b/hack/codegen.sh index c0444eb9f..177c5ed89 100755 --- a/hack/codegen.sh +++ b/hack/codegen.sh @@ -61,7 +61,7 @@ skugen() { SUBJECT="SKUGEN" NO_UPDATE=$' pkg/fake/zz_generated.sku.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)' - go run hack/code/instancetype_testdata_gen.go -- "${GENERATED_FILE}" "eastus" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_F16s_v2,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3" + go run hack/code/instancetype_testdata_gen.go -- "${GENERATED_FILE}" "eastus" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_F16s_v2,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3,Standard_DC8s_v3" GIT_DIFF=$(git diff --stat "${GENERATED_FILE}") checkForUpdates "${GIT_DIFF}" "${NO_UPDATE}" "${SUBJECT} beside timestamps since last update" "${GENERATED_FILE}" diff --git a/pkg/fake/zz_generated.sku.go b/pkg/fake/zz_generated.sku.go index 62a8ef09b..122c798c3 100644 --- a/pkg/fake/zz_generated.sku.go +++ b/pkg/fake/zz_generated.sku.go @@ -24,7 +24,7 @@ import ( "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" ) -// generated at 2023-11-04T06:56:34Z +// generated at 2023-11-10T01:03:33Z // ResourceSkus is a list of all available skus from azure var ResourceSkus = []compute.ResourceSku{ @@ -386,6 +386,49 @@ var ResourceSkus = []compute.ResourceSku{ }, }, }, + { + Name: lo.ToPtr("Standard_DC8s_v3"), + Tier: lo.ToPtr("Standard"), + Kind: lo.ToPtr(""), + Size: lo.ToPtr("DC8s_v3"), + Family: lo.ToPtr("standardDCSv3Family"), + ResourceType: lo.ToPtr("virtualMachines"), + APIVersions: &[]string{}, + Costs: &[]compute.ResourceSkuCosts{}, + Restrictions: &[]compute.ResourceSkuRestrictions{}, + Capabilities: &[]compute.ResourceSkuCapabilities{ + {Name: lo.ToPtr("MaxResourceVolumeMB"), Value: lo.ToPtr("0")}, + {Name: lo.ToPtr("OSVhdSizeMB"), Value: lo.ToPtr("1047552")}, + {Name: lo.ToPtr("vCPUs"), Value: lo.ToPtr("8")}, + {Name: lo.ToPtr("MemoryPreservingMaintenanceSupported"), Value: lo.ToPtr("True")}, + {Name: lo.ToPtr("HyperVGenerations"), Value: lo.ToPtr("V2")}, + {Name: lo.ToPtr("MemoryGB"), Value: lo.ToPtr("64")}, + {Name: lo.ToPtr("MaxDataDiskCount"), Value: lo.ToPtr("32")}, + {Name: lo.ToPtr("CpuArchitectureType"), Value: lo.ToPtr("x64")}, + {Name: lo.ToPtr("LowPriorityCapable"), Value: lo.ToPtr("True")}, + {Name: lo.ToPtr("PremiumIO"), Value: lo.ToPtr("True")}, + {Name: lo.ToPtr("VMDeploymentTypes"), Value: lo.ToPtr("IaaS")}, + {Name: lo.ToPtr("vCPUsAvailable"), Value: lo.ToPtr("8")}, + {Name: lo.ToPtr("vCPUsPerCore"), Value: lo.ToPtr("1")}, + {Name: lo.ToPtr("CombinedTempDiskAndCachedIOPS"), Value: lo.ToPtr("77000")}, + {Name: lo.ToPtr("CombinedTempDiskAndCachedReadBytesPerSecond"), Value: lo.ToPtr("77000000000")}, + {Name: lo.ToPtr("CombinedTempDiskAndCachedWriteBytesPerSecond"), Value: lo.ToPtr("77000000000")}, + {Name: lo.ToPtr("UncachedDiskIOPS"), Value: lo.ToPtr("25600")}, + {Name: lo.ToPtr("UncachedDiskBytesPerSecond"), Value: lo.ToPtr("384000000")}, + {Name: lo.ToPtr("EphemeralOSDiskSupported"), Value: lo.ToPtr("False")}, + {Name: lo.ToPtr("EncryptionAtHostSupported"), Value: lo.ToPtr("True")}, + {Name: lo.ToPtr("CapacityReservationSupported"), Value: lo.ToPtr("False")}, + {Name: lo.ToPtr("AcceleratedNetworkingEnabled"), Value: lo.ToPtr("True")}, + {Name: lo.ToPtr("RdmaEnabled"), Value: lo.ToPtr("False")}, + {Name: lo.ToPtr("MaxNetworkInterfaces"), Value: lo.ToPtr("8")}, + }, + Locations: &[]string{"eastus"}, + LocationInfo: &[]compute.ResourceSkuLocationInfo{{Location: lo.ToPtr("eastus"), Zones: &[]string{ + "1", + }, + }, + }, + }, { Name: lo.ToPtr("Standard_DS2_v2"), Tier: lo.ToPtr("Standard"), diff --git a/pkg/providers/instancetype/instancetypes.go b/pkg/providers/instancetype/instancetypes.go index 6ae0b9644..3453058c5 100644 --- a/pkg/providers/instancetype/instancetypes.go +++ b/pkg/providers/instancetype/instancetypes.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "net/http" + "strings" "sync" "time" @@ -88,9 +89,6 @@ func (p *Provider) List( logging.FromContext(ctx).Errorf("parsing VM size %s, %v", *sku.Size, err) continue } - if !p.isSupportedSize(vmsize) { - continue - } architecture, err := sku.GetCPUArchitectureType() if err != nil { logging.FromContext(ctx).Errorf("parsing SKU architecture %s, %v", *sku.Size, err) @@ -152,7 +150,13 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU skus := cache.List(ctx, skewer.ResourceTypeFilter(skewer.VirtualMachines)) logging.FromContext(ctx).Debugf("Discovered %d SKUs", len(skus)) for i := range skus { - if p.isSupported(&skus[i]) { + vmsize, err := skus[i].GetVMSize() + if err != nil { + logging.FromContext(ctx).Errorf("parsing VM size %s, %v", *skus[i].Size, err) + continue + } + + if p.isSupported(&skus[i], vmsize) { instanceTypes[skus[i].GetName()] = &skus[i] } } @@ -163,36 +167,48 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU } // isSupported indicates SKU is supported by AKS, based on SKU properties -func (p *Provider) isSupported(sku *skewer.SKU) bool { - name := lo.FromPtr(sku.Name) +func (p *Provider) isSupported(sku *skewer.SKU, vmsize *skewer.VMSizeType) bool { + return p.hasMinimumCPU(sku) && + p.hasMinimumMemory(sku) && + !p.isUnsupportedByAKS(sku) && + !p.isUnsupportedGPU(sku) && + !p.hasConstrainedCPUs(vmsize) && + !p.isConfidential(sku) +} - // less than 2 cpus - if cpu, err := sku.VCPU(); err == nil && cpu < 2 { - return false - } +// at least 2 cpus +func (p *Provider) hasMinimumCPU(sku *skewer.SKU) bool { + cpu, err := sku.VCPU() + return err == nil && cpu >= 2 +} - // less then 3.5 GiB of memory - if memGiB, err := sku.Memory(); err == nil && memGiB < 3.5 { - return false - } +// at least 3.5 GiB of memory +func (p *Provider) hasMinimumMemory(sku *skewer.SKU) bool { + memGiB, err := sku.Memory() + return err == nil && memGiB >= 3.5 +} - // filter out instances AKS does not support - if RestrictedVMSizes.Has(sku.GetName()) { - return false - } +// instances AKS does not support +func (p *Provider) isUnsupportedByAKS(sku *skewer.SKU) bool { + return RestrictedVMSizes.Has(sku.GetName()) +} - // filter out GPU SKUs AKS does not support - if gpu, err := sku.GPU(); err == nil && gpu > 0 && !(utils.IsNvidiaEnabledSKU(name) || utils.IsMarinerEnabledGPUSKU(name)) { - return false - } +// GPU SKUs AKS does not support +func (p *Provider) isUnsupportedGPU(sku *skewer.SKU) bool { + name := lo.FromPtr(sku.Name) + gpu, err := sku.GPU() + return err == nil && gpu > 0 && !(utils.IsNvidiaEnabledSKU(name) || utils.IsMarinerEnabledGPUSKU(name)) +} - return true +// SKU with constrained CPUs +func (p *Provider) hasConstrainedCPUs(vmsize *skewer.VMSizeType) bool { + return vmsize.CpusConstrained != nil } -// isSupportedSize indicates VM size is supported by AKS, based on additional SKU properties -func (p *Provider) isSupportedSize(size *skewer.VMSizeType) bool { - // any SKU with constrained CPUs - return size.CpusConstrained == nil +// confidential VMs (DC, EC) are not yet supported by this Karpenter provider +func (p *Provider) isConfidential(sku *skewer.SKU) bool { + size := sku.GetSize() + return strings.HasPrefix(size, "DC") || strings.HasPrefix(size, "EC") } // MaxEphemeralOSDiskSizeGB returns the maximum ephemeral OS disk size for a given SKU. diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index e43bc1464..2c40fab53 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -131,14 +131,30 @@ var _ = Describe("InstanceType Provider", func() { }) Context("Filtering in InstanceType Provider List", func() { - It("should filter out skus that are explicitly marked as restricted", func() { - instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, nodeClass) - Expect(err).NotTo(HaveOccurred()) - for _, instanceType := range instanceTypes { - // We should not see any instance types in the restricted list - Expect(instancetype.RestrictedVMSizes.Has(instanceType.Name)).To(BeFalse()) + var instanceTypes corecloudprovider.InstanceTypes + var err error + getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } + + BeforeEach(func() { + instanceTypes, err = azureEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, nodeClass) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should not include SKUs marked as restricted", func() { + isRestricted := func(instanceType *corecloudprovider.InstanceType) bool { + return instancetype.RestrictedVMSizes.Has(instanceType.Name) } + Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(isRestricted, Equal(true)))) + Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(isRestricted, Equal(true)))) }) + It("should not include SKUs with constrained CPUs, but include unconstrained ones", func() { + Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_M8-2ms")))) + Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2_v2")))) + }) + It("should not include confidential SKUs", func() { + Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_DC8s_v3")))) + }) + }) Context("Ephemeral Disk", func() { @@ -564,12 +580,6 @@ var _ = Describe("InstanceType Provider", func() { Expect(ok).To(BeTrue(), "Expected nvidia.com/gpu to be present in capacity, and be zero") Expect(gpuQuanityNonGPU.Value()).To(Equal(int64(0))) }) - - It("should not include SKUs with constrained CPUs, but include unconstrained ones", func() { - getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name } - Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_M8-2ms")))) - Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2_v2")))) - }) }) Context("Instance Types", func() {