From 6f1c1c669bd1bdea72c6a0dbcc7dba764a1a7066 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 18 Mar 2022 02:35:21 -0700 Subject: [PATCH] Setup workload identity via terraform The GKE config connector was helpful in letting us deploy Google Cloud Service Accounts with permissions for cloud storage directly just from helm. However, it has been difficult to debug, and in https://github.com/2i2c-org/infrastructure/issues/669 we decided to move away from it and towards creating these cloud resources via Terraform. This commit adds: - Terraform code that will create a Google Service Account, bind it to a given Kubernetes Service Account, for a list of hub namespaces passed in. This means that some hub initial deployments now *can not be done just with CD*, but need manual work with terraform. I think this would be any hub that wants to use requester pays or scratch buckets. This would need to be documented. - Move meom-ige to use this new scheme. metadata concealment (https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata#concealment) which is what we were using earlier as alternative to config-connector + workload identity, is no longer supported by the terraform google provider. In b7b42ced82a1b3ad9d6936fa78c5a099ce5004f3, we changed the default from 'SECURE' to 'UNSPECIFIED', but it looks like 'UNSPECIFIED' really means 'use workload identity' haha. When https://github.com/2i2c-org/infrastructure/pull/1124 was deployed to meom-ige yesterday, it seems to have enabled workload identity, causing cloud access to stop working, leading to https://2i2c.freshdesk.com/a/tickets/107. Further investigation on what happened here is needed, but I've currently fixed it by just deploying this change for meom-ige. - All hubs are given access to all buckets we create. This is inadequate, and needs to be more fine-grained. 
Ref https://github.com/2i2c-org/infrastructure/issues/669 Ref https://github.com/2i2c-org/infrastructure/issues/1046 --- terraform/gcp/buckets.tf | 19 ++++++-- terraform/gcp/cluster.tf | 40 ++++++++++------ terraform/gcp/main.tf | 42 +++++------------ terraform/gcp/projects/meom-ige.tfvars | 4 +- terraform/gcp/variables.tf | 31 +++++-------- terraform/gcp/workload-identity.tf | 63 ++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 70 deletions(-) create mode 100644 terraform/gcp/workload-identity.tf diff --git a/terraform/gcp/buckets.tf b/terraform/gcp/buckets.tf index 5cf759d857..1066718584 100644 --- a/terraform/gcp/buckets.tf +++ b/terraform/gcp/buckets.tf @@ -12,10 +12,21 @@ resource "google_storage_bucket" "user_buckets" { labels = {} } -resource "google_storage_bucket_iam_member" "member" { +locals { + # Nested for loop, thanks to https://www.daveperrett.com/articles/2021/08/19/nested-for-each-with-terraform/ + bucket_permissions = distinct(flatten([ + for hub_name in var.workload_identity_enabled_hubs : [ + for bucket_name in var.user_buckets : { + hub_name = hub_name + bucket_name = bucket_name + } + ] + ])) +} - for_each = var.user_buckets - bucket = google_storage_bucket.user_buckets[each.key].name +resource "google_storage_bucket_iam_member" "member" { + for_each = { for bp in local.bucket_permissions : "${bp.hub_name}.${bp.bucket_name}" => bp } + bucket = google_storage_bucket.user_buckets[each.value.bucket_name].name role = "roles/storage.admin" - member = "serviceAccount:${google_service_account.cluster_sa.email}" + member = "serviceAccount:${google_service_account.workload_sa[each.value.hub_name].email}" } diff --git a/terraform/gcp/cluster.tf b/terraform/gcp/cluster.tf index 8d21e22098..ac13251f2f 100644 --- a/terraform/gcp/cluster.tf +++ b/terraform/gcp/cluster.tf @@ -1,5 +1,27 @@ +resource "google_service_account" "cluster_sa" { + account_id = "${var.prefix}-cluster-sa" + display_name = "Service account used by nodes of cluster 
${var.prefix}" + project = var.project_id +} + +resource "google_project_iam_member" "cluster_sa_roles" { + # https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster + # has information on why the cluster SA needs these rights + for_each = toset([ + "roles/logging.logWriter", + "roles/monitoring.metricWriter", + "roles/monitoring.viewer", + "roles/stackdriver.resourceMetadata.writer", + "roles/artifactregistry.reader" + ]) + + project = var.project_id + role = each.value + member = "serviceAccount:${google_service_account.cluster_sa.email}" +} + resource "google_container_cluster" "cluster" { - # config_connector_config is in beta + # Setting cluster autoscaling profile is in google-beta provider = google-beta name = "${var.prefix}-cluster" @@ -61,18 +83,10 @@ resource "google_container_cluster" "cluster" { // This isn't used anywhere, so let's turn this off disabled = true } - config_connector_config { - enabled = var.config_connector_enabled - } } - dynamic "workload_identity_config" { - # Setup workload identity only if we're using config connector, otherwise - # just metadata concealment is used - for_each = var.config_connector_enabled == "" ? [] : [1] - content { - workload_pool = "${var.project_id}.svc.id.goog" - } + workload_identity_config { + workload_pool = "${var.project_id}.svc.id.goog" } release_channel { @@ -208,7 +222,7 @@ resource "google_container_node_pool" "notebook" { # to expose the node CA to users safely. # FIXME: This should be a bit more fine-grained - it should be possible to disable # config connector and completely hide all node metadata from user pods - mode = var.config_connector_enabled ? "GKE_METADATA" : "MODE_UNSPECIFIED" + mode = "GKE_METADATA" } labels = merge({ # Notebook pods and dask schedulers can exist here @@ -278,7 +292,7 @@ resource "google_container_node_pool" "dask_worker" { # to expose the node CA to users safely. 
# FIXME: This should be a bit more fine-grained - it should be possible to disable # config connector and completely hide all node metadata from user pods - mode = var.config_connector_enabled ? "GKE_METADATA" : "MODE_UNSPECIFIED" + mode = "GKE_METADATA" } labels = merge({ "k8s.dask.org/node-purpose" = "worker", diff --git a/terraform/gcp/main.tf b/terraform/gcp/main.tf index c3584892c0..3a4a1733f6 100644 --- a/terraform/gcp/main.tf +++ b/terraform/gcp/main.tf @@ -9,40 +9,20 @@ terraform { source = "google-beta" version = "4.11.0" } + kubernetes = { + version = "2.8.0" + } } } -// Service account used by all the nodes and pods in our cluster -resource "google_service_account" "cluster_sa" { - account_id = "${var.prefix}-cluster-sa" - display_name = "Cluster SA for ${var.prefix}" - project = var.project_id -} - -// To access GCS buckets with requestor pays, the calling code needs -// to have serviceusage.services.use permission. We create a role -// granting just this to provide the cluster SA, so user pods can -// use it. See https://cloud.google.com/storage/docs/requester-pays -// for more info -resource "google_project_iam_custom_role" "identify_project_role" { - // Role names can't contain -, so we swap them out. 
BOO - role_id = replace("${var.prefix}_user_sa_role", "-", "_") - project = var.project_id - title = "Identify as project role for users in ${var.prefix}" - description = "Minimal role for hub users on ${var.prefix} to identify as current project" - permissions = ["serviceusage.services.use"] -} +data "google_client_config" "default" {} -resource "google_project_iam_member" "identify_project_binding" { - project = var.project_id - role = google_project_iam_custom_role.identify_project_role.name - member = "serviceAccount:${google_service_account.cluster_sa.email}" +provider "kubernetes" { + # From https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/guides/getting-started#provider-setup + host = "https://${google_container_cluster.cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode( + google_container_cluster.cluster.master_auth.0.cluster_ca_certificate + ) } -resource "google_project_iam_member" "cluster_sa_roles" { - for_each = var.cluster_sa_roles - - project = var.project_id - role = each.value - member = "serviceAccount:${google_service_account.cluster_sa.email}" -} diff --git a/terraform/gcp/projects/meom-ige.tfvars b/terraform/gcp/projects/meom-ige.tfvars index 4d3c914e46..9fbf903d31 100644 --- a/terraform/gcp/projects/meom-ige.tfvars +++ b/terraform/gcp/projects/meom-ige.tfvars @@ -15,9 +15,7 @@ core_node_machine_type = "g1-small" # Single-tenant cluster, network policy not needed enable_network_policy = false -# Single tenant cluster, so bucket access is provided via -# metadata concealment + node SA. Config Connector not needed. 
-config_connector_enabled = false +workload_identity_enabled_hubs = ["staging", "prod"] notebook_nodes = { "small" : { diff --git a/terraform/gcp/variables.tf b/terraform/gcp/variables.tf index a3d9008d6a..42181bc275 100644 --- a/terraform/gcp/variables.tf +++ b/terraform/gcp/variables.tf @@ -51,25 +51,6 @@ variable "config_connector_enabled" { EOT } -variable "cluster_sa_roles" { - type = set(string) - default = [ - "roles/logging.logWriter", - "roles/monitoring.metricWriter", - "roles/monitoring.viewer", - "roles/stackdriver.resourceMetadata.writer", - "roles/artifactregistry.reader" - ] - description = <<-EOT - List of roles granted to the SA assumed by cluster nodes. - - The defaults grant just enough access for the components on the node - to write metrics & logs to stackdriver, and pull images from artifact registry. - - https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster - has more information. - EOT -} variable "cd_sa_roles" { type = set(string) @@ -274,3 +255,15 @@ variable "max_cpu" { Default = 1000 EOT } + +variable "workload_identity_enabled_hubs" { + type = set(string) + default = [] + description = <<-EOT + List of hubs that will get workload identity enabled. + + This should match individual namespaces that exist in the cluster. An + appropriate Google Cloud Service Account will be created for *each* of these, + and a Kubernetes Service Account will also be created. + EOT +} diff --git a/terraform/gcp/workload-identity.tf b/terraform/gcp/workload-identity.tf new file mode 100644 index 0000000000..eca65f825d --- /dev/null +++ b/terraform/gcp/workload-identity.tf @@ -0,0 +1,63 @@ +# User pods need to authenticate to cloud APIs - particularly around storage. +# On GKE, Workload Identity (https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) +# is the canonical way to do this securely. 
A Google Cloud Service Account (GSA) +# is created and given appropriate rights, and then bound to a Kubernetes Service Account (KSA) +# via workload identity. All pods that then mount this kubernetes service account (named user-sa) +# get the cloud permissions assigned to the Google Cloud Service Account. +# +# Since each cluster can contain multiple hubs, we need to tell terraform which hubs we want +# to equip with the KSA that has cloud credentials. Terraform will create this Kubernetes +# Service Account (and the namespace, if it does not exist). + +resource "google_service_account" "workload_sa" { + for_each = var.workload_identity_enabled_hubs + account_id = "${var.prefix}-${each.value}-workload-sa" + display_name = "Service account for user pods in hub ${each.value} in ${var.prefix}" + project = var.project_id +} + +# To access GCS buckets with requestor pays, the calling code needs +# to have serviceusage.services.use permission. We create a role +# granting just this to provide the workload SA, so user pods can +# use it. See https://cloud.google.com/storage/docs/requester-pays +# for more info +resource "google_project_iam_custom_role" "workload_role" { + // Role names can't contain -, so we swap them out. 
BOO + role_id = replace("${var.prefix}_workload_sa_role", "-", "_") + project = var.project_id + title = "Identify as project role for users in ${var.prefix}" + description = "Minimal role for hub users on ${var.prefix} to identify as current project" + permissions = ["serviceusage.services.use"] +} + +resource "google_project_iam_member" "workload_binding" { + for_each = var.workload_identity_enabled_hubs + project = var.project_id + role = google_project_iam_custom_role.workload_role.name + member = "serviceAccount:${google_service_account.workload_sa[each.value].email}" +} + +# Bind the Kubernetes Service Accounts to their appropriate Google Cloud Service Accounts +resource "google_service_account_iam_binding" "workload_identity_binding" { + for_each = var.workload_identity_enabled_hubs + service_account_id = google_service_account.workload_sa[each.value].id + role = "roles/iam.workloadIdentityUser" + members = [ + "serviceAccount:${var.project_id}.svc.id.goog[${each.value}/user-sa]" + ] +} + +# Create the Service Account in the Kubernetes Namespace +# FIXME: We might need to create the k8s namespace here some of the time, but then who is +# responsible for that - terraform or helm (via our deployer?) +resource "kubernetes_service_account" "workload_kubernetes_sa" { + for_each = var.workload_identity_enabled_hubs + + metadata { + name = "user-sa" + namespace = each.value + annotations = { + "iam.gke.io/gcp-service-account" = google_service_account.workload_sa[each.value].email + } + } +}