Skip to content

Commit

Permalink
Setup workload identity via terraform
Browse files Browse the repository at this point in the history
The GKE config connector was helpful in letting us deploy
Google Cloud Service Accounts with permissions for cloud storage
directly just from helm. However, it has been difficult to debug,
and in 2i2c-org#669
we decided to move away from it and towards creating these
cloud resources via Terraform.

This commit adds:
- Terraform code that will create a Google Service Account,
  bind it to a given Kubernetes Service Account, for a list of
  hub namespaces passed in. This means that some hub initial deployments
  now *cannot be done just with CD*, but need manual work with
  terraform. I think this would be any hub that wants to use
  requestor pays or scratch buckets. This would need to be
  documented.
- Move meom-ige to use this new scheme. metadata concealment
  (https://cloud.google.com/kubernetes-engine/docs/how-to/protecting-cluster-metadata#concealment)
  which is what we were using earlier as an alternative to config-connector
  + workload identity, is no longer supported by the terraform
  google provider. In b7b42ce,
  we changed the default from 'SECURE' to 'UNSPECIFIED', but
  it looks like 'UNSPECIFIED' really means 'use workload identity'
  haha. When 2i2c-org#1124 was
  deployed to meom-ige yesterday, it seems to have enabled workload
  identity, causing cloud access to stop working, leading to
  https://2i2c.freshdesk.com/a/tickets/107. Further investigation on
  what happened here is needed, but I've currently fixed it by
  just deploying this change for meom-ige.
- All hubs are given access to all buckets we create. This is
  inadequate, and needs to be more fine-grained.

Ref 2i2c-org#669
Ref 2i2c-org#1046
  • Loading branch information
yuvipanda committed Mar 18, 2022
1 parent 194d9d7 commit 322227c
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 70 deletions.
19 changes: 15 additions & 4 deletions terraform/gcp/buckets.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,21 @@ resource "google_storage_bucket" "user_buckets" {
labels = {}
}

resource "google_storage_bucket_iam_member" "member" {
locals {
  # Every (hub, bucket) combination, flattened into a single list of
  # { hub_name, bucket_name } objects so that one for_each can iterate over
  # the pairs. Nested-for technique from
  # https://www.daveperrett.com/articles/2021/08/19/nested-for-each-with-terraform/
  bucket_permissions = distinct(flatten([
    for hub in var.workload_identity_enabled_hubs : [
      for bucket in var.user_buckets : {
        hub_name    = hub
        bucket_name = bucket
      }
    ]
  ]))
}

for_each = var.user_buckets
bucket = google_storage_bucket.user_buckets[each.key].name
resource "google_storage_bucket_iam_member" "member" {
for_each = { for bp in local.bucket_permissions : "${bp.hub_name}.${bp.bucket_name}" => bp }
bucket = google_storage_bucket.user_buckets[each.value.bucket_name].name
role = "roles/storage.admin"
member = "serviceAccount:${google_service_account.cluster_sa.email}"
member = "serviceAccount:${google_service_account.workload_sa[each.value.hub_name].email}"
}
40 changes: 27 additions & 13 deletions terraform/gcp/cluster.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
# Google Service Account for the nodes of this cluster (see display_name).
resource "google_service_account" "cluster_sa" {
  project      = var.project_id
  account_id   = "${var.prefix}-cluster-sa"
  display_name = "Service account used by nodes of cluster ${var.prefix}"
}

# Project-level roles granted to the cluster service account, one IAM member
# resource per role.
# https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster
# has information on why the cluster SA needs these rights.
resource "google_project_iam_member" "cluster_sa_roles" {
  for_each = toset([
    "roles/logging.logWriter",
    "roles/monitoring.metricWriter",
    "roles/monitoring.viewer",
    "roles/stackdriver.resourceMetadata.writer",
    "roles/artifactregistry.reader"
  ])

  member  = "serviceAccount:${google_service_account.cluster_sa.email}"
  role    = each.key
  project = var.project_id
}

resource "google_container_cluster" "cluster" {
# config_connector_config is in beta
# Setting cluster autoscaling profile is in google-beta
provider = google-beta

name = "${var.prefix}-cluster"
Expand Down Expand Up @@ -61,18 +83,10 @@ resource "google_container_cluster" "cluster" {
// This isn't used anywhere, so let's turn this off
disabled = true
}
config_connector_config {
enabled = var.config_connector_enabled
}
}

dynamic "workload_identity_config" {
# Setup workload identity only if we're using config connector, otherwise
# just metadata concealment is used
for_each = var.config_connector_enabled == "" ? [] : [1]
content {
workload_pool = "${var.project_id}.svc.id.goog"
}
workload_identity_config {
workload_pool = "${var.project_id}.svc.id.goog"
}

release_channel {
Expand Down Expand Up @@ -208,7 +222,7 @@ resource "google_container_node_pool" "notebook" {
# to expose the node CA to users safely.
# FIXME: This should be a bit more fine-grained - it should be possible to disable
# config connector and completely hide all node metadata from user pods
mode = var.config_connector_enabled ? "GKE_METADATA" : "MODE_UNSPECIFIED"
mode = "GKE_METADATA"
}
labels = merge({
# Notebook pods and dask schedulers can exist here
Expand Down Expand Up @@ -278,7 +292,7 @@ resource "google_container_node_pool" "dask_worker" {
# to expose the node CA to users safely.
# FIXME: This should be a bit more fine-grained - it should be possible to disable
# config connector and completely hide all node metadata from user pods
mode = var.config_connector_enabled ? "GKE_METADATA" : "MODE_UNSPECIFIED"
mode = "GKE_METADATA"
}
labels = merge({
"k8s.dask.org/node-purpose" = "worker",
Expand Down
42 changes: 11 additions & 31 deletions terraform/gcp/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,40 +9,20 @@ terraform {
source = "google-beta"
version = "4.11.0"
}
kubernetes = {
version = "2.8.0"
}
}
}

// Service account used by all the nodes and pods in our cluster
resource "google_service_account" "cluster_sa" {
account_id = "${var.prefix}-cluster-sa"
display_name = "Cluster SA for ${var.prefix}"
project = var.project_id
}

// To access GCS buckets with requestor pays, the calling code needs
// to have serviceusage.services.use permission. We create a role
// granting just this to provide the cluster SA, so user pods can
// use it. See https://cloud.google.com/storage/docs/requester-pays
// for more info
resource "google_project_iam_custom_role" "identify_project_role" {
// Role names can't contain -, so we swap them out. BOO
role_id = replace("${var.prefix}_user_sa_role", "-", "_")
project = var.project_id
title = "Identify as project role for users in ${var.prefix}"
description = "Minimal role for hub users on ${var.prefix} to identify as current project"
permissions = ["serviceusage.services.use"]
}
data "google_client_config" "default" {}

resource "google_project_iam_member" "identify_project_binding" {
project = var.project_id
role = google_project_iam_custom_role.identify_project_role.name
member = "serviceAccount:${google_service_account.cluster_sa.email}"
# Point the kubernetes provider at the GKE cluster defined in this same
# configuration, using the endpoint and CA certificate exported by the cluster
# resource and the access token from the google_client_config data source.
# From https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/guides/getting-started#provider-setup
provider "kubernetes" {
  host  = "https://${google_container_cluster.cluster.endpoint}"
  token = data.google_client_config.default.access_token

  # master_auth is a list block; index it with brackets rather than the
  # legacy attribute-style `.0` index.
  cluster_ca_certificate = base64decode(
    google_container_cluster.cluster.master_auth[0].cluster_ca_certificate
  )
}

resource "google_project_iam_member" "cluster_sa_roles" {
for_each = var.cluster_sa_roles

project = var.project_id
role = each.value
member = "serviceAccount:${google_service_account.cluster_sa.email}"
}
4 changes: 1 addition & 3 deletions terraform/gcp/projects/meom-ige.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ core_node_machine_type = "g1-small"
# Single-tenant cluster, network policy not needed
enable_network_policy = false

# Single tenant cluster, so bucket access is provided via
# metadata concealment + node SA. Config Connector not needed.
config_connector_enabled = false
workload_identity_enabled_hubs = ["staging", "prod"]

notebook_nodes = {
"small" : {
Expand Down
31 changes: 12 additions & 19 deletions terraform/gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,25 +51,6 @@ variable "config_connector_enabled" {
EOT
}

variable "cluster_sa_roles" {
type = set(string)
default = [
"roles/logging.logWriter",
"roles/monitoring.metricWriter",
"roles/monitoring.viewer",
"roles/stackdriver.resourceMetadata.writer",
"roles/artifactregistry.reader"
]
description = <<-EOT
List of roles granted to the SA assumed by cluster nodes.
The defaults grant just enough access for the components on the node
to write metrics & logs to stackdriver, and pull images from artifact registry.
https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster
has more information.
EOT
}

variable "cd_sa_roles" {
type = set(string)
Expand Down Expand Up @@ -274,3 +255,15 @@ variable "max_cpu" {
Default = 1000
EOT
}

# Names of the hubs (Kubernetes namespaces) that get a dedicated Google Cloud
# Service Account and a matching Kubernetes Service Account.
variable "workload_identity_enabled_hubs" {
  type        = set(string)
  default     = []
  description = <<-EOT
  List of hubs that will get workload identity enabled.
  This should match individual namespaces that exist in the cluster. An
  appropriate Google Cloud Service Account will be created for *each* of these,
  and a Kubernetes Service Account will also be created.
  EOT

  # Each entry is used verbatim as a Kubernetes namespace, so reject anything
  # that cannot be a namespace name (an RFC 1123 DNS label) at plan time
  # instead of failing part-way through an apply.
  validation {
    condition = length([
      for hub in var.workload_identity_enabled_hubs : hub
      if length(hub) > 63 || !can(regex("^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", hub))
    ]) == 0
    error_message = "Each hub name must be a valid Kubernetes namespace name: at most 63 lowercase alphanumeric characters or '-', starting and ending with an alphanumeric character."
  }
}
63 changes: 63 additions & 0 deletions terraform/gcp/workload-identity.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# User pods need to authenticate to cloud APIs - particularly around storage.
# On GKE, Workload Identity (https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)
# is the canonical way to do this securely. A Google Cloud Service Account (GSA)
# is created and given appropriate rights, and then bound to a Kubernetes Service Account (KSA)
# via workload identity. All pods that then mount this kubernetes service account (named user-sa)
# get the cloud permissions assigned to the Google Cloud Service Account.
#
# Since each cluster can contain multiple hubs, we need to tell terraform which hubs we want
# to equip with the KSA that has cloud credentials. Terraform will create this Kubernetes
# Service Account in each hub's namespace; the namespace itself is not created in this file
# (see the FIXME on the kubernetes_service_account resource).

# One Google Cloud Service Account (GSA) per workload-identity enabled hub.
# NOTE(review): GCP limits account_id to 6-30 characters, so a long
# prefix / hub name combination may be rejected at apply time - confirm.
resource "google_service_account" "workload_sa" {
  for_each = var.workload_identity_enabled_hubs

  project      = var.project_id
  account_id   = "${var.prefix}-${each.key}-workload-sa"
  display_name = "Service account for user pods in hub ${each.key} in ${var.prefix}"
}

# Requester Pays GCS buckets can only be accessed by callers that hold the
# serviceusage.services.use permission. This custom role carries just that
# one permission, so it can be granted to the workload SAs and used from
# user pods. See https://cloud.google.com/storage/docs/requester-pays
# for more info.
resource "google_project_iam_custom_role" "workload_role" {
  project = var.project_id

  # Role ids can't contain "-", so swap them out for "_".
  role_id = replace("${var.prefix}_workload_sa_role", "-", "_")

  title       = "Identify as project role for users in ${var.prefix}"
  description = "Minimal role for hub users on ${var.prefix} to identify as current project"
  permissions = ["serviceusage.services.use"]
}

# Grant the custom workload role to each hub's workload service account.
resource "google_project_iam_member" "workload_binding" {
  for_each = var.workload_identity_enabled_hubs

  member  = "serviceAccount:${google_service_account.workload_sa[each.key].email}"
  role    = google_project_iam_custom_role.workload_role.name
  project = var.project_id
}

# Workload identity binding: let the `user-sa` Kubernetes Service Account in
# each hub's namespace act as that hub's Google Cloud Service Account.
resource "google_service_account_iam_binding" "workload_identity_binding" {
  for_each = var.workload_identity_enabled_hubs

  role               = "roles/iam.workloadIdentityUser"
  service_account_id = google_service_account.workload_sa[each.key].id
  members = [
    "serviceAccount:${var.project_id}.svc.id.goog[${each.key}/user-sa]"
  ]
}

# Kubernetes Service Account `user-sa`, one per hub namespace, annotated with
# the email of the Google Cloud Service Account created for that hub.
# FIXME: We might need to create the k8s namespace here some of the time, but then who is
# responsible for that - terraform or helm (via our deployer?)
resource "kubernetes_service_account" "workload_kubernetes_sa" {
  for_each = var.workload_identity_enabled_hubs

  metadata {
    namespace = each.key
    name      = "user-sa"
    annotations = {
      "iam.gke.io/gcp-service-account" = google_service_account.workload_sa[each.key].email
    }
  }
}

0 comments on commit 322227c

Please sign in to comment.