diff --git a/manifests/poc-data-cluster/prod/helium/mobile-rewards-share-delta-lake-sink.yaml b/manifests/poc-data-cluster/prod/helium/mobile-rewards-share-delta-lake-sink.yaml
new file mode 100755
index 00000000..0e2843f3
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/helium/mobile-rewards-share-delta-lake-sink.yaml
@@ -0,0 +1,66 @@
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: mobile-rewards-share-delta-lake-sink
+  namespace: helium
+spec:
+  concurrencyPolicy: Forbid
+  schedule: "10 */4 * * *"
+  jobTemplate:
+    spec:
+      backoffLimit: 10
+      template:
+        spec:
+          serviceAccountName: s3-data-lake-bucket-access
+          tolerations: # Schedule executor pods on spot instance group
+            - key: dedicated
+              operator: Equal
+              value: spark
+              effect: NoSchedule
+          nodeSelector:
+            nodegroup-type: spot
+          containers:
+            - name: mobile-rewards-delta-lake-sink
+              image: public.ecr.aws/k0m1p4t7/protobuf-delta-lake-sink:0.0.10
+              imagePullPolicy: IfNotPresent
+              resources:
+                requests:
+                  cpu: 1000m
+                  memory: 6900Mi
+                limits:
+                  memory: 6900Mi
+              env:
+                - name: AWS_S3_ALLOW_UNSAFE_RENAME
+                  value: "true"
+              args:
+                - --source-bucket
+                - foundation-poc-data-requester-pays
+                - --source-region
+                - us-west-2
+                - --file-prefix
+                - foundation-iot-verified-rewards/mobile_reward_share
+                - --source-proto-name
+                - "mobile_reward_share"
+                - --source-proto-base-url
+                - https://raw.githubusercontent.com/helium/proto/master/src
+                - --source-protos
+                - data_rate.proto
+                - --source-protos
+                - service/packet_verifier.proto
+                - --source-protos
+                - service/poc_mobile.proto
+                - --source-protos
+                - region.proto
+                - --target-bucket
+                - foundation-data-lake-requester-pays
+                - --target-table
+                - bronze/mobile_reward_share
+                - --target-region
+                - us-west-2
+                - --partition-timestamp-column
+                - start_period
+                - --partition-timestamp-date-divisor
+                - "86400"
+                - --batch-size
+                - "500000000" # Targeting 500mb parquet files, per databricks recs on large tables
+          restartPolicy: OnFailure
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/iot-data-reward-totals-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-data-reward-totals-silver.yaml
new file mode 100644
index 00000000..97ef20db
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/iot-data-reward-totals-silver.yaml
@@ -0,0 +1,84 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: iot-data-reward-totals-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date, sum(dc_transfer_amount_iot) AS data_iot_total
+    FROM iot_reward_share
+    GROUP BY date
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: iot-data-reward-totals-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol
+      configMap:
+        name: iot-data-reward-totals-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_REWARD_SHARE: s3a://foundation-data-lake-requester-pays/silver/iot-reward-share
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-data-reward-totals
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-data-reward-totals
+      QUERY_PATH: /app/query.sql
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations: # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/iot-netid-counts-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-netid-counts-silver.yaml
new file mode 100644
index 00000000..2849bd62
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/iot-netid-counts-silver.yaml
@@ -0,0 +1,85 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: iot-netid-counts-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date, net_id, count(net_id) AS count
+    FROM iot_packets
+    GROUP BY date, net_id
+    ORDER BY net_id DESC
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: iot-netid-counts-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol
+      configMap:
+        name: iot-netid-counts-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-netid-counts
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-netid-counts
+      QUERY_PATH: /app/query.sql
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations: # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/iot-oui-counts-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-oui-counts-silver.yaml
new file mode 100644
index 00000000..ddc81d53
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/iot-oui-counts-silver.yaml
@@ -0,0 +1,91 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: iot-oui-counts-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date, oui, count(oui) AS count
+    FROM iot_packets
+    WHERE date = date_add(current_date(), -1)
+    GROUP BY date, oui
+    ORDER BY oui DESC;
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: ScheduledSparkApplication
+metadata:
+  name: iot-oui-counts-silver
+  namespace: spark
+spec:
+  schedule: "@daily"
+  concurrencyPolicy: Allow
+  successfulRunHistoryLimit: 1
+  failedRunHistoryLimit: 3
+  template:
+    type: Scala
+    mode: cluster
+    image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+    imagePullPolicy: Always
+    mainClass: Main
+    mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+    sparkVersion: "3.4.0"
+    restartPolicy:
+      type: OnFailure
+      onFailureRetries: 3
+      onFailureRetryInterval: 10
+      onSubmissionFailureRetries: 3
+      onSubmissionFailureRetryInterval: 10
+    sparkConf:
+      spark.databricks.delta.autoCompact.enabled: "true"
+    hadoopConf:
+      fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+    volumes:
+      - name: "tmp"
+        hostPath:
+          path: "/tmp"
+          type: Directory
+      - name: config-vol
+        configMap:
+          name: iot-oui-counts-silver-query
+          items:
+            - key: query.sql
+              path: query.sql
+    driver:
+      serviceAccount: spark-data-lake-access
+      cores: 1
+      coreLimit: "1200m"
+      memory: "512m"
+      nodeSelector:
+        node.kubernetes.io/instance-type: m5.large
+      envVars:
+        TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+        PARTITION_BY: "date"
+        CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-oui-counts
+        OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-oui-counts
+        QUERY_PATH: /app/query.sql
+      labels:
+        version: 3.4.0
+      volumeMounts:
+        - name: "tmp"
+          mountPath: "/tmp"
+        - name: config-vol
+          mountPath: /app
+    executor:
+      serviceAccount: spark-data-lake-access
+      cores: 1
+      coreLimit: "1200m"
+      instances: 3
+      memory: "10G"
+      tolerations: # Schedule executor pods on spot instance group
+        - key: dedicated
+          operator: Equal
+          value: spark
+          effect: NoSchedule
+      nodeSelector:
+        nodegroup-type: spot
+      labels:
+        version: 3.4.0
+      volumeMounts:
+        - name: "tmp"
+          mountPath: "/tmp"
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/iot-packet-counts-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-packet-counts-silver.yaml
new file mode 100644
index 00000000..6ebe2d20
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/iot-packet-counts-silver.yaml
@@ -0,0 +1,84 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: iot-packet-counts-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date, count(payload_hash) AS count
+    FROM iot_packets
+    GROUP BY date
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: iot-packet-counts-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol
+      configMap:
+        name: iot-packet-counts-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-packet-counts
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-packet-counts
+      QUERY_PATH: /app/query.sql
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations: # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/iot-payload-totals-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-payload-totals-silver.yaml
new file mode 100644
index 00000000..5119d67e
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/iot-payload-totals-silver.yaml
@@ -0,0 +1,84 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: iot-payload-totals-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date, sum(payload_size) AS payload_total
+    FROM iot_packets
+    GROUP BY date
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: iot-payload-totals-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol
+      configMap:
+        name: iot-payload-totals-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-payload-totals
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-payload-totals
+      QUERY_PATH: /app/query.sql
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations: # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/iot-region-counts-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-region-counts-silver.yaml
new file mode 100644
index 00000000..efac80a3
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/iot-region-counts-silver.yaml
@@ -0,0 +1,85 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: iot-region-counts-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date, region, count(region) AS count
+    FROM iot_packets
+    GROUP BY date, region
+    ORDER BY region DESC
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: iot-region-counts-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol
+      configMap:
+        name: iot-region-counts-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-region-counts
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-region-counts
+      QUERY_PATH: /app/query.sql
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations: # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
\ No newline at end of file
diff --git a/manifests/poc-data-cluster/prod/spark/mobile-rewards-silver.yaml b/manifests/poc-data-cluster/prod/spark/mobile-rewards-silver.yaml
new file mode 100644
index 00000000..c114851d
--- /dev/null
+++ b/manifests/poc-data-cluster/prod/spark/mobile-rewards-silver.yaml
@@ -0,0 +1,87 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: mobile-rewards-silver-query
+  namespace: spark
+data:
+  query.sql: |
+    SELECT
+      date,
+      start_period,
+      end_period,
+      b58encodeChecked(gateway_reward.hotspot_key) as gateway,
+      gateway_reward.dc_transfer_reward as dc_transfer_amount_mobile
+    FROM mobile_reward_share
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: mobile-rewards-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol
+      configMap:
+        name: mobile-rewards-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_MOBILE_REWARD_SHARE: s3a://foundation-data-lake-requester-pays/bronze/mobile_reward_share
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/mobile-reward-share
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/mobile-reward-share
+      QUERY_PATH: /app/query.sql
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 2
+    memory: "10G"
+    tolerations: # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: 3.4.0
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
\ No newline at end of file