Minor docs and deployment script fixes (#20)
Signed-off-by: Apekshit Sharma <[email protected]>
apeksharma authored Jun 2, 2020
1 parent fe9a10b commit dbf4d3a
Showing 6 changed files with 17 additions and 11 deletions.
6 changes: 3 additions & 3 deletions docs/deployment.md
@@ -1,7 +1,7 @@
 # Deployment

 ## Requirements:
-1. BigQuery tables : transactions, errors, dedupe_state
+1. BigQuery tables : transactions, errors, dedupe_state, transaction_types
 1. PubSub topic for transactions
 1. GCS bucket : Used for dataflow templates, staging and as temp location
 1. ETL Pipeline from PubSub to BigQuery:
@@ -27,11 +27,11 @@ Resource creation can be automated using [setup-gcp-resources.sh](../scripts/set

 Use [deploy-etl-pipeline.sh](../scripts/deploy-etl-pipeline.sh) script to deploy the etl pipeline to GCP Dataflow.

-1. Deploy Deduplication task
+2. Deploy Deduplication task

 TODO

-1. Deploy Hedera Mirror Node Importer to publish transactions to the pubsub topic. See
+3. Deploy Hedera Mirror Node Importer to publish transactions to the pubsub topic. See
 Mirror Nodes [installation](https://github.com/hashgraph/hedera-mirror-node/blob/master/docs/installation.md) and
 [configuration](https://github.com/hashgraph/hedera-mirror-node/blob/master/docs/configuration.md#importer) for more
 details.
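For orientation, the requirements above boil down to a handful of bq/gcloud/gsutil calls. The sketch below is illustrative only: variable names follow scripts/common.sh, but the project, dataset, and schema-file names are assumptions, so defer to setup-gcp-resources.sh for the real steps.

```sh
#!/usr/bin/env bash
# Illustrative sketch; concrete values are assumptions, names follow scripts/common.sh.
set -euo pipefail

PROJECT_ID=my-gcp-project        # assumption: your GCP project
NAME=mainnet                     # DEPLOYMENT_NAME
DATASET_NAME=${NAME}_etl         # assumption

# 1. BigQuery dataset and the four required tables
bq mk --dataset "${PROJECT_ID}:${DATASET_NAME}"
bq mk --table "${PROJECT_ID}:${DATASET_NAME}.transaction_types" \
  hedera-etl-bigquery/src/main/resources/transaction-types-schema.json
# transactions, errors, and dedupe_state are created the same way from their schema files.

# 2. PubSub topic for transactions
gcloud pubsub topics create "${NAME}-transactions-topic" --project="${PROJECT_ID}"

# 3. GCS bucket for dataflow templates, staging, and temp location (name must be globally unique)
gsutil mb -p "${PROJECT_ID}" "gs://${PROJECT_ID}-${NAME}-pipelines"
```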
2 changes: 1 addition & 1 deletion DedupeProperties.java
@@ -41,7 +41,7 @@ public class DedupeProperties {
     private String transactionsTableName = "transactions";

     @NotBlank
-    private String stateTableName = "state";
+    private String stateTableName = "dedupe_state";

     private boolean metricsEnabled = false;

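If DedupeProperties is bound as a Spring Boot @ConfigurationProperties bean (the binding and its property prefix are not visible in this hunk, so the key below is purely hypothetical), the renamed default could still be overridden at launch:

```sh
# Hypothetical override; the "hedera.dedupe" prefix and the jar name are assumptions.
java -jar hedera-dedupe-bigquery.jar --hedera.dedupe.state-table-name=my_state_table
```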
8 changes: 6 additions & 2 deletions scripts/common.sh
@@ -19,8 +19,8 @@ NAME=${DEPLOYMENT_NAME}

 : ${KEYS_DIR:=`pwd`/${NAME}-keys}

-: ${BUCKET_PIPELINES:=gs://${NAME}-hedera-etl-pipelines}
-: ${BUCKET_ETL_GCS:=gs://${NAME}-transactions}
+: ${BUCKET_PIPELINES:=gs://${PROJECT_ID}-${NAME}-pipelines} # Should be globally unique
+: ${BUCKET_ETL_GCS:=gs://${PROJECT_ID}-${NAME}-transactions}

 : ${PUBSUB_TOPIC_NAME:=${NAME}-transactions-topic}
 : ${PUBSUB_SUBSCRIPTION_ETL_BIGQUERY:=${NAME}-etl-bigquery}
@@ -35,3 +35,7 @@ NAME=${DEPLOYMENT_NAME}
 : ${SA_ETL_GCS:=${NAME}-etl-gcs}
 : ${SA_DEDUPLICATION:=${NAME}-deduplication-bigquery}
 : ${SA_IMPORTER:=${NAME}-importer}
+
+NOW=`date +"%Y%m%d-%H%M%S%z"`
+: ${JOB_NAME_ETL_BIGQUERY:=${NAME}-etl-bigquery-${NOW}}
+: ${JOB_NAME_ETL_GCS:=${NAME}-etl-gcs-${NOW}}
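The `: ${VAR:=default}` idiom used throughout common.sh assigns the default only when the variable is unset, so the new JOB_NAME_* variables, like everything else in the file, can be overridden from the environment. A quick illustration (values are hypothetical):

```sh
# Defaults apply when nothing is exported first; DEPLOYMENT_NAME is required.
export DEPLOYMENT_NAME=mainnet
source scripts/common.sh
echo "${JOB_NAME_ETL_BIGQUERY}"   # e.g. mainnet-etl-bigquery-20200602-101530+0000

# An exported value wins over the default:
export JOB_NAME_ETL_BIGQUERY=mainnet-etl-bigquery-manual
source scripts/common.sh          # JOB_NAME_ETL_BIGQUERY keeps the exported name
```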
2 changes: 1 addition & 1 deletion scripts/create-tables.sh
@@ -35,7 +35,7 @@ bq mk \
   ${BQ_TRANSACTION_TYPES_TABLE} \
   ${SCRIPT_DIR}/../hedera-etl-bigquery/src/main/resources/transaction-types-schema.json

-echo "INSERT INTO ${PROJECT_ID}.${DATASET}.transaction_types (id, name) VALUES \
+echo "INSERT INTO ${PROJECT_ID}.${DATASET_NAME}.transaction_types (id, name) VALUES \
 (7, 'CONTRACTCALL'), \
 (8, 'CONTRACTCREATEINSTANCE'), \
 (9, 'CONTRACTUPDATEINSTANCE'), \
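The visible hunk only echoes the INSERT statement; assuming it is meant to be executed by the operator, one way to run it with the bq CLI (standard SQL; names as in scripts/common.sh) would be:

```sh
# Sketch: execute the transaction_types seed INSERT that the script prints.
# Assumes PROJECT_ID and DATASET_NAME are set as in scripts/common.sh.
bq query --project_id="${PROJECT_ID}" --use_legacy_sql=false \
  "INSERT INTO ${DATASET_NAME}.transaction_types (id, name) VALUES (7, 'CONTRACTCALL'), (8, 'CONTRACTCREATEINSTANCE')"
```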
8 changes: 5 additions & 3 deletions scripts/deploy-etl-pipeline.sh
@@ -41,9 +41,10 @@ mvn clean compile exec:java \
echo "Staring ETL BigQuery on Dataflow"

SUBSCRIPTION="projects/${PROJECT_ID}/subscriptions/${PUBSUB_SUBSCRIPTION_ETL_BIGQUERY}"
gcloud dataflow jobs run etl-bigquery-`date +"%Y%m%d-%H%M%S%z"` \
gcloud dataflow jobs run ${JOB_NAME_ETL_BIGQUERY} \
--gcs-location=${PIPELINE_FOLDER}/template \
--service-account-email=${SA_ETL_BIGQUERY}@${PROJECT_ID}.iam.gserviceaccount.com \
--project=${PROJECT_ID} \
--parameters \
inputSubscription=${SUBSCRIPTION},\
outputTransactionsTable=${BQ_TRANSACTIONS_TABLE},\
@@ -52,12 +53,13 @@ outputErrorsTable=${BQ_ERRORS_TABLE}
if [[ "${ETL_TO_GCS}" == "true" ]]; then
TEMPLATE_LOCATION=gs://dataflow-templates/2020-03-31-01_RC00/Cloud_PubSub_to_GCS_Text
TOPIC="projects/${PROJECT_ID}/topics/${PUBSUB_TOPIC_NAME}"
gcloud dataflow jobs run etl-gcs-`date +"%Y%m%d-%H%M%S%z"` \
gcloud dataflow jobs run ${JOB_NAME_ETL_GCS} \
--gcs-location=${TEMPLATE_LOCATION} \
--service-account-email=${SA_ETL_GCS}@${PROJECT_ID}.iam.gserviceaccount.com \
--project=${PROJECT_ID} \
--parameters \
inputTopic=${TOPIC},\
outputDirectory=${BUCKET_ETL_GCS}/,\
outputFilenamePrefix=transactions-,\
outputFilenameSuffix=.txt
fi
fi
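Since the job names now come from scripts/common.sh, a run can pin them explicitly instead of taking the timestamped defaults. A hedged usage sketch, assuming deploy-etl-pipeline.sh sources common.sh; every value below is an example:

```sh
# Example invocation; all values here are assumptions.
export DEPLOYMENT_NAME=mainnet
export PROJECT_ID=my-gcp-project
export JOB_NAME_ETL_BIGQUERY=mainnet-etl-bigquery-manual
export ETL_TO_GCS=true        # also launch the PubSub-to-GCS job
./scripts/deploy-etl-pipeline.sh
```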
2 changes: 1 addition & 1 deletion scripts/setup-gcp-resources.sh
@@ -98,4 +98,4 @@ if [[ "${ETL_TO_GCS}" == "true" ]]; then
 ${SA_ETL_GCS} \
 "roles/dataflow.worker roles/pubsub.editor roles/storage.admin" \
 "For pubsub --> GCS dataflow controller"
-fi
+fi
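The lines above are arguments to a helper (not shown in this hunk) that apparently creates a service account and grants the listed roles. In raw gcloud terms, the equivalent might look like this sketch; the helper's exact semantics are an assumption:

```sh
# Sketch of what the helper appears to do; semantics are assumed, not confirmed.
SA_NAME="${SA_ETL_GCS}"
gcloud iam service-accounts create "${SA_NAME}" --project="${PROJECT_ID}" \
  --display-name="For pubsub --> GCS dataflow controller"
for ROLE in roles/dataflow.worker roles/pubsub.editor roles/storage.admin; do
  gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" \
    --role="${ROLE}"
done
```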
