diff --git a/.azure-pipelines/workflows/periodic_reporting.yml b/.azure-pipelines/workflows/periodic_reporting.yml
deleted file mode 100644
index 1bcf54fa..00000000
--- a/.azure-pipelines/workflows/periodic_reporting.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-trigger: none
-
-variables:
- MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository
- MAVEN_OPTS: '-ntp -B -Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)'
-
-stages:
-# Build LST-Bench and create artifact to deploy to target VM
-- stage: build
- jobs:
- - job: Build
- pool:
- vmImage: 'ubuntu-latest'
- steps:
- - task: Cache@2
- displayName: Cache Maven local repo
- inputs:
- key: 'maven | "$(Agent.OS)" | **/pom.xml'
- restoreKeys: |
- maven | "$(Agent.OS)"
- maven
- path: $(MAVEN_CACHE_FOLDER)
- - task: Maven@4
- inputs:
- mavenPomFile: 'pom.xml'
- options: $(MAVEN_OPTS)
- javaHomeOption: 'JDKVersion'
- jdkVersionOption: '1.11'
- publishJUnitResults: true
- testResultsFiles: '**/surefire-reports/TEST-*.xml'
- goals: 'package -DskipTests -Pspark-jdbc'
- - task: CopyFiles@2
- displayName: 'Copy Artifacts to: $(TargetFolder)'
- inputs:
- SourceFolder: '$(Build.SourcesDirectory)'
- Contents: |
- launcher.sh
- target/**/*
- TargetFolder: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/'
- - upload: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/'
- artifact: drop
- - deployment: VMDeploy
- displayName: 'Deploying LST-Bench'
- dependsOn: Build
- environment:
- name: 'lst-bench-periodic-reporting'
- resourceType: VirtualMachine
- tags: 'client'
- strategy:
- runOnce:
- deploy:
- steps:
- - script: echo my first deployment
diff --git a/run/README.md b/run/README.md
new file mode 100644
index 00000000..14e74f6c
--- /dev/null
+++ b/run/README.md
@@ -0,0 +1,46 @@
+
+
+# LST-Bench: Configurations and Results
+This folder contains the configurations used to run LST-Bench on various systems, along with details about the setups used to generate the results shown in the [LST-Bench dashboard](/metrics/app).
+
+## Systems Included
+- [x] Apache Spark 3.3.1
+ - [x] Delta Lake 2.2.0
+ - [x] Apache Hudi 0.12.2
+ - [x] Apache Iceberg 1.1.0
+- [ ] Trino 420
+ - [ ] Delta Lake
+ - [ ] Apache Iceberg
+
+## Folder Structure
+While the folders for different engines may vary slightly in structure, they generally contain the following:
+
+- `scripts/`:
+ This directory contains SQL files used to execute LST-Bench workloads on the respective engine.
+ These SQL files may vary slightly across engines and LSTs depending on the supported SQL dialect.
+- `config/`:
+ This directory houses LST-Bench configuration files required to execute the workload.
+ It includes the LST-Bench phase/session/task libraries that reference the aforementioned SQL scripts (see the example invocation below).
+- Additional infrastructure and configuration automation folders, e.g., `azure-pipelines/`:
+ These folders contain scripts or files facilitating automation for running the benchmark on a specific infrastructure/engine.
+ For instance, Azure Pipelines scripts to deploy an engine with different LSTs and execute LST-Bench.
+ Generally, these folders should include an additional README.md file offering further details.
+- `results/`:
+ This folder stores the results of the LST-Bench runs, as captured by LST-Bench's DuckDB-backed telemetry.
+ These results are processed and visualized in the [LST-Bench dashboard](/metrics/app).
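+
+## Example Invocation
+The sketch below (assuming a built LST-Bench distribution and the Spark 3.3.1 files in this repository) shows how the pieces above fit together: `launcher.sh` is pointed at a connections file, an experiment file, a telemetry file, the task library, and a workload definition.
+
+```bash
+# Minimal sketch: run the WP1 longevity workload on Spark 3.3.1 with Delta 2.2.0.
+# The ${...} placeholders in the config files (EXP_NAME, EXP_SCALE_FACTOR,
+# DATA_STORAGE_ACCOUNT, ...) are resolved from the environment, so they need to
+# be exported first, as the Azure Pipelines jobs in this repository do.
+export EXP_NAME="wp1_longevity"
+./launcher.sh \
+  -c run/spark-3.3.1/azure-pipelines/config/connections_config.yaml \
+  -e run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml \
+  -t run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml \
+  -l run/spark-3.3.1/config/tpcds/library.yaml \
+  -w run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml
+```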
diff --git a/run/spark-3.3.1/azure-pipelines/README.md b/run/spark-3.3.1/azure-pipelines/README.md
new file mode 100644
index 00000000..4488a12e
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/README.md
@@ -0,0 +1,50 @@
+
+
+# Azure Pipelines Deployment for LST-Bench on Apache Spark 3.3.1
+This directory contains the tooling needed to execute LST-Bench on Apache Spark 3.3.1 with different LSTs using Azure Pipelines. It consists of:
+- `run-lst-bench.yml`:
+ An Azure Pipelines script designed to deploy Apache Spark with various LSTs and execute LST-Bench.
+- `sh/`:
+ A directory containing shell scripts and engine configuration files supporting the deployment of Spark with different LSTs and the execution of experiments.
+- `config/`:
+ A directory with the LST-Bench configuration files needed to execute the experiments that produced the reported results.
+
+## Prerequisites
+- Automation for deploying the Azure infrastructure to run LST-Bench is not implemented. As a result, the Azure Pipelines script expects the following setup to exist:
+ - A VM named 'lst-bench-client' connected to the pipeline environment to run the LST-Bench client.
+ - A VM named 'lst-bench-head' to run the head node of the Spark cluster, also connected to the pipeline environment.
+ - A VMSS cluster that will serve as the Spark worker nodes, deployed within the same VNet as the head node.
+ - An Azure Storage Account accessible by both the VMSS and head node.
+ - An Azure SQL Database (or a SQL Server-flavored RDBMS) that will back the Hive Metastore.
+ The Hive Metastore schema for version 2.3.0 should already be installed in the instance.
+- Prior to running the pipeline, several variables need to be defined in your Azure Pipelines setup (the pipeline passes them to the deployment scripts, as sketched after this list):
+ - `data_storage_account`: Name of the Azure Blob Storage account where the source data for the experiment is stored.
+ - `data_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the source data for the experiment is stored.
+ - `hms_jdbc_driver`: JDBC driver for the Hive Metastore.
+ - `hms_jdbc_url`: JDBC URL for the Hive Metastore.
+ - `hms_jdbc_user`: Username for the Hive Metastore.
+ - `hms_jdbc_password` (secret): Password for the Hive Metastore.
+ - `hms_storage_account`: Name of the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog (can be the same as the data_storage_account).
+ - `hms_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog.
+ - `hms_storage_account_container`: Name of the container in the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog.
+- The LST versions and configurations to run can be modified via pipeline input parameters, either in the Azure Pipelines YAML file or from the Web UI.
+ Default values are assigned to these parameters.
+ Parameters also include the experiment scale factor, machine type, and cluster size.
+ Note that these parameters are not used to deploy the data or the infrastructure, as this process is not automated in the pipeline.
+ Instead, they are recorded in the experiment telemetry so that results can be properly categorized and visualized later on.
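+
+For orientation, the snippet below condenses how the `EngineDeploy` and `StartEngine` jobs in `run-lst-bench.yml` invoke the scripts in `sh/` on the head node; the `$(...)` macros are the pipeline variables listed above, expanded by Azure Pipelines before the script runs.
+
+```bash
+# Condensed sketch of the head-node deployment steps in run-lst-bench.yml; the
+# $(...) macros are Azure Pipelines variables, expanded before bash runs the script.
+cd ~/spark-3.3.1
+spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
+./init.sh "${spark_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)"
+./hms.sh "$(hms_jdbc_driver)" "$(hms_jdbc_url)" "$(hms_jdbc_user)" "$(hms_jdbc_password)" \
+         "$(hms_storage_account)" "$(hms_storage_account_shared_key)" "$(hms_storage_account_container)"
+./dist-setup.sh
+./dist-exec.sh spark-3.3.1 init.sh "${spark_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)"
+# Later, each experiment stage restarts the cluster for the LST under test,
+# where <table_format> is one of delta, iceberg, or hudi:
+./stop-cluster.sh && ./start-cluster.sh <table_format>
+```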
diff --git a/run/spark-3.3.1/azure-pipelines/config/connections_config.yaml b/run/spark-3.3.1/azure-pipelines/config/connections_config.yaml
new file mode 100644
index 00000000..63128856
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/connections_config.yaml
@@ -0,0 +1,7 @@
+# Description: Connections Configuration
+---
+version: 1
+connections:
+- id: spark_0
+ driver: org.apache.hive.jdbc.HiveDriver
+ url: jdbc:hive2://${SPARK_MASTER_HOST}:10000
diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml
new file mode 100644
index 00000000..3fd39e23
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml
@@ -0,0 +1,29 @@
+# Description: Experiment Configuration
+---
+version: 1
+id: "${EXP_NAME}"
+repetitions: 1
+# Metadata accepts any key-value that we want to register together with the experiment run.
+metadata:
+ system: spark
+ system_version: 3.3.1
+ table_format: delta
+ table_format_version: 2.2.0
+ scale_factor: "${EXP_SCALE_FACTOR}"
+ mode: cow
+ machine: "${EXP_MACHINE}"
+ cluster_size: "${EXP_CLUSTER_SIZE}"
+# The following parameter values will be used to replace the variables in the workload statements.
+parameter_values:
+ external_catalog: spark_catalog
+ external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
+ external_table_format: csv
+ external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
+ external_options_suffix: ',header="true"'
+ external_tblproperties_suffix: ''
+ catalog: spark_catalog
+ database: "${EXP_NAME}"
+ table_format: delta
+ data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/delta/sf_${EXP_SCALE_FACTOR}/'
+ options_suffix: ''
+ tblproperties_suffix: ''
diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-hudi-0.12.2.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-hudi-0.12.2.yaml
new file mode 100644
index 00000000..3c8df376
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-hudi-0.12.2.yaml
@@ -0,0 +1,29 @@
+# Description: Experiment Configuration
+---
+version: 1
+id: "${EXP_NAME}"
+repetitions: 1
+# Metadata accepts any key-value that we want to register together with the experiment run.
+metadata:
+ system: spark
+ system_version: 3.3.1
+ table_format: hudi
+ table_format_version: 0.12.2
+ scale_factor: "${EXP_SCALE_FACTOR}"
+ mode: cow
+ machine: "${EXP_MACHINE}"
+ cluster_size: "${EXP_CLUSTER_SIZE}"
+# The following parameter values will be used to replace the variables in the workload statements.
+parameter_values:
+ external_catalog: spark_catalog
+ external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
+ external_table_format: csv
+ external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
+ external_options_suffix: ',header="true"'
+ external_tblproperties_suffix: ''
+ catalog: spark_catalog
+ database: "${EXP_NAME}"
+ table_format: hudi
+ data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/hudi/sf_${EXP_SCALE_FACTOR}/'
+ options_suffix: ''
+ tblproperties_suffix: ', "type"="cow"'
diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-iceberg-1.1.0.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..506f40c7
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-iceberg-1.1.0.yaml
@@ -0,0 +1,29 @@
+# Description: Experiment Configuration
+---
+version: 1
+id: "${EXP_NAME}"
+repetitions: 1
+# Metadata accepts any key-value that we want to register together with the experiment run.
+metadata:
+ system: spark
+ system_version: 3.3.1
+ table_format: iceberg
+ table_format_version: 1.1.0
+ scale_factor: "${EXP_SCALE_FACTOR}"
+ mode: cow
+ machine: "${EXP_MACHINE}"
+ cluster_size: "${EXP_CLUSTER_SIZE}"
+# The following parameter values will be used to replace the variables in the workload statements.
+parameter_values:
+ external_catalog: spark_catalog
+ external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
+ external_table_format: csv
+ external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
+ external_options_suffix: ',header="true"'
+ external_tblproperties_suffix: ''
+ catalog: spark_catalog
+ database: "${EXP_NAME}"
+ table_format: iceberg
+ data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/iceberg/sf_${EXP_SCALE_FACTOR}/'
+ options_suffix: ''
+ tblproperties_suffix: ', "format-version"="2", "write.delete.mode"="copy-on-write", "write.update.mode"="copy-on-write", "write.merge.mode"="copy-on-write"'
diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-hudi-0.12.2.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-hudi-0.12.2.yaml
new file mode 100644
index 00000000..cbc82720
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-hudi-0.12.2.yaml
@@ -0,0 +1,29 @@
+# Description: Experiment Configuration
+---
+version: 1
+id: "${EXP_NAME}"
+repetitions: 1
+# Metadata accepts any key-value that we want to register together with the experiment run.
+metadata:
+ system: spark
+ system_version: 3.3.1
+ table_format: hudi
+ table_format_version: 0.12.2
+ scale_factor: "${EXP_SCALE_FACTOR}"
+ mode: mor
+ machine: "${EXP_MACHINE}"
+ cluster_size: "${EXP_CLUSTER_SIZE}"
+# The following parameter values will be used to replace the variables in the workload statements.
+parameter_values:
+ external_catalog: spark_catalog
+ external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
+ external_table_format: csv
+ external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
+ external_options_suffix: ',header="true"'
+ external_tblproperties_suffix: ''
+ catalog: spark_catalog
+ database: "${EXP_NAME}"
+ table_format: hudi
+ data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/hudi/sf_${EXP_SCALE_FACTOR}/'
+ options_suffix: ''
+ tblproperties_suffix: ', "type"="mor"'
diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-iceberg-1.1.0.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..2b916227
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-iceberg-1.1.0.yaml
@@ -0,0 +1,29 @@
+# Description: Experiment Configuration
+---
+version: 1
+id: "${EXP_NAME}"
+repetitions: 1
+# Metadata accepts any key-value that we want to register together with the experiment run.
+metadata:
+ system: spark
+ system_version: 3.3.1
+ table_format: iceberg
+ table_format_version: 1.1.0
+ scale_factor: "${EXP_SCALE_FACTOR}"
+ mode: mor
+ machine: "${EXP_MACHINE}"
+ cluster_size: "${EXP_CLUSTER_SIZE}"
+# The following parameter values will be used to replace the variables in the workload statements.
+parameter_values:
+ external_catalog: spark_catalog
+ external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
+ external_table_format: csv
+ external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
+ external_options_suffix: ',header="true"'
+ external_tblproperties_suffix: ''
+ catalog: spark_catalog
+ database: "${EXP_NAME}"
+ table_format: iceberg
+ data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/iceberg/sf_${EXP_SCALE_FACTOR}/'
+ options_suffix: ''
+ tblproperties_suffix: ', "format-version"="2", "write.delete.mode"="merge-on-read", "write.update.mode"="merge-on-read", "write.merge.mode"="merge-on-read"'
diff --git a/run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml b/run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml
new file mode 100644
index 00000000..a4907102
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml
@@ -0,0 +1,20 @@
+# Description: Experiment Configuration
+---
+version: 1
+id: setup_experiment
+repetitions: 1
+# Metadata accepts any key-value that we want to register together with the experiment run.
+metadata:
+ system: spark
+ system_version: 3.3.1
+ scale_factor: "${EXP_SCALE_FACTOR}"
+ machine: "${EXP_MACHINE}"
+ cluster_size: "${EXP_CLUSTER_SIZE}"
+# The following parameter values will be used to replace the variables in the workload statements.
+parameter_values:
+ external_catalog: spark_catalog
+ external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}"
+ external_table_format: csv
+ external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/"
+ external_options_suffix: ',header="true"'
+ external_tblproperties_suffix: ''
diff --git a/run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml b/run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml
new file mode 100644
index 00000000..6e5f3400
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml
@@ -0,0 +1,13 @@
+# Description: Telemetry Configuration
+---
+version: 1
+connection:
+ id: duckdb_0
+ driver: org.duckdb.DuckDBDriver
+ url: jdbc:duckdb:./telemetry-spark-3.3.1
+execute_ddl: true
+ddl_file: 'src/main/resources/scripts/logging/duckdb/ddl.sql'
+insert_file: 'src/main/resources/scripts/logging/duckdb/insert.sql'
+# The following parameter values will be used to replace the variables in the logging statements.
+parameter_values:
+ data_path: ''
\ No newline at end of file
diff --git a/run/spark-3.3.1/azure-pipelines/run-lst-bench.yml b/run/spark-3.3.1/azure-pipelines/run-lst-bench.yml
new file mode 100644
index 00000000..1d63227e
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/run-lst-bench.yml
@@ -0,0 +1,297 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trigger: none
+
+parameters:
+- name: lsts
+ type: object
+ default:
+ - table_format: "delta"
+ version: "2.2.0"
+ mode: "cow"
+ - table_format: "iceberg"
+ version: "1.1.0"
+ mode: "cow"
+ - table_format: "iceberg"
+ version: "1.1.0"
+ mode: "mor"
+ - table_format: "hudi"
+ version: "0.12.2"
+ mode: "cow"
+ - table_format: "hudi"
+ version: "0.12.2"
+ mode: "mor"
+- name: workloads
+ type: object
+ default:
+ - "wp1_longevity"
+ - "wp2_resilience"
+ - "wp3_rw_concurrency"
+ - "wp4_time_travel"
+- name: exp_scale_factor
+ type: number
+ default: 100
+- name: exp_machine
+ type: string
+ default: "Standard_E8s_v5"
+- name: exp_cluster_size
+ type: number
+ default: 8
+
+variables:
+ MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository
+ MAVEN_OPTS: '-ntp -B -Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)'
+ EXP_SCALE_FACTOR: ${{ parameters.exp_scale_factor }}
+ EXP_MACHINE: ${{ parameters.exp_machine }}
+ EXP_CLUSTER_SIZE: ${{ parameters.exp_cluster_size }}
+
+stages:
+# Build LST-Bench and create artifact to deploy to target VM
+- stage: build
+ jobs:
+ - job: Build
+ pool:
+ vmImage: 'ubuntu-latest'
+ steps:
+ - task: Cache@2
+ displayName: Cache Maven local repo
+ inputs:
+ key: 'maven | "$(Agent.OS)" | **/pom.xml'
+ restoreKeys: |
+ maven | "$(Agent.OS)"
+ maven
+ path: $(MAVEN_CACHE_FOLDER)
+ - task: Maven@4
+ inputs:
+ mavenPomFile: 'pom.xml'
+ options: $(MAVEN_OPTS)
+ javaHomeOption: 'JDKVersion'
+ jdkVersionOption: '1.11'
+ publishJUnitResults: false
+ goals: 'package -DskipTests -Pspark-jdbc'
+ - task: CopyFiles@2
+ displayName: 'Copy Artifacts to: $(TargetFolder)'
+ inputs:
+ SourceFolder: '$(Build.SourcesDirectory)'
+ TargetFolder: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/'
+ - task: PublishPipelineArtifact@1
+ inputs:
+ targetPath: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/'
+ artifact: lst-bench-0.1-SNAPSHOT
+
+# Set up engine and deploy LST-Bench
+- stage: deploy
+ jobs:
+ - deployment: EngineDeploy
+ displayName: 'Deploying engine'
+ workspace:
+ clean: all
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - bash: |
+ echo 'Deploy engine'
+ mkdir -p ~/spark-3.3.1
+ cp $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/run/spark-3.3.1/azure-pipelines/sh/* ~/spark-3.3.1/
+ cd ~/spark-3.3.1
+ chmod +x ./*
+ spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
+ ./init.sh "${spark_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)"
+ ./hms.sh "$(hms_jdbc_driver)" "$(hms_jdbc_url)" "$(hms_jdbc_user)" "$(hms_jdbc_password)" "$(hms_storage_account)" "$(hms_storage_account_shared_key)" "$(hms_storage_account_container)"
+ ./dist-setup.sh
+ ./dist-exec.sh spark-3.3.1 init.sh "${spark_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)"
+ - deployment: ClientDeploy
+ displayName: 'Deploying LST-Bench client'
+ workspace:
+ clean: all
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-client'
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - bash: |
+ echo 'Deploy LST-Bench client'
+ sudo apt install -y openjdk-11-jdk
+ mkdir -p ~/lst-bench-0.1-SNAPSHOT
+ cp -rf $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/* ~/lst-bench-0.1-SNAPSHOT/
+ chmod +x ~/lst-bench-0.1-SNAPSHOT/launcher.sh
+
+# Run LST-Bench (setup external tables)
+- stage: setup_experiment
+ jobs:
+ - deployment: StartEngine
+ displayName: "Starting Engine"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ variables:
+ process.clean: false
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/spark-3.3.1
+ ./stop-cluster.sh && ./start-cluster.sh
+ sleep 10
+ spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
+ echo "##vso[task.setvariable variable=spark_head_node;isOutput=true]${spark_head_node}"
+ name: engine_start_step
+ - deployment: RunSetupExperiment
+ dependsOn: StartEngine
+ displayName: "Setup Experiment"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-client'
+ variables:
+ spark_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.spark_head_node'] ]
+ timeoutInMinutes: 0
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/lst-bench-0.1-SNAPSHOT
+ ./launcher.sh -c run/spark-3.3.1/azure-pipelines/config/connections_config.yaml \
+ -e run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml \
+ -t run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml \
+ -l run/spark-3.3.1/config/tpcds/library.yaml \
+ -w run/spark-3.3.1/config/tpcds/setup_experiment.yaml
+ - deployment: StopEngine
+ dependsOn: RunSetupExperiment
+ displayName: "Stopping Engine"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/spark-3.3.1
+ ./stop-cluster.sh
+
+# Run LST-Bench
+# TODO: Enable time travel for Hudi (see HUDI-7274)
+- ${{ each lst in parameters.lsts }}:
+ - stage: setup_${{ lst.mode }}_${{ lst.table_format }}
+ jobs:
+ - deployment: SetupEngine
+ displayName: "Setup Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }})"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/spark-3.3.1
+ ./${{ lst.table_format }}-${{ lst.version }}.sh
+ ./dist-exec.sh spark-3.3.1 ${{ lst.table_format }}-${{ lst.version }}.sh
+ - ${{ each workload in parameters.workloads }}:
+ - ${{ if or(ne(lst.table_format, 'hudi'),ne(workload, 'wp4_time_travel')) }}:
+ - stage: test_${{ lst.mode }}_${{ lst.table_format }}_${{ workload }}
+ jobs:
+ - deployment: StartEngine
+ displayName: "Starting Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }}, ${{ workload }})"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ variables:
+ process.clean: false
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/spark-3.3.1
+ ./stop-cluster.sh && ./start-cluster.sh ${{ lst.table_format }}
+ sleep 10
+ spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
+ echo "##vso[task.setvariable variable=spark_head_node;isOutput=true]${spark_head_node}"
+ name: engine_start_step
+ - deployment: RunExperiment
+ dependsOn: StartEngine
+ displayName: "Running Experiment (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }}, ${{ workload }})"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-client'
+ variables:
+ spark_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.spark_head_node'] ]
+ timeoutInMinutes: 0
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/lst-bench-0.1-SNAPSHOT
+ echo "${{ workload }}"
+ export EXP_NAME="${{ workload }}"
+ ./launcher.sh -c run/spark-3.3.1/azure-pipelines/config/connections_config.yaml \
+ -e run/spark-3.3.1/azure-pipelines/config/experiment_config-${{ lst.mode }}-${{ lst.table_format }}-${{ lst.version }}.yaml \
+ -t run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml \
+ -l run/spark-3.3.1/config/tpcds/library.yaml \
+ -w run/spark-3.3.1/config/tpcds/${{ workload }}-${{ lst.table_format }}-${{ lst.version }}.yaml
+ - deployment: StopEngine
+ dependsOn: RunExperiment
+ displayName: "Stopping Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }}, ${{ workload }})"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/spark-3.3.1
+ ./stop-cluster.sh
+ - stage: cleanup_${{ lst.mode }}_${{ lst.table_format }}
+ jobs:
+ - deployment: CleanupEngine
+ displayName: "Cleanup Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }})"
+ environment:
+ name: 'lst-bench-github'
+ resourceType: VirtualMachine
+ resourceName: 'lst-bench-head'
+ strategy:
+ runOnce:
+ deploy:
+ steps:
+ - download: none
+ - bash: |
+ cd ~/spark-3.3.1
+ ./cleanup-${{ lst.table_format }}-${{ lst.version }}.sh
+ ./dist-exec.sh spark-3.3.1 cleanup-${{ lst.table_format }}-${{ lst.version }}.sh
diff --git a/run/spark-3.3.1/azure-pipelines/sh/cleanup-delta-2.2.0.sh b/run/spark-3.3.1/azure-pipelines/sh/cleanup-delta-2.2.0.sh
new file mode 100755
index 00000000..c8eacccd
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/cleanup-delta-2.2.0.sh
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+rm $SPARK_HOME/jars/delta-core.jar
+rm $SPARK_HOME/jars/delta-storage.jar
diff --git a/run/spark-3.3.1/azure-pipelines/sh/cleanup-hudi-0.12.2.sh b/run/spark-3.3.1/azure-pipelines/sh/cleanup-hudi-0.12.2.sh
new file mode 100755
index 00000000..ab6aee49
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/cleanup-hudi-0.12.2.sh
@@ -0,0 +1,8 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+rm $SPARK_HOME/jars/hudi-spark-bundle.jar
diff --git a/run/spark-3.3.1/azure-pipelines/sh/cleanup-iceberg-1.1.0.sh b/run/spark-3.3.1/azure-pipelines/sh/cleanup-iceberg-1.1.0.sh
new file mode 100755
index 00000000..e0a01cd8
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/cleanup-iceberg-1.1.0.sh
@@ -0,0 +1,8 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+rm $SPARK_HOME/jars/iceberg-spark-runtime.jar
diff --git a/run/spark-3.3.1/azure-pipelines/sh/delta-2.2.0.sh b/run/spark-3.3.1/azure-pipelines/sh/delta-2.2.0.sh
new file mode 100755
index 00000000..c9e4f015
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/delta-2.2.0.sh
@@ -0,0 +1,12 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+wget -nv -N https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar
+wget -nv -N https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar
+
+ln -sf $(pwd)/delta-core_2.12-2.2.0.jar $SPARK_HOME/jars/delta-core.jar
+ln -sf $(pwd)/delta-storage-2.2.0.jar $SPARK_HOME/jars/delta-storage.jar
diff --git a/run/spark-3.3.1/azure-pipelines/sh/dist-exec.sh b/run/spark-3.3.1/azure-pipelines/sh/dist-exec.sh
new file mode 100755
index 00000000..bd7c3ca6
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/dist-exec.sh
@@ -0,0 +1,18 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${HOSTS}" ]; then
+ echo "ERROR: HOSTS is not defined."
+ exit 1
+fi
+
+if [ "$#" -lt 2 ]; then
+ echo "Error: Please provide at least two input parameters."
+ exit 1
+fi
+deploy_dir=$1
+script_file=$2
+
+for node in $HOSTS ; do ssh -t $node "mkdir -p ~/$deploy_dir" ; done
+for node in $HOSTS ; do scp *.template $node:~/$deploy_dir ; done
+for node in $HOSTS ; do scp $script_file $node:~/$deploy_dir ; done
+for node in $HOSTS ; do ssh -t $node "cd ~/$deploy_dir && chmod +x ./$script_file && ./$script_file ${@:3}" ; done
diff --git a/run/spark-3.3.1/azure-pipelines/sh/dist-setup.sh b/run/spark-3.3.1/azure-pipelines/sh/dist-setup.sh
new file mode 100755
index 00000000..fda4f282
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/dist-setup.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${HOME}" ]; then
+ echo "ERROR: HOME is not defined."
+ exit 1
+fi
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+# Install packages
+sudo apt install -y net-tools nmap
+
+# Configure hosts
+my_ip=$(/sbin/ifconfig eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p')
+ip_range=${my_ip%.*}.*
+nmap -sn $ip_range | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | grep -v "^$my_ip$" > $HOME/hostiplist
+
+cp $HOME/hostiplist $SPARK_HOME/conf/workers
+
+export HOSTS=$(<$HOME/hostiplist)
+
+for node in $HOSTS ; do scp ~/.ssh/id_rsa* $node:~/.ssh/ ; done
+
+# Push to environment
+echo "export HOSTS=\"${HOSTS}\"" >> env.sh
+echo "source $(pwd)/env.sh" >> ~/.bashrc
diff --git a/run/spark-3.3.1/azure-pipelines/sh/hive-site.xml.template b/run/spark-3.3.1/azure-pipelines/sh/hive-site.xml.template
new file mode 100644
index 00000000..0e79ed7b
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/hive-site.xml.template
@@ -0,0 +1,36 @@
+<configuration>
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>${HMS_JDBC_URL}</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>${HMS_JDBC_DRIVER}</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionUserName</name>
+    <value>${HMS_JDBC_USER}</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionPassword</name>
+    <value>${HMS_JDBC_PASSWORD}</value>
+  </property>
+
+  <property>
+    <name>hive.metastore.warehouse.dir</name>
+    <value>abfss://${HMS_STORAGE_ACCOUNT_CONTAINER}@${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net/hive/warehouse</value>
+  </property>
+
+  <property>
+    <name>fs.azure.account.auth.type.${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net</name>
+    <value>SharedKey</value>
+  </property>
+
+  <property>
+    <name>fs.azure.account.key.${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net</name>
+    <value>${HMS_STORAGE_ACCOUNT_SHARED_KEY}</value>
+  </property>
+</configuration>
\ No newline at end of file
diff --git a/run/spark-3.3.1/azure-pipelines/sh/hms.sh b/run/spark-3.3.1/azure-pipelines/sh/hms.sh
new file mode 100755
index 00000000..4d78cbff
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/hms.sh
@@ -0,0 +1,45 @@
+#!/bin/bash -e
+if [ "$#" -ne 7 ]; then
+ echo "Usage: $0 HMS_JDBC_DRIVER HMS_JDBC_URL HMS_JDBC_USER HMS_JDBC_PASSWORD HMS_STORAGE_ACCOUNT HMS_STORAGE_ACCOUNT_SHARED_KEY HMS_STORAGE_ACCOUNT_CONTAINER"
+ exit 1
+fi
+
+source env.sh
+if [ -z "${HADOOP_HOME}" ]; then
+ echo "ERROR: HADOOP_HOME is not defined."
+ exit 1
+fi
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+export HMS_JDBC_DRIVER=$1
+export HMS_JDBC_URL=$2
+export HMS_JDBC_USER=$3
+export HMS_JDBC_PASSWORD=$4
+export HMS_STORAGE_ACCOUNT=$5
+export HMS_STORAGE_ACCOUNT_SHARED_KEY=$6
+export HMS_STORAGE_ACCOUNT_CONTAINER=$7
+export HIVE_HOME=/home/$USER/hive
+
+# Install Hive (needed for HMS)
+rm -rf apache-hive-2.3.9-bin
+wget -nv -N https://downloads.apache.org/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz
+tar -xzf apache-hive-2.3.9-bin.tar.gz
+ln -sf $(pwd)/apache-hive-2.3.9-bin $HIVE_HOME
+
+# Configure HMS
+envsubst < "hive-site.xml.template" > "$HIVE_HOME/conf/hive-site.xml"
+ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
+
+# Copy Azure dependencies to Hive classpath
+cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-azure* $HIVE_HOME/lib/
+
+# Install MSSQL driver
+wget -nv -N https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/6.2.1.jre8/mssql-jdbc-6.2.1.jre8.jar
+ln -sf $(pwd)/mssql-jdbc-6.2.1.jre8.jar $SPARK_HOME/jars/mssql-jdbc.jar
+
+# Push to environment
+echo "export HIVE_HOME=${HIVE_HOME}" >> env.sh
+echo "source $(pwd)/env.sh" >> ~/.bashrc
diff --git a/run/spark-3.3.1/azure-pipelines/sh/hudi-0.12.2.sh b/run/spark-3.3.1/azure-pipelines/sh/hudi-0.12.2.sh
new file mode 100755
index 00000000..7c9166c5
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/hudi-0.12.2.sh
@@ -0,0 +1,10 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+wget -nv -N https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark3.3-bundle_2.12/0.12.2/hudi-spark3.3-bundle_2.12-0.12.2.jar
+
+ln -sf $(pwd)/hudi-spark3.3-bundle_2.12-0.12.2.jar $SPARK_HOME/jars/hudi-spark-bundle.jar
diff --git a/run/spark-3.3.1/azure-pipelines/sh/iceberg-1.1.0.sh b/run/spark-3.3.1/azure-pipelines/sh/iceberg-1.1.0.sh
new file mode 100755
index 00000000..61d6c4d5
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/iceberg-1.1.0.sh
@@ -0,0 +1,10 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+wget -nv -N https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.1.0/iceberg-spark-runtime-3.3_2.12-1.1.0.jar
+
+ln -sf $(pwd)/iceberg-spark-runtime-3.3_2.12-1.1.0.jar $SPARK_HOME/jars/iceberg-spark-runtime.jar
diff --git a/run/spark-3.3.1/azure-pipelines/sh/init.sh b/run/spark-3.3.1/azure-pipelines/sh/init.sh
new file mode 100755
index 00000000..282753a5
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/init.sh
@@ -0,0 +1,60 @@
+#!/bin/bash -e
+if [ "$#" -ne 3 ]; then
+ echo "Usage: $0 SPARK_MASTER_HOST DATA_STORAGE_ACCOUNT DATA_STORAGE_ACCOUNT_SHARED_KEY"
+ exit 1
+fi
+
+if [ -z "${USER}" ]; then
+ echo "ERROR: USER is not defined."
+ exit 1
+fi
+
+export SPARK_MASTER_HOST=$1
+export SPARK_HOME=/home/$USER/spark
+export HADOOP_HOME=/home/$USER/hadoop
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
+export DATA_STORAGE_ACCOUNT=$2
+export DATA_STORAGE_ACCOUNT_SHARED_KEY=$3
+
+# Update dependencies and install packages
+sudo apt update -y
+sudo apt install -y openjdk-8-jdk wget
+
+# Install Hadoop
+rm -rf hadoop-3.3.1
+wget -nv -N https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz
+tar -xzf hadoop-3.3.1.tar.gz
+ln -sf $(pwd)/hadoop-3.3.1 $HADOOP_HOME
+
+# Install Spark
+rm -rf spark-3.3.1-bin-hadoop3
+wget -nv -N https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
+tar -xf spark-3.3.1-bin-hadoop3.tgz
+ln -sf $(pwd)/spark-3.3.1-bin-hadoop3 $SPARK_HOME
+
+# Configure Spark
+sudo mkdir -p /opt/spark-events
+sudo chown $USER:$USER /opt/spark-events/
+
+cp $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
+cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf
+
+envsubst < "spark-defaults.conf.template" > "$SPARK_HOME/conf/spark-defaults.conf"
+
+envsubst < "spark-env.sh.template" > "$SPARK_HOME/conf/spark-env.sh"
+
+sudo mkdir -p /mnt/local_resource/
+sudo mkdir -p /mnt/local_resource/data/
+sudo chown $USER:$USER /mnt/local_resource/data
+sudo mkdir -p /mnt/local_resource/tmp/
+sudo chown $USER:$USER /mnt/local_resource/tmp
+
+# Copy Azure dependencies to Spark classpath
+cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-azure* $SPARK_HOME/jars/
+
+# Push to environment
+echo "export HADOOP_HOME=${HADOOP_HOME}
+export SPARK_HOME=${SPARK_HOME}
+export JAVA_HOME=${JAVA_HOME}
+export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin" >> env.sh
+echo "source $(pwd)/env.sh" >> ~/.bashrc
diff --git a/run/spark-3.3.1/azure-pipelines/sh/spark-defaults.conf.template b/run/spark-3.3.1/azure-pipelines/sh/spark-defaults.conf.template
new file mode 100644
index 00000000..67909343
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/spark-defaults.conf.template
@@ -0,0 +1,16 @@
+spark.master spark://${SPARK_MASTER_HOST}:7077
+spark.driver.cores 4
+spark.driver.memory 45992m
+spark.executor.cores 7
+spark.executor.memory 11754m
+spark.memory.offHeap.enabled true
+spark.memory.offHeap.size 36974886912
+spark.eventLog.enabled true
+spark.eventLog.dir file:/opt/spark-events
+spark.history.fs.logDirectory file:/opt/spark-events
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.kryoserializer.buffer 1024k
+spark.kryoserializer.buffer.max 1024m
+spark.sql.parquet.compression.codec gzip
+spark.hadoop.fs.azure.account.auth.type.${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net SharedKey
+spark.hadoop.fs.azure.account.key.${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net ${DATA_STORAGE_ACCOUNT_SHARED_KEY}
\ No newline at end of file
diff --git a/run/spark-3.3.1/azure-pipelines/sh/spark-env.sh.template b/run/spark-3.3.1/azure-pipelines/sh/spark-env.sh.template
new file mode 100644
index 00000000..18ea7d39
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/spark-env.sh.template
@@ -0,0 +1,2 @@
+SPARK_MASTER_HOST=$SPARK_MASTER_HOST
+JAVA_HOME=$JAVA_HOME
\ No newline at end of file
diff --git a/run/spark-3.3.1/azure-pipelines/sh/start-cluster.sh b/run/spark-3.3.1/azure-pipelines/sh/start-cluster.sh
new file mode 100755
index 00000000..353e0b5f
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/start-cluster.sh
@@ -0,0 +1,32 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+cd $SPARK_HOME
+
+echo "Starting Spark cluster"
+./sbin/start-all.sh
+
+echo "Starting history server"
+./sbin/start-history-server.sh
+
+echo "Starting thrift server"
+if [ "$#" == 0 ]; then
+ echo "No LST provided"
+ ./sbin/start-thriftserver.sh
+elif [ "$1" == "delta" ]; then
+ echo "Using delta catalog"
+ ./sbin/start-thriftserver.sh --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension
+elif [ "$1" == "iceberg" ]; then
+ echo "Using iceberg catalog"
+ ./sbin/start-thriftserver.sh --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog --conf spark.sql.catalog.spark_catalog.type=hive --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+elif [ "$1" == "hudi" ]; then
+ echo "Using hudi catalog"
+ ./sbin/start-thriftserver.sh --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+else
+ echo "Invalid LST"
+ exit 1
+fi
diff --git a/run/spark-3.3.1/azure-pipelines/sh/stop-cluster.sh b/run/spark-3.3.1/azure-pipelines/sh/stop-cluster.sh
new file mode 100755
index 00000000..68502692
--- /dev/null
+++ b/run/spark-3.3.1/azure-pipelines/sh/stop-cluster.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -e
+source env.sh
+if [ -z "${SPARK_HOME}" ]; then
+ echo "ERROR: SPARK_HOME is not defined."
+ exit 1
+fi
+
+cd $SPARK_HOME
+
+echo "Stopping thrift server"
+./sbin/stop-thriftserver.sh
+
+echo "Stopping history server"
+./sbin/stop-history-server.sh
+
+echo "Stopping spark cluster"
+./sbin/stop-all.sh
\ No newline at end of file
diff --git a/run/spark-3.3.1/config/tpcds/setup_experiment.yaml b/run/spark-3.3.1/config/tpcds/setup_experiment.yaml
new file mode 100644
index 00000000..d122811f
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/setup_experiment.yaml
@@ -0,0 +1,32 @@
+# Description: Setup experiment
+---
+version: 1
+id: setup_experiment
+phases:
+- id: setup
+ sessions:
+ - tasks:
+ - template_id: setup
+- id: setup_data_maintenance
+ sessions:
+ - tasks:
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
+ - template_id: setup_data_maintenance
diff --git a/run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml b/run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml
similarity index 98%
rename from run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml
rename to run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml
index 8c55b511..dd975408 100644
--- a/run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml
+++ b/run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml
@@ -1,7 +1,7 @@
# Description: W0: Original TPC-DS sequence
---
version: 1
-id: w0_tpcds_delta
+id: w0_tpcds
phases:
- id: setup
sessions:
diff --git a/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml b/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml
similarity index 98%
rename from run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml
rename to run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml
index d4508627..0e81b4fd 100644
--- a/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml
+++ b/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml
@@ -1,7 +1,7 @@
# Description: W0: Original TPC-DS sequence
---
version: 1
-id: w0_tpcds_hudi
+id: w0_tpcds
phases:
- id: setup
sessions:
diff --git a/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml b/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml
similarity index 98%
rename from run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml
rename to run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml
index 4fd0b4f0..ab43a8ef 100644
--- a/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml
+++ b/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml
@@ -1,7 +1,7 @@
# Description: W0: Original TPC-DS sequence
---
version: 1
-id: w0_tpcds_iceberg
+id: w0_tpcds
phases:
- id: setup
sessions:
diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml
similarity index 70%
rename from run/spark-3.3.1/config/tpcds/wp1_longevity.yaml
rename to run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml
index f12d1d63..b0498bce 100644
--- a/run/spark-3.3.1/config/tpcds/wp1_longevity.yaml
+++ b/run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml
@@ -3,23 +3,6 @@
version: 1
id: wp1_longevity
phases:
-- id: setup
- sessions:
- - tasks:
- - template_id: setup
-- id: setup_data_maintenance
- sessions:
- - tasks:
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- id: init
sessions:
- tasks:
diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity-hudi-0.12.2.yaml
new file mode 100644
index 00000000..88c784c7
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp1_longevity-hudi-0.12.2.yaml
@@ -0,0 +1,65 @@
+# Description: WP1: Longevity
+---
+version: 1
+id: wp1_longevity
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+ replace_regex:
+ - pattern: '(?i)varchar\(.*\)|char\(.*\)'
+ replacement: 'string'
+- id: single_user_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_4
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_4
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_5
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_5
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_6
+ sessions:
+ - tasks:
+ - template_id: single_user
diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..721e3474
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp1_longevity-iceberg-1.1.0.yaml
@@ -0,0 +1,62 @@
+# Description: WP1: Longevity
+---
+version: 1
+id: wp1_longevity
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+- id: single_user_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_4
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_4
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_5
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_5
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_6
+ sessions:
+ - tasks:
+ - template_id: single_user
diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml
index 2cfb71ba..7d81df86 100644
--- a/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml
+++ b/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml
@@ -1,7 +1,7 @@
# Description: WP1: Longevity
---
version: 1
-id: wp1_longevity
+id: wp1_longevity_trickle
phases:
- id: setup
sessions:
diff --git a/run/spark-3.3.1/config/tpcds/wp2_resilience.yaml b/run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml
similarity index 71%
rename from run/spark-3.3.1/config/tpcds/wp2_resilience.yaml
rename to run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml
index 9ed97a4c..86f38527 100644
--- a/run/spark-3.3.1/config/tpcds/wp2_resilience.yaml
+++ b/run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml
@@ -3,25 +3,6 @@
version: 1
id: wp2_resilience
phases:
-- id: setup
- sessions:
- - tasks:
- - template_id: setup
-- id: setup_data_maintenance
- sessions:
- - tasks:
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- id: init
sessions:
- tasks:
diff --git a/run/spark-3.3.1/config/tpcds/wp2_resilience-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp2_resilience-hudi-0.12.2.yaml
new file mode 100644
index 00000000..ff73de34
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp2_resilience-hudi-0.12.2.yaml
@@ -0,0 +1,77 @@
+# Description: WP2: Resilience
+---
+version: 1
+id: wp2_resilience
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+ replace_regex:
+ - pattern: '(?i)varchar\(.*\)|char\(.*\)'
+ replacement: 'string'
+- id: single_user_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: optimize_1
+ sessions:
+ - tasks:
+ - template_id: optimize_hudi
+- id: single_user_2o
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: optimize_2
+ sessions:
+ - tasks:
+ - template_id: optimize_hudi
+- id: single_user_3o
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_4
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: optimize_3
+ sessions:
+ - tasks:
+ - template_id: optimize_hudi
+- id: single_user_4o
+ sessions:
+ - tasks:
+ - template_id: single_user
diff --git a/run/spark-3.3.1/config/tpcds/wp2_resilience-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp2_resilience-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..974730b5
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp2_resilience-iceberg-1.1.0.yaml
@@ -0,0 +1,74 @@
+# Description: WP2: Resilience
+---
+version: 1
+id: wp2_resilience
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+- id: single_user_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: optimize_1
+ sessions:
+ - tasks:
+ - template_id: optimize_iceberg
+- id: single_user_2o
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: optimize_2
+ sessions:
+ - tasks:
+ - template_id: optimize_iceberg
+- id: single_user_3o
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_4
+ sessions:
+ - tasks:
+ - template_id: single_user
+- id: optimize_3
+ sessions:
+ - tasks:
+ - template_id: optimize_iceberg
+- id: single_user_4o
+ sessions:
+ - tasks:
+ - template_id: single_user
diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml
similarity index 69%
rename from run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml
rename to run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml
index 28b93990..f84b48bd 100644
--- a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml
+++ b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml
@@ -3,25 +3,6 @@
version: 1
id: wp3_rw_concurrency
phases:
-- id: setup
- sessions:
- - tasks:
- - template_id: setup
-- id: setup_data_maintenance
- sessions:
- - tasks:
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- id: init
sessions:
- tasks:
diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-hudi-0.12.2.yaml
new file mode 100644
index 00000000..c5934f51
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-hudi-0.12.2.yaml
@@ -0,0 +1,61 @@
+# Description: WP3: R/W concurrency
+---
+version: 1
+id: wp3_rw_concurrency
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+ replace_regex:
+ - pattern: '(?i)varchar\(.*\)|char\(.*\)'
+ replacement: 'string'
+- id: single_user_1_data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_2_optimize_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: optimize_hudi
+- id: single_user_2o_data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_3_optimize_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: optimize_hudi
+- id: single_user_3o_data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_4_optimize_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: optimize_hudi
diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..c0be11da
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-iceberg-1.1.0.yaml
@@ -0,0 +1,58 @@
+# Description: WP3: R/W concurrency
+---
+version: 1
+id: wp3_rw_concurrency
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+- id: single_user_1_data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_2_optimize_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: optimize_iceberg
+- id: single_user_2o_data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_3_optimize_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: optimize_iceberg
+- id: single_user_3o_data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_4_optimize_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+ - tasks:
+ - template_id: optimize_iceberg
diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi-delta-2.2.0.yaml
similarity index 100%
rename from run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi.yaml
rename to run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi-delta-2.2.0.yaml
diff --git a/run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml b/run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml
similarity index 80%
rename from run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml
rename to run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml
index 0b6c186b..64027647 100644
--- a/run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml
+++ b/run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml
@@ -3,21 +3,6 @@
version: 1
id: wp4_time_travel
phases:
-- id: setup
- sessions:
- - tasks:
- - template_id: setup
-- id: setup_data_maintenance
- sessions:
- - tasks:
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- - template_id: setup_data_maintenance
- id: init
sessions:
- tasks:
diff --git a/run/spark-3.3.1/config/tpcds/wp4_time_travel-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp4_time_travel-hudi-0.12.2.yaml
new file mode 100644
index 00000000..b0d7c545
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp4_time_travel-hudi-0.12.2.yaml
@@ -0,0 +1,86 @@
+# Description: WP4: Time travel
+---
+version: 1
+id: wp4_time_travel
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+ replace_regex:
+ - pattern: '(?i)varchar\(.*\)|char\(.*\)'
+ replacement: 'string'
+- id: data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_2_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
+- id: data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_3_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_1
+- id: single_user_3_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
+- id: data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_4_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_2
+- id: single_user_4_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_1
+- id: single_user_4_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
+- id: data_maintenance_4
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_hudi
+ - template_id: data_maintenance_hudi
+- id: single_user_5_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_3
+- id: single_user_5_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_2
+- id: single_user_5_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_1
+- id: single_user_5_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
diff --git a/run/spark-3.3.1/config/tpcds/wp4_time_travel-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp4_time_travel-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..0e91ad7f
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp4_time_travel-iceberg-1.1.0.yaml
@@ -0,0 +1,83 @@
+# Description: WP4: Time travel
+---
+version: 1
+id: wp4_time_travel
+phases:
+- id: init
+ sessions:
+ - tasks:
+ - template_id: init
+- id: build
+ sessions:
+ - tasks:
+ - template_id: build
+- id: data_maintenance_1
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_2_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
+- id: data_maintenance_2
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_3_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_1
+- id: single_user_3_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
+- id: data_maintenance_3
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_4_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_2
+- id: single_user_4_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_1
+- id: single_user_4_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
+- id: data_maintenance_4
+ sessions:
+ - tasks:
+ - template_id: data_maintenance_iceberg
+ - template_id: data_maintenance_iceberg
+- id: single_user_5_3
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_3
+- id: single_user_5_2
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_2
+- id: single_user_5_1
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: data_maintenance_1
+- id: single_user_5_0
+ sessions:
+ - tasks:
+ - template_id: single_user
+ time_travel_phase_id: build
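In both new WP4 files, each `single_user_N_M` phase replays the single-user stream against the table state produced by an earlier phase, selected via `time_travel_phase_id`: `M == 0` points at `build`, otherwise at `data_maintenance_M`. As a rough illustration only, the hypothetical helper below checks that convention; it is not part of the repository and assumes the `Workload`/`Phase`/`Session`/`Task` accessors exercised by `ParserTest` further down, in the `com.microsoft.lst_bench.input` package the test path suggests.

```java
import com.microsoft.lst_bench.input.Phase;
import com.microsoft.lst_bench.input.Session;
import com.microsoft.lst_bench.input.Task;
import com.microsoft.lst_bench.input.Workload;

/** Hypothetical sanity check for the wp4_time_travel workloads; not LST-Bench code. */
final class TimeTravelConventionCheck {

  static void check(Workload workload) {
    for (Phase phase : workload.getPhases()) {
      String id = phase.getId();
      if (!id.startsWith("single_user_")) {
        continue; // init, build, and data_maintenance_* phases carry no time_travel_phase_id
      }
      String[] parts = id.split("_"); // e.g., "single_user_5_2" -> ["single","user","5","2"]
      String suffix = parts[parts.length - 1];
      String expected = "0".equals(suffix) ? "build" : "data_maintenance_" + suffix;
      for (Session session : phase.getSessions()) {
        for (Task task : session.getTasks()) {
          if (!expected.equals(task.getTimeTravelPhaseId())) {
            throw new IllegalStateException(id + " should time-travel to " + expected);
          }
        }
      }
    }
  }
}
```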
diff --git a/src/test/java/com/microsoft/lst_bench/input/ParserTest.java b/src/test/java/com/microsoft/lst_bench/input/ParserTest.java
index 9926c33d..bdd818de 100644
--- a/src/test/java/com/microsoft/lst_bench/input/ParserTest.java
+++ b/src/test/java/com/microsoft/lst_bench/input/ParserTest.java
@@ -175,9 +175,9 @@ public void testParseTaskLibrary() throws IOException {
@Test
public void testParseW0Delta() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-delta.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-delta-2.2.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
- Assertions.assertEquals("w0_tpcds_delta", workload.getId());
+ Assertions.assertEquals("w0_tpcds", workload.getId());
Assertions.assertEquals(9, workload.getPhases().size());
for (Phase phase : workload.getPhases()) {
switch (phase.getId()) {
@@ -235,9 +235,9 @@ public void testParseW0Delta() throws IOException {
@Test
public void testParseW0Hudi() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-hudi.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-hudi-0.12.2.yaml");
Assertions.assertEquals(1, workload.getVersion());
- Assertions.assertEquals("w0_tpcds_hudi", workload.getId());
+ Assertions.assertEquals("w0_tpcds", workload.getId());
Assertions.assertEquals(9, workload.getPhases().size());
for (Phase phase : workload.getPhases()) {
switch (phase.getId()) {
@@ -310,9 +310,9 @@ public void testParseW0Hudi() throws IOException {
@Test
public void testParseW0Iceberg() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-iceberg.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-iceberg-1.1.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
- Assertions.assertEquals("w0_tpcds_iceberg", workload.getId());
+ Assertions.assertEquals("w0_tpcds", workload.getId());
Assertions.assertEquals(9, workload.getPhases().size());
for (Phase phase : workload.getPhases()) {
switch (phase.getId()) {
@@ -370,22 +370,20 @@ public void testParseW0Iceberg() throws IOException {
@Test
public void testParseWP1Longevity() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp1_longevity.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp1_longevity-delta-2.2.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
Assertions.assertEquals("wp1_longevity", workload.getId());
- Assertions.assertEquals(15, workload.getPhases().size());
+ Assertions.assertEquals(13, workload.getPhases().size());
}
@Test
public void testParseWP2Resilience() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp2_resilience.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp2_resilience-delta-2.2.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
Assertions.assertEquals("wp2_resilience", workload.getId());
- Assertions.assertEquals(17, workload.getPhases().size());
+ Assertions.assertEquals(15, workload.getPhases().size());
for (Phase phase : workload.getPhases()) {
switch (phase.getId()) {
- case "setup":
- case "setup_data_maintenance":
case "init":
case "build":
case "single_user_1":
@@ -411,10 +409,10 @@ public void testParseWP2Resilience() throws IOException {
@Test
public void testParseWP3RWConcurrency() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency-delta-2.2.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
Assertions.assertEquals("wp3_rw_concurrency", workload.getId());
- Assertions.assertEquals(10, workload.getPhases().size());
+ Assertions.assertEquals(8, workload.getPhases().size());
for (Phase phase : workload.getPhases()) {
switch (phase.getId()) {
case "single_user_1_data_maintenance_1":
@@ -458,8 +456,6 @@ public void testParseWP3RWConcurrency() throws IOException {
Assertions.assertNull(taskO.getTimeTravelPhaseId());
}
break;
- case "setup":
- case "setup_data_maintenance":
case "init":
case "build":
case "single_user_2o_data_maintenance_2":
@@ -476,7 +472,8 @@ public void testParseWP3RWConcurrency() throws IOException {
@Test
public void testParseWP3RWConcurrencyMulti() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency_multi.yaml");
+ Workload workload =
+ FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency_multi-delta-2.2.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
Assertions.assertEquals("wp3_rw_concurrency_multi", workload.getId());
Assertions.assertEquals(10, workload.getPhases().size());
@@ -518,10 +515,10 @@ public void testParseWP3RWConcurrencyMulti() throws IOException {
@Test
public void testParseWP4TimeTravel() throws IOException {
- Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp4_time_travel.yaml");
+ Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp4_time_travel-delta-2.2.0.yaml");
Assertions.assertEquals(1, workload.getVersion());
Assertions.assertEquals("wp4_time_travel", workload.getId());
- Assertions.assertEquals(18, workload.getPhases().size());
+ Assertions.assertEquals(16, workload.getPhases().size());
for (Phase phase : workload.getPhases()) {
switch (phase.getId()) {
case "single_user_2_0":
@@ -546,15 +543,6 @@ public void testParseWP4TimeTravel() throws IOException {
Assertions.assertNotNull(task.getTimeTravelPhaseId());
}
break;
- case "setup_data_maintenance":
- {
- List<Session> sessions = phase.getSessions();
- Assertions.assertEquals(1, sessions.size());
- List<Task> tasks = sessions.get(0).getTasks();
- Assertions.assertEquals(8, tasks.size());
- }
- break;
- case "setup":
case "init":
case "build":
case "data_maintenance_1":
diff --git a/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java b/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java
index a4c463c8..d25b9adf 100644
--- a/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java
+++ b/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java
@@ -186,14 +186,14 @@ private void testValidationLibrary(String libraryPath) throws IOException {
@EnabledOnOs({OS.LINUX, OS.MAC})
@ValueSource(
strings = {
- "run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml",
- "run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml",
- "run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml",
- "run/spark-3.3.1/config/tpcds/wp1_longevity.yaml",
- "run/spark-3.3.1/config/tpcds/wp2_resilience.yaml",
- "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml",
- "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi.yaml",
- "run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml",
+ "run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml",
+ "run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml",
+ "run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml",
+ "run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml",
+ "run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml",
+ "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml",
+ "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi-delta-2.2.0.yaml",
+ "run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml",
"run/trino-420/config/tpcds/w0_tpcds.yaml",
"run/trino-420/config/tpcds/wp1_longevity.yaml",
"run/trino-420/config/tpcds/wp2_resilience.yaml",
@@ -210,14 +210,14 @@ public void testValidationWorkloadUnix(String workloadFilePath) throws IOExcepti
@EnabledOnOs({OS.WINDOWS})
@ValueSource(
strings = {
- "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-delta.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-hudi.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-iceberg.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\wp1_longevity.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\wp2_resilience.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency_multi.yaml",
- "run\\spark-3.3.1\\config\\tpcds\\wp4_time_travel.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-delta-2.2.0.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-hudi-0.12.2.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-iceberg-1.1.0.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\wp1_longevity-delta-2.2.0.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\wp2_resilience-delta-2.2.0.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency-delta-2.2.0.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency_multi-delta-2.2.0.yaml",
+ "run\\spark-3.3.1\\config\\tpcds\\wp4_time_travel-delta-2.2.0.yaml",
"run\\trino-420\\config\\tpcds\\w0_tpcds.yaml",
"run\\trino-420\\config\\tpcds\\wp1_longevity.yaml",
"run\\trino-420\\config\\tpcds\\wp2_resilience.yaml",