diff --git a/.azure-pipelines/workflows/periodic_reporting.yml b/.azure-pipelines/workflows/periodic_reporting.yml deleted file mode 100644 index 1bcf54fa..00000000 --- a/.azure-pipelines/workflows/periodic_reporting.yml +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -trigger: none - -variables: - MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository - MAVEN_OPTS: '-ntp -B -Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)' - -stages: -# Build LST-Bench and create artifact to deploy to target VM -- stage: build - jobs: - - job: Build - pool: - vmImage: 'ubuntu-latest' - steps: - - task: Cache@2 - displayName: Cache Maven local repo - inputs: - key: 'maven | "$(Agent.OS)" | **/pom.xml' - restoreKeys: | - maven | "$(Agent.OS)" - maven - path: $(MAVEN_CACHE_FOLDER) - - task: Maven@4 - inputs: - mavenPomFile: 'pom.xml' - options: $(MAVEN_OPTS) - javaHomeOption: 'JDKVersion' - jdkVersionOption: '1.11' - publishJUnitResults: true - testResultsFiles: '**/surefire-reports/TEST-*.xml' - goals: 'package -DskipTests -Pspark-jdbc' - - task: CopyFiles@2 - displayName: 'Copy Artifacts to: $(TargetFolder)' - inputs: - SourceFolder: '$(Build.SourcesDirectory)' - Contents: | - launcher.sh - target/**/* - TargetFolder: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/' - - upload: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/' - artifact: drop - - deployment: VMDeploy - displayName: 'Deploying LST-Bench' - dependsOn: Build - environment: - name: 'lst-bench-periodic-reporting' - resourceType: VirtualMachine - tags: 'client' - strategy: - runOnce: - deploy: - steps: - - script: echo my first deployment diff --git a/run/README.md b/run/README.md new file mode 100644 index 00000000..14e74f6c --- /dev/null +++ b/run/README.md @@ -0,0 +1,46 @@ + + +# LST-Bench: Configurations and Results +This folder contains configurations for running LST-Bench on various systems as depicted in the [LST-Bench dashboard](/metrics/app), along with details about the setups used to generate those results. + +## Systems Included +- [x] Apache Spark 3.3.1 + - [x] Delta Lake 2.2.0 + - [x] Apache Hudi 0.12.2 + - [x] Apache Iceberg 1.1.0 +- [ ] Trino 420 + - [ ] Delta Lake + - [ ] Apache Iceberg + +## Folder Structure +While the folder for each engine may have a slightly different structure, they generally contain the following: + +- `scripts/`: + This directory contains SQL files used to execute LST-Bench workloads on the respective engine. + Typically, these SQL files may vary slightly across engines and LSTs based on the supported SQL dialect. +- `config/`: + This directory houses LST-Bench configuration files required to execute the workload. + It includes LST-Bench phase/session/task libraries that reference the aforementioned SQL scripts. +- Additional infrastructure and configuration automation folders, e.g., `azure-pipelines/`: + These folders contain scripts or files facilitating automation for running the benchmark on a specific infrastructure/engine. 
+ For instance, Azure Pipelines scripts to deploy an engine with different LSTs and execute LST-Bench. + Generally, these folders should include an additional README.md file offering further details. +- `results/`: + This folder stores the results of the LST-Bench runs as captured by LST-Bench telemetry using DuckDB. + These results are processed and visualized in the [LST-Bench dashboard](/metrics/app). diff --git a/run/spark-3.3.1/azure-pipelines/README.md b/run/spark-3.3.1/azure-pipelines/README.md new file mode 100644 index 00000000..4488a12e --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/README.md @@ -0,0 +1,50 @@ + + +# Azure Pipelines Deployment for LST-Bench on Apache Spark 3.3.1 +This directory comprises the necessary tooling for executing LST-Bench on Apache Spark 3.3.1 with different LSTs using Azure Pipelines. The included tooling consists of: +- `run-lst-bench.yml`: + An Azure Pipelines script designed to deploy Apache Spark with various LSTs and execute LST-Bench. +- `sh/`: + A directory containing shell scripts and engine configuration files supporting the deployment of Spark with different LSTs and the execution of experiments. +- `config/`: + A directory with the LST-Bench configuration files necessary for executing the experiments included in the results. + +## Prerequisites +- Automation for deploying the infrastructure in Azure to run LST-Bench is not implemented. As a result, the Azure Pipelines script expects the following setup: + - A VM named 'lst-bench-client', connected to the pipeline environment, to run the LST-Bench client. + - A VM named 'lst-bench-head', also connected to the pipeline environment, to run the head node of the Spark cluster. + - A VMSS cluster that will serve as the Spark worker nodes, deployed within the same VNet as the head node. + - An Azure Storage Account accessible by both the VMSS and the head node. + - An Azure SQL Database (or SQL Server-flavored RDBMS) that will host the Hive Metastore. + The Hive Metastore schema for version 2.3.0 should already be installed in the instance. +- Prior to running the pipeline, several variables need to be defined in your Azure Pipeline: + - `data_storage_account`: Name of the Azure Blob Storage account where the source data for the experiment is stored. + - `data_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the source data for the experiment is stored. + - `hms_jdbc_driver`: JDBC driver for the Hive Metastore. + - `hms_jdbc_url`: JDBC URL for the Hive Metastore. + - `hms_jdbc_user`: Username for the Hive Metastore. + - `hms_jdbc_password` (secret): Password for the Hive Metastore. + - `hms_storage_account`: Name of the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog (can be the same as the data_storage_account). + - `hms_storage_account_shared_key` (secret): Shared key for the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog. + - `hms_storage_account_container`: Name of the container in the Azure Blob Storage account where the Hive Metastore will store data associated with the catalog. +- The versions and configurations of the LSTs to run can be modified via input parameters for the pipeline, either in the Azure Pipelines YAML file or from the Web UI. + Default values are assigned to these parameters. + Parameters also include the experiment scale factor, machine type, and cluster size.
+ Note that these parameters are not used to deploy the data or the infrastructure, as this process is not automated in the pipeline. + Instead, they are recorded in the experiment telemetry for proper categorization and visualization of results later on. diff --git a/run/spark-3.3.1/azure-pipelines/config/connections_config.yaml b/run/spark-3.3.1/azure-pipelines/config/connections_config.yaml new file mode 100644 index 00000000..63128856 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/connections_config.yaml @@ -0,0 +1,7 @@ +# Description: Connections Configuration +--- +version: 1 +connections: +- id: spark_0 + driver: org.apache.hive.jdbc.HiveDriver + url: jdbc:hive2://${SPARK_MASTER_HOST}:10000 diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml new file mode 100644 index 00000000..3fd39e23 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-delta-2.2.0.yaml @@ -0,0 +1,29 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: spark + system_version: 3.3.1 + table_format: delta + table_format_version: 2.2.0 + scale_factor: "${EXP_SCALE_FACTOR}" + mode: cow + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. +parameter_values: + external_catalog: spark_catalog + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: csv + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: ',header="true"' + external_tblproperties_suffix: '' + catalog: spark_catalog + database: "${EXP_NAME}" + table_format: delta + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/delta/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: '' diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-hudi-0.12.2.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-hudi-0.12.2.yaml new file mode 100644 index 00000000..3c8df376 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-hudi-0.12.2.yaml @@ -0,0 +1,29 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: spark + system_version: 3.3.1 + table_format: hudi + table_format_version: 0.12.2 + scale_factor: "${EXP_SCALE_FACTOR}" + mode: cow + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. 
+parameter_values: + external_catalog: spark_catalog + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: csv + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: ',header="true"' + external_tblproperties_suffix: '' + catalog: spark_catalog + database: "${EXP_NAME}" + table_format: hudi + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/hudi/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: ', "type"="cow"' diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-iceberg-1.1.0.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-iceberg-1.1.0.yaml new file mode 100644 index 00000000..506f40c7 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-cow-iceberg-1.1.0.yaml @@ -0,0 +1,29 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: spark + system_version: 3.3.1 + table_format: iceberg + table_format_version: 1.1.0 + scale_factor: "${EXP_SCALE_FACTOR}" + mode: cow + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. +parameter_values: + external_catalog: spark_catalog + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: csv + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: ',header="true"' + external_tblproperties_suffix: '' + catalog: spark_catalog + database: "${EXP_NAME}" + table_format: iceberg + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/iceberg/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: ', "format-version"="2", "write.delete.mode"="copy-on-write", "write.update.mode"="copy-on-write", "write.merge.mode"="copy-on-write"' diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-hudi-0.12.2.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-hudi-0.12.2.yaml new file mode 100644 index 00000000..cbc82720 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-hudi-0.12.2.yaml @@ -0,0 +1,29 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: spark + system_version: 3.3.1 + table_format: hudi + table_format_version: 0.12.2 + scale_factor: "${EXP_SCALE_FACTOR}" + mode: mor + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. 
+parameter_values: + external_catalog: spark_catalog + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: csv + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: ',header="true"' + external_tblproperties_suffix: '' + catalog: spark_catalog + database: "${EXP_NAME}" + table_format: hudi + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/hudi/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: ', "type"="mor"' diff --git a/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-iceberg-1.1.0.yaml b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-iceberg-1.1.0.yaml new file mode 100644 index 00000000..2b916227 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/experiment_config-mor-iceberg-1.1.0.yaml @@ -0,0 +1,29 @@ +# Description: Experiment Configuration +--- +version: 1 +id: "${EXP_NAME}" +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: spark + system_version: 3.3.1 + table_format: iceberg + table_format_version: 1.1.0 + scale_factor: "${EXP_SCALE_FACTOR}" + mode: mor + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. +parameter_values: + external_catalog: spark_catalog + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: csv + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: ',header="true"' + external_tblproperties_suffix: '' + catalog: spark_catalog + database: "${EXP_NAME}" + table_format: iceberg + data_path: 'abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/run/iceberg/sf_${EXP_SCALE_FACTOR}/' + options_suffix: '' + tblproperties_suffix: ', "format-version"="2", "write.delete.mode"="merge-on-read", "write.update.mode"="merge-on-read", "write.merge.mode"="merge-on-read"' diff --git a/run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml b/run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml new file mode 100644 index 00000000..a4907102 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml @@ -0,0 +1,20 @@ +# Description: Experiment Configuration +--- +version: 1 +id: setup_experiment +repetitions: 1 +# Metadata accepts any key-value that we want to register together with the experiment run. +metadata: + system: spark + system_version: 3.3.1 + scale_factor: "${EXP_SCALE_FACTOR}" + machine: "${EXP_MACHINE}" + cluster_size: "${EXP_CLUSTER_SIZE}" +# The following parameter values will be used to replace the variables in the workload statements. 
+parameter_values: + external_catalog: spark_catalog + external_database: "external_tpcds_sf_${EXP_SCALE_FACTOR}" + external_table_format: csv + external_data_path: "abfss://${DATA_STORAGE_ACCOUNT_CONTAINER}@${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net/tpc-ds/csv/sf_${EXP_SCALE_FACTOR}/" + external_options_suffix: ',header="true"' + external_tblproperties_suffix: '' diff --git a/run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml b/run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml new file mode 100644 index 00000000..6e5f3400 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml @@ -0,0 +1,13 @@ +# Description: Telemetry Configuration +--- +version: 1 +connection: + id: duckdb_0 + driver: org.duckdb.DuckDBDriver + url: jdbc:duckdb:./telemetry-spark-3.3.1 +execute_ddl: true +ddl_file: 'src/main/resources/scripts/logging/duckdb/ddl.sql' +insert_file: 'src/main/resources/scripts/logging/duckdb/insert.sql' +# The following parameter values will be used to replace the variables in the logging statements. +parameter_values: + data_path: '' \ No newline at end of file diff --git a/run/spark-3.3.1/azure-pipelines/run-lst-bench.yml b/run/spark-3.3.1/azure-pipelines/run-lst-bench.yml new file mode 100644 index 00000000..1d63227e --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/run-lst-bench.yml @@ -0,0 +1,297 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +trigger: none + +parameters: +- name: lsts + type: object + default: + - table_format: "delta" + version: "2.2.0" + mode: "cow" + - table_format: "iceberg" + version: "1.1.0" + mode: "cow" + - table_format: "iceberg" + version: "1.1.0" + mode: "mor" + - table_format: "hudi" + version: "0.12.2" + mode: "cow" + - table_format: "hudi" + version: "0.12.2" + mode: "mor" +- name: workloads + type: object + default: + - "wp1_longevity" + - "wp2_resilience" + - "wp3_rw_concurrency" + - "wp4_time_travel" +- name: exp_scale_factor + type: number + default: 100 +- name: exp_machine + type: string + default: "Standard_E8s_v5" +- name: exp_cluster_size + type: number + default: 8 + +variables: + MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository + MAVEN_OPTS: '-ntp -B -Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)' + EXP_SCALE_FACTOR: ${{ parameters.exp_scale_factor }} + EXP_MACHINE: ${{ parameters.exp_machine }} + EXP_CLUSTER_SIZE: ${{ parameters.exp_cluster_size }} + +stages: +# Build LST-Bench and create artifact to deploy to target VM +- stage: build + jobs: + - job: Build + pool: + vmImage: 'ubuntu-latest' + steps: + - task: Cache@2 + displayName: Cache Maven local repo + inputs: + key: 'maven | "$(Agent.OS)" | **/pom.xml' + restoreKeys: | + maven | "$(Agent.OS)" + maven + path: $(MAVEN_CACHE_FOLDER) + - task: Maven@4 + inputs: + mavenPomFile: 'pom.xml' + options: $(MAVEN_OPTS) + javaHomeOption: 'JDKVersion' + jdkVersionOption: '1.11' + publishJUnitResults: false + goals: 'package -DskipTests -Pspark-jdbc' + - task: CopyFiles@2 + displayName: 'Copy Artifacts to: $(TargetFolder)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)' + TargetFolder: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/' + - task: PublishPipelineArtifact@1 + inputs: + targetPath: '$(System.DefaultWorkingDirectory)/pipeline-artifacts/' + artifact: lst-bench-0.1-SNAPSHOT + +# Set up engine and deploy LST-Bench +- stage: deploy + jobs: + - deployment: EngineDeploy + displayName: 'Deploying engine' + workspace: + clean: all + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - bash: | + echo 'Deploy engine' + mkdir -p ~/spark-3.3.1 + cp $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/run/spark-3.3.1/azure-pipelines/sh/* ~/spark-3.3.1/ + cd ~/spark-3.3.1 + chmod +x ./* + spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') + ./init.sh "${spark_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)" + ./hms.sh "$(hms_jdbc_driver)" "$(hms_jdbc_url)" "$(hms_jdbc_user)" "$(hms_jdbc_password)" "$(hms_storage_account)" "$(hms_storage_account_shared_key)" "$(hms_storage_account_container)" + ./dist-setup.sh + ./dist-exec.sh spark-3.3.1 init.sh "${spark_head_node}" "$(data_storage_account)" "$(data_storage_account_shared_key)" + - deployment: ClientDeploy + displayName: 'Deploying LST-Bench client' + workspace: + clean: all + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-client' + strategy: + runOnce: + deploy: + steps: + - bash: | + echo 'Deploy LST-Bench client' + sudo apt install -y openjdk-11-jdk + mkdir -p ~/lst-bench-0.1-SNAPSHOT + cp -rf $(Pipeline.Workspace)/lst-bench-0.1-SNAPSHOT/* ~/lst-bench-0.1-SNAPSHOT/ + chmod +x ~/lst-bench-0.1-SNAPSHOT/launcher.sh + +# Run LST-Bench (setup external tables) +- stage: setup_experiment + jobs: + - deployment: StartEngine + displayName: "Starting Engine" + environment: + name: 
'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + variables: + process.clean: false + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/spark-3.3.1 + ./stop-cluster.sh && ./start-cluster.sh + sleep 10 + spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') + echo "##vso[task.setvariable variable=spark_head_node;isOutput=true]${spark_head_node}" + name: engine_start_step + - deployment: RunSetupExperiment + dependsOn: StartEngine + displayName: "Setup Experiment" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-client' + variables: + spark_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.spark_head_node'] ] + timeoutInMinutes: 0 + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/lst-bench-0.1-SNAPSHOT + ./launcher.sh -c run/spark-3.3.1/azure-pipelines/config/connections_config.yaml \ + -e run/spark-3.3.1/azure-pipelines/config/setup_experiment_config.yaml \ + -t run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml \ + -l run/spark-3.3.1/config/tpcds/library.yaml \ + -w run/spark-3.3.1/config/tpcds/setup_experiment.yaml + - deployment: StopEngine + dependsOn: RunSetupExperiment + displayName: "Stopping Engine" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/spark-3.3.1 + ./stop-cluster.sh + +# Run LST-Bench +# TODO: Enable time travel for Hudi (see HUDI-7274) +- ${{ each lst in parameters.lsts }}: + - stage: setup_${{ lst.mode }}_${{ lst.table_format }} + jobs: + - deployment: SetupEngine + displayName: "Setup Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/spark-3.3.1 + ./${{ lst.table_format }}-${{ lst.version }}.sh + ./dist-exec.sh spark-3.3.1 ${{ lst.table_format }}-${{ lst.version }}.sh + - ${{ each workload in parameters.workloads }}: + - ${{ if or(ne(lst.table_format, 'hudi'),ne(workload, 'wp4_time_travel')) }}: + - stage: test_${{ lst.mode }}_${{ lst.table_format }}_${{ workload }} + jobs: + - deployment: StartEngine + displayName: "Starting Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }}, ${{ workload }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + variables: + process.clean: false + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/spark-3.3.1 + ./stop-cluster.sh && ./start-cluster.sh ${{ lst.table_format }} + sleep 10 + spark_head_node=$(ip addr show eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') + echo "##vso[task.setvariable variable=spark_head_node;isOutput=true]${spark_head_node}" + name: engine_start_step + - deployment: RunExperiment + dependsOn: StartEngine + displayName: "Running Experiment (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }}, ${{ workload }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-client' + variables: + spark_master_host: $[ dependencies.StartEngine.outputs['deploy_lst-bench-head.engine_start_step.spark_head_node'] ] + timeoutInMinutes: 0 + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | 
+ cd ~/lst-bench-0.1-SNAPSHOT + echo "${{ workload }}" + export EXP_NAME="${{ workload }}" + ./launcher.sh -c run/spark-3.3.1/azure-pipelines/config/connections_config.yaml \ + -e run/spark-3.3.1/azure-pipelines/config/experiment_config-${{ lst.mode }}-${{ lst.table_format }}-${{ lst.version }}.yaml \ + -t run/spark-3.3.1/azure-pipelines/config/telemetry_config.yaml \ + -l run/spark-3.3.1/config/tpcds/library.yaml \ + -w run/spark-3.3.1/config/tpcds/${{ workload }}-${{ lst.table_format }}-${{ lst.version }}.yaml + - deployment: StopEngine + dependsOn: RunExperiment + displayName: "Stopping Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }}, ${{ workload }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/spark-3.3.1 + ./stop-cluster.sh + - stage: cleanup_${{ lst.mode }}_${{ lst.table_format }} + jobs: + - deployment: CleanupEngine + displayName: "Cleanup Engine (${{ lst.mode }}, ${{ lst.table_format }}-${{ lst.version }})" + environment: + name: 'lst-bench-github' + resourceType: VirtualMachine + resourceName: 'lst-bench-head' + strategy: + runOnce: + deploy: + steps: + - download: none + - bash: | + cd ~/spark-3.3.1 + ./cleanup-${{ lst.table_format }}-${{ lst.version }}.sh + ./dist-exec.sh spark-3.3.1 cleanup-${{ lst.table_format }}-${{ lst.version }}.sh diff --git a/run/spark-3.3.1/azure-pipelines/sh/cleanup-delta-2.2.0.sh b/run/spark-3.3.1/azure-pipelines/sh/cleanup-delta-2.2.0.sh new file mode 100755 index 00000000..c8eacccd --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/cleanup-delta-2.2.0.sh @@ -0,0 +1,9 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +rm $SPARK_HOME/jars/delta-core.jar +rm $SPARK_HOME/jars/delta-storage.jar diff --git a/run/spark-3.3.1/azure-pipelines/sh/cleanup-hudi-0.12.2.sh b/run/spark-3.3.1/azure-pipelines/sh/cleanup-hudi-0.12.2.sh new file mode 100755 index 00000000..ab6aee49 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/cleanup-hudi-0.12.2.sh @@ -0,0 +1,8 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +rm $SPARK_HOME/jars/hudi-spark-bundle.jar diff --git a/run/spark-3.3.1/azure-pipelines/sh/cleanup-iceberg-1.1.0.sh b/run/spark-3.3.1/azure-pipelines/sh/cleanup-iceberg-1.1.0.sh new file mode 100755 index 00000000..e0a01cd8 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/cleanup-iceberg-1.1.0.sh @@ -0,0 +1,8 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +rm $SPARK_HOME/jars/iceberg-spark-runtime.jar diff --git a/run/spark-3.3.1/azure-pipelines/sh/delta-2.2.0.sh b/run/spark-3.3.1/azure-pipelines/sh/delta-2.2.0.sh new file mode 100755 index 00000000..c9e4f015 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/delta-2.2.0.sh @@ -0,0 +1,12 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." 
+ exit 1 +fi + +wget -nv -N https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar +wget -nv -N https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar + +ln -sf $(pwd)/delta-core_2.12-2.2.0.jar $SPARK_HOME/jars/delta-core.jar +ln -sf $(pwd)/delta-storage-2.2.0.jar $SPARK_HOME/jars/delta-storage.jar diff --git a/run/spark-3.3.1/azure-pipelines/sh/dist-exec.sh b/run/spark-3.3.1/azure-pipelines/sh/dist-exec.sh new file mode 100755 index 00000000..bd7c3ca6 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/dist-exec.sh @@ -0,0 +1,18 @@ +#!/bin/bash -e +source env.sh +if [ -z "${HOSTS}" ]; then + echo "ERROR: HOSTS is not defined." + exit 1 +fi + +if [ "$#" -lt 2 ]; then + echo "Error: Please provide at least two input parameters." + exit 1 +fi +deploy_dir=$1 +script_file=$2 + +for node in $HOSTS ; do ssh -t $node "mkdir -p ~/$deploy_dir" ; done +for node in $HOSTS ; do scp *.template $node:~/$deploy_dir ; done +for node in $HOSTS ; do scp $script_file $node:~/$deploy_dir ; done +for node in $HOSTS ; do ssh -t $node "cd ~/$deploy_dir && chmod +x ./$script_file && ./$script_file ${@:3}" ; done diff --git a/run/spark-3.3.1/azure-pipelines/sh/dist-setup.sh b/run/spark-3.3.1/azure-pipelines/sh/dist-setup.sh new file mode 100755 index 00000000..fda4f282 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/dist-setup.sh @@ -0,0 +1,28 @@ +#!/bin/bash -e +source env.sh +if [ -z "${HOME}" ]; then + echo "ERROR: HOME is not defined." + exit 1 +fi +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +# Install packages +sudo apt install -y net-tools nmap + +# Configure hosts +my_ip=$(/sbin/ifconfig eth0 | sed -n 's/ *inet [^0-9]*\([0-9\.]\+\).*/\1/p') +ip_range=${my_ip%.*}.* +nmap -sn $ip_range | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | grep -v "^$my_ip$" > $HOME/hostiplist + +cp $HOME/hostiplist $SPARK_HOME/conf/workers + +export HOSTS=$(<$HOME/hostiplist) + +for node in $HOSTS ; do scp ~/.ssh/id_rsa* $node:~/.ssh/ ; done + +# Push to environment +echo "export HOSTS=\"${HOSTS}\"" >> env.sh +echo "source $(pwd)/env.sh" >> ~/.bashrc diff --git a/run/spark-3.3.1/azure-pipelines/sh/hive-site.xml.template b/run/spark-3.3.1/azure-pipelines/sh/hive-site.xml.template new file mode 100644 index 00000000..0e79ed7b --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/hive-site.xml.template @@ -0,0 +1,36 @@ + + + javax.jdo.option.ConnectionURL + ${HMS_JDBC_URL} + + + + javax.jdo.option.ConnectionDriverName + ${HMS_JDBC_DRIVER} + + + + javax.jdo.option.ConnectionUserName + ${HMS_JDBC_USER} + + + + javax.jdo.option.ConnectionPassword + ${HMS_JDBC_PASSWORD} + + + + hive.metastore.warehouse.dir + abfss://${HMS_STORAGE_ACCOUNT_CONTAINER}@${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net/hive/warehouse + + + + fs.azure.account.auth.type.${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net + SharedKey + + + + fs.azure.account.key.${HMS_STORAGE_ACCOUNT}.dfs.core.windows.net + ${HMS_STORAGE_ACCOUNT_SHARED_KEY} + + \ No newline at end of file diff --git a/run/spark-3.3.1/azure-pipelines/sh/hms.sh b/run/spark-3.3.1/azure-pipelines/sh/hms.sh new file mode 100755 index 00000000..4d78cbff --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/hms.sh @@ -0,0 +1,45 @@ +#!/bin/bash -e +if [ "$#" -ne 7 ]; then + echo "Usage: $0 HMS_JDBC_DRIVER HMS_JDBC_URL HMS_JDBC_USER HMS_JDBC_PASSWORD HMS_STORAGE_ACCOUNT HMS_STORAGE_ACCOUNT_SHARED_KEY HMS_STORAGE_ACCOUNT_CONTAINER" + exit 1 +fi + +source env.sh +if [ -z "${HADOOP_HOME}" ]; then + echo 
"ERROR: HADOOP_HOME is not defined." + exit 1 +fi +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +export HMS_JDBC_DRIVER=$1 +export HMS_JDBC_URL=$2 +export HMS_JDBC_USER=$3 +export HMS_JDBC_PASSWORD=$4 +export HMS_STORAGE_ACCOUNT=$5 +export HMS_STORAGE_ACCOUNT_SHARED_KEY=$6 +export HMS_STORAGE_ACCOUNT_CONTAINER=$7 +export HIVE_HOME=/home/$USER/hive + +# Install Hive (needed for HMS) +rm -rf apache-hive-2.3.9-bin +wget -nv -N https://downloads.apache.org/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz +tar -xzf apache-hive-2.3.9-bin.tar.gz +ln -sf $(pwd)/apache-hive-2.3.9-bin $HIVE_HOME + +# Configure HMS +envsubst < "hive-site.xml.template" > "$HIVE_HOME/conf/hive-site.xml" +ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml + +# Copy Azure dependencies to Hive classpath +cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-azure* $HIVE_HOME/lib/ + +# Install MSSQL driver +wget -nv -N https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/6.2.1.jre8/mssql-jdbc-6.2.1.jre8.jar +ln -sf $(pwd)/mssql-jdbc-6.2.1.jre8.jar $SPARK_HOME/jars/mssql-jdbc.jar + +# Push to environment +echo "export HIVE_HOME=${HIVE_HOME}" >> env.sh +echo "source $(pwd)/env.sh" >> ~/.bashrc diff --git a/run/spark-3.3.1/azure-pipelines/sh/hudi-0.12.2.sh b/run/spark-3.3.1/azure-pipelines/sh/hudi-0.12.2.sh new file mode 100755 index 00000000..7c9166c5 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/hudi-0.12.2.sh @@ -0,0 +1,10 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +wget -nv -N https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark3.3-bundle_2.12/0.12.2/hudi-spark3.3-bundle_2.12-0.12.2.jar + +ln -sf $(pwd)/hudi-spark3.3-bundle_2.12-0.12.2.jar $SPARK_HOME/jars/hudi-spark-bundle.jar diff --git a/run/spark-3.3.1/azure-pipelines/sh/iceberg-1.1.0.sh b/run/spark-3.3.1/azure-pipelines/sh/iceberg-1.1.0.sh new file mode 100755 index 00000000..61d6c4d5 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/iceberg-1.1.0.sh @@ -0,0 +1,10 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +wget -nv -N https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.1.0/iceberg-spark-runtime-3.3_2.12-1.1.0.jar + +ln -sf $(pwd)/iceberg-spark-runtime-3.3_2.12-1.1.0.jar $SPARK_HOME/jars/iceberg-spark-runtime.jar diff --git a/run/spark-3.3.1/azure-pipelines/sh/init.sh b/run/spark-3.3.1/azure-pipelines/sh/init.sh new file mode 100755 index 00000000..282753a5 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/init.sh @@ -0,0 +1,60 @@ +#!/bin/bash -e +if [ "$#" -ne 3 ]; then + echo "Usage: $0 SPARK_MASTER_HOST DATA_STORAGE_ACCOUNT DATA_STORAGE_ACCOUNT_SHARED_KEY" + exit 1 +fi + +if [ -z "${USER}" ]; then + echo "ERROR: USER is not defined." 
+ exit 1 +fi + +export SPARK_MASTER_HOST=$1 +export SPARK_HOME=/home/$USER/spark +export HADOOP_HOME=/home/$USER/hadoop +export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64 +export DATA_STORAGE_ACCOUNT=$2 +export DATA_STORAGE_ACCOUNT_SHARED_KEY=$3 + +# Update dependencies and install packages +sudo apt update -y +sudo apt install -y openjdk-8-jdk wget + +# Install Hadoop +rm -rf hadoop-3.3.1 +wget -nv -N https://archive.apache.org/dist/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz +tar -xzf hadoop-3.3.1.tar.gz +ln -sf $(pwd)/hadoop-3.3.1 $HADOOP_HOME + +# Install Spark +rm -rf spark-3.3.1-bin-hadoop3 +wget -nv -N https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz +tar -xf spark-3.3.1-bin-hadoop3.tgz +ln -sf $(pwd)/spark-3.3.1-bin-hadoop3 $SPARK_HOME + +# Configure Spark +sudo mkdir -p /opt/spark-events +sudo chown $USER:$USER /opt/spark-events/ + +cp $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh +cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf + +envsubst < "spark-defaults.conf.template" > "$SPARK_HOME/conf/spark-defaults.conf" + +envsubst < "spark-env.sh.template" > "$SPARK_HOME/conf/spark-env.sh" + +sudo mkdir -p /mnt/local_resource/ +sudo mkdir -p /mnt/local_resource/data/ +sudo chown $USER:$USER /mnt/local_resource/data +sudo mkdir -p /mnt/local_resource/tmp/ +sudo chown $USER:$USER /mnt/local_resource/tmp + +# Copy Azure dependencies to Spark classpath +cp $HADOOP_HOME/share/hadoop/tools/lib/hadoop-azure* $SPARK_HOME/jars/ + +# Push to environment +echo "export HADOOP_HOME=${HADOOP_HOME} +export SPARK_HOME=${SPARK_HOME} +export JAVA_HOME=${JAVA_HOME} +export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin" >> env.sh +echo "source $(pwd)/env.sh" >> ~/.bashrc diff --git a/run/spark-3.3.1/azure-pipelines/sh/spark-defaults.conf.template b/run/spark-3.3.1/azure-pipelines/sh/spark-defaults.conf.template new file mode 100644 index 00000000..67909343 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/spark-defaults.conf.template @@ -0,0 +1,16 @@ +spark.master spark://${SPARK_MASTER_HOST}:7077 +spark.driver.cores 4 +spark.driver.memory 45992m +spark.executor.cores 7 +spark.executor.memory 11754m +spark.memory.offHeap.enabled true +spark.memory.offHeap.size 36974886912 +spark.eventLog.enabled true +spark.eventLog.dir file:/opt/spark-events +spark.history.fs.logDirectory file:/opt/spark-events +spark.serializer org.apache.spark.serializer.KryoSerializer +spark.kryoserializer.buffer 1024k +spark.kryoserializer.buffer.max 1024m +spark.sql.parquet.compression.codec gzip +spark.hadoop.fs.azure.account.auth.type.${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net SharedKey +spark.hadoop.fs.azure.account.key.${DATA_STORAGE_ACCOUNT}.dfs.core.windows.net ${DATA_STORAGE_ACCOUNT_SHARED_KEY} \ No newline at end of file diff --git a/run/spark-3.3.1/azure-pipelines/sh/spark-env.sh.template b/run/spark-3.3.1/azure-pipelines/sh/spark-env.sh.template new file mode 100644 index 00000000..18ea7d39 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/spark-env.sh.template @@ -0,0 +1,2 @@ +SPARK_MASTER_HOST=$SPARK_MASTER_HOST +JAVA_HOME=$JAVA_HOME \ No newline at end of file diff --git a/run/spark-3.3.1/azure-pipelines/sh/start-cluster.sh b/run/spark-3.3.1/azure-pipelines/sh/start-cluster.sh new file mode 100755 index 00000000..353e0b5f --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/start-cluster.sh @@ -0,0 +1,32 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not 
defined." + exit 1 +fi + +cd $SPARK_HOME + +echo "Starting Spark cluster" +./sbin/start-all.sh + +echo "Starting history server" +./sbin/start-history-server.sh + +echo "Starting thrift server" +if [ "$#" == 0 ]; then + echo "No LST provided" + ./sbin/start-thriftserver.sh +elif [ "$1" == "delta" ]; then + echo "Using delta catalog" + ./sbin/start-thriftserver.sh --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension +elif [ "$1" == "iceberg" ]; then + echo "Using iceberg catalog" + ./sbin/start-thriftserver.sh --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog --conf spark.sql.catalog.spark_catalog.type=hive --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +elif [ "$1" == "hudi" ]; then + echo "Using hudi catalog" + ./sbin/start-thriftserver.sh --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension +else + echo "Invalid LST" + exit 1 +fi diff --git a/run/spark-3.3.1/azure-pipelines/sh/stop-cluster.sh b/run/spark-3.3.1/azure-pipelines/sh/stop-cluster.sh new file mode 100755 index 00000000..68502692 --- /dev/null +++ b/run/spark-3.3.1/azure-pipelines/sh/stop-cluster.sh @@ -0,0 +1,17 @@ +#!/bin/bash -e +source env.sh +if [ -z "${SPARK_HOME}" ]; then + echo "ERROR: SPARK_HOME is not defined." + exit 1 +fi + +cd $SPARK_HOME + +echo "Stopping thrift server" +./sbin/stop-thriftserver.sh + +echo "Stopping history server" +./sbin/stop-history-server.sh + +echo "Stopping spark cluster" +./sbin/stop-all.sh \ No newline at end of file diff --git a/run/spark-3.3.1/config/tpcds/setup_experiment.yaml b/run/spark-3.3.1/config/tpcds/setup_experiment.yaml new file mode 100644 index 00000000..d122811f --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/setup_experiment.yaml @@ -0,0 +1,32 @@ +# Description: Setup experiment +--- +version: 1 +id: setup_experiment +phases: +- id: setup + sessions: + - tasks: + - template_id: setup +- id: setup_data_maintenance + sessions: + - tasks: + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance + - template_id: setup_data_maintenance diff --git a/run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml b/run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml similarity index 98% rename from run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml rename to run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml index 8c55b511..dd975408 100644 --- a/run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml +++ b/run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml @@ -1,7 +1,7 @@ # Description: W0: Original TPC-DS sequence --- version: 1 -id: w0_tpcds_delta +id: w0_tpcds phases: - id: setup sessions: diff --git 
a/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml b/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml similarity index 98% rename from run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml rename to run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml index d4508627..0e81b4fd 100644 --- a/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml +++ b/run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml @@ -1,7 +1,7 @@ # Description: W0: Original TPC-DS sequence --- version: 1 -id: w0_tpcds_hudi +id: w0_tpcds phases: - id: setup sessions: diff --git a/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml b/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml similarity index 98% rename from run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml rename to run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml index 4fd0b4f0..ab43a8ef 100644 --- a/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml +++ b/run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml @@ -1,7 +1,7 @@ # Description: W0: Original TPC-DS sequence --- version: 1 -id: w0_tpcds_iceberg +id: w0_tpcds phases: - id: setup sessions: diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml similarity index 70% rename from run/spark-3.3.1/config/tpcds/wp1_longevity.yaml rename to run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml index f12d1d63..b0498bce 100644 --- a/run/spark-3.3.1/config/tpcds/wp1_longevity.yaml +++ b/run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml @@ -3,23 +3,6 @@ version: 1 id: wp1_longevity phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity-hudi-0.12.2.yaml new file mode 100644 index 00000000..88c784c7 --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/wp1_longevity-hudi-0.12.2.yaml @@ -0,0 +1,65 @@ +# Description: WP1: Longevity +--- +version: 1 +id: wp1_longevity +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build + replace_regex: + - pattern: '(?i)varchar\(.*\)|char\(.*\)' + replacement: 'string' +- id: single_user_1 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_1 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_2 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_2 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_3 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_3 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_4 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_4 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_5 + sessions: + - tasks: + - template_id: single_user 
+- id: data_maintenance_5 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_6 + sessions: + - tasks: + - template_id: single_user diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity-iceberg-1.1.0.yaml new file mode 100644 index 00000000..721e3474 --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/wp1_longevity-iceberg-1.1.0.yaml @@ -0,0 +1,62 @@ +# Description: WP1: Longevity +--- +version: 1 +id: wp1_longevity +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build +- id: single_user_1 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_1 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_2 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_2 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_3 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_3 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_4 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_4 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_5 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_5 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_6 + sessions: + - tasks: + - template_id: single_user diff --git a/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml b/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml index 2cfb71ba..7d81df86 100644 --- a/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml +++ b/run/spark-3.3.1/config/tpcds/wp1_longevity_trickle_1k_batches.yaml @@ -1,7 +1,7 @@ # Description: WP1: Longevity --- version: 1 -id: wp1_longevity +id: wp1_longevity_trickle phases: - id: setup sessions: diff --git a/run/spark-3.3.1/config/tpcds/wp2_resilience.yaml b/run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml similarity index 71% rename from run/spark-3.3.1/config/tpcds/wp2_resilience.yaml rename to run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml index 9ed97a4c..86f38527 100644 --- a/run/spark-3.3.1/config/tpcds/wp2_resilience.yaml +++ b/run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml @@ -3,25 +3,6 @@ version: 1 id: wp2_resilience phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: diff --git a/run/spark-3.3.1/config/tpcds/wp2_resilience-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp2_resilience-hudi-0.12.2.yaml new file mode 100644 index 00000000..ff73de34 --- /dev/null +++ 
b/run/spark-3.3.1/config/tpcds/wp2_resilience-hudi-0.12.2.yaml @@ -0,0 +1,77 @@ +# Description: WP2: Resilience +--- +version: 1 +id: wp2_resilience +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build + replace_regex: + - pattern: '(?i)varchar\(.*\)|char\(.*\)' + replacement: 'string' +- id: single_user_1 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_1 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_2 + sessions: + - tasks: + - template_id: single_user +- id: optimize_1 + sessions: + - tasks: + - template_id: optimize_hudi +- id: single_user_2o + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_2 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_3 + sessions: + - tasks: + - template_id: single_user +- id: optimize_2 + sessions: + - tasks: + - template_id: optimize_hudi +- id: single_user_3o + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_3 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_4 + sessions: + - tasks: + - template_id: single_user +- id: optimize_3 + sessions: + - tasks: + - template_id: optimize_hudi +- id: single_user_4o + sessions: + - tasks: + - template_id: single_user diff --git a/run/spark-3.3.1/config/tpcds/wp2_resilience-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp2_resilience-iceberg-1.1.0.yaml new file mode 100644 index 00000000..974730b5 --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/wp2_resilience-iceberg-1.1.0.yaml @@ -0,0 +1,74 @@ +# Description: WP2: Resilience +--- +version: 1 +id: wp2_resilience +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build +- id: single_user_1 + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_1 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_2 + sessions: + - tasks: + - template_id: single_user +- id: optimize_1 + sessions: + - tasks: + - template_id: optimize_iceberg +- id: single_user_2o + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_2 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_3 + sessions: + - tasks: + - template_id: single_user +- id: optimize_2 + sessions: + - tasks: + - template_id: optimize_iceberg +- id: single_user_3o + sessions: + - tasks: + - template_id: single_user +- id: data_maintenance_3 + sessions: + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_4 + sessions: + - tasks: + - template_id: single_user +- id: optimize_3 + sessions: + - tasks: + - template_id: optimize_iceberg +- id: single_user_4o + sessions: + - tasks: + - 
template_id: single_user diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml similarity index 69% rename from run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml rename to run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml index 28b93990..f84b48bd 100644 --- a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml +++ b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml @@ -3,25 +3,6 @@ version: 1 id: wp3_rw_concurrency phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-hudi-0.12.2.yaml new file mode 100644 index 00000000..c5934f51 --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-hudi-0.12.2.yaml @@ -0,0 +1,61 @@ +# Description: WP3: R/W concurrency +--- +version: 1 +id: wp3_rw_concurrency +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build + replace_regex: + - pattern: '(?i)varchar\(.*\)|char\(.*\)' + replacement: 'string' +- id: single_user_1_data_maintenance_1 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_2_optimize_1 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: optimize_hudi +- id: single_user_2o_data_maintenance_2 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_3_optimize_2 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: optimize_hudi +- id: single_user_3o_data_maintenance_3 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_4_optimize_3 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: optimize_hudi diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-iceberg-1.1.0.yaml new file mode 100644 index 00000000..c0be11da --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-iceberg-1.1.0.yaml @@ -0,0 +1,58 @@ +# Description: WP3: R/W concurrency +--- +version: 1 +id: wp3_rw_concurrency +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build +- id: single_user_1_data_maintenance_1 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: data_maintenance_iceberg + - 
template_id: data_maintenance_iceberg +- id: single_user_2_optimize_1 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: optimize_iceberg +- id: single_user_2o_data_maintenance_2 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_3_optimize_2 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: optimize_iceberg +- id: single_user_3o_data_maintenance_3 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg + - template_id: data_maintenance_iceberg +- id: single_user_4_optimize_3 + sessions: + - tasks: + - template_id: single_user + - tasks: + - template_id: optimize_iceberg diff --git a/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi.yaml b/run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi-delta-2.2.0.yaml similarity index 100% rename from run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi.yaml rename to run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi-delta-2.2.0.yaml diff --git a/run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml b/run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml similarity index 80% rename from run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml rename to run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml index 0b6c186b..64027647 100644 --- a/run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml +++ b/run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml @@ -3,21 +3,6 @@ version: 1 id: wp4_time_travel phases: -- id: setup - sessions: - - tasks: - - template_id: setup -- id: setup_data_maintenance - sessions: - - tasks: - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - - template_id: setup_data_maintenance - id: init sessions: - tasks: diff --git a/run/spark-3.3.1/config/tpcds/wp4_time_travel-hudi-0.12.2.yaml b/run/spark-3.3.1/config/tpcds/wp4_time_travel-hudi-0.12.2.yaml new file mode 100644 index 00000000..b0d7c545 --- /dev/null +++ b/run/spark-3.3.1/config/tpcds/wp4_time_travel-hudi-0.12.2.yaml @@ -0,0 +1,86 @@ +# Description: WP4: Time travel +--- +version: 1 +id: wp4_time_travel +phases: +- id: init + sessions: + - tasks: + - template_id: init +- id: build + sessions: + - tasks: + - template_id: build + replace_regex: + - pattern: '(?i)varchar\(.*\)|char\(.*\)' + replacement: 'string' +- id: data_maintenance_1 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_2_0 + sessions: + - tasks: + - template_id: single_user + time_travel_phase_id: build +- id: data_maintenance_2 + sessions: + - tasks: + - template_id: data_maintenance_hudi + - template_id: data_maintenance_hudi +- id: single_user_3_1 + sessions: + - tasks: + - template_id: single_user + time_travel_phase_id: data_maintenance_1 +- id: single_user_3_0 + sessions: + - tasks: + - template_id: single_user + time_travel_phase_id: build +- id: data_maintenance_3 + sessions: + - tasks: + - template_id: 
diff --git a/run/spark-3.3.1/config/tpcds/wp4_time_travel-iceberg-1.1.0.yaml b/run/spark-3.3.1/config/tpcds/wp4_time_travel-iceberg-1.1.0.yaml
new file mode 100644
index 00000000..0e91ad7f
--- /dev/null
+++ b/run/spark-3.3.1/config/tpcds/wp4_time_travel-iceberg-1.1.0.yaml
@@ -0,0 +1,83 @@
+# Description: WP4: Time travel
+---
+version: 1
+id: wp4_time_travel
+phases:
+- id: init
+  sessions:
+  - tasks:
+    - template_id: init
+- id: build
+  sessions:
+  - tasks:
+    - template_id: build
+- id: data_maintenance_1
+  sessions:
+  - tasks:
+    - template_id: data_maintenance_iceberg
+    - template_id: data_maintenance_iceberg
+- id: single_user_2_0
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: build
+- id: data_maintenance_2
+  sessions:
+  - tasks:
+    - template_id: data_maintenance_iceberg
+    - template_id: data_maintenance_iceberg
+- id: single_user_3_1
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: data_maintenance_1
+- id: single_user_3_0
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: build
+- id: data_maintenance_3
+  sessions:
+  - tasks:
+    - template_id: data_maintenance_iceberg
+    - template_id: data_maintenance_iceberg
+- id: single_user_4_2
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: data_maintenance_2
+- id: single_user_4_1
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: data_maintenance_1
+- id: single_user_4_0
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: build
+- id: data_maintenance_4
+  sessions:
+  - tasks:
+    - template_id: data_maintenance_iceberg
+    - template_id: data_maintenance_iceberg
+- id: single_user_5_3
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: data_maintenance_3
+- id: single_user_5_2
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: data_maintenance_2
+- id: single_user_5_1
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: data_maintenance_1
+- id: single_user_5_0
+  sessions:
+  - tasks:
+    - template_id: single_user
+      time_travel_phase_id: build
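Both time-travel workloads above alternate data maintenance phases with single_user phases whose `time_travel_phase_id` points back at an earlier phase (`build`, `data_maintenance_1`, and so on). A small consistency check along these lines could catch a reference to a phase that has not run yet; this is only a sketch, not part of this change, and it assumes it sits next to the tests below in the `com.microsoft.lst_bench.input` package so that the `Workload`, `Phase`, `Session`, and `Task` types used in `ParserTest` resolve without extra imports:

```java
import java.util.HashSet;
import java.util.Set;

final class TimeTravelReferenceCheck {
  // Throws if any task time-travels to a phase that does not precede it in the workload.
  static void check(Workload workload) {
    Set<String> earlierPhases = new HashSet<>();
    for (Phase phase : workload.getPhases()) {
      for (Session session : phase.getSessions()) {
        for (Task task : session.getTasks()) {
          String target = task.getTimeTravelPhaseId();
          if (target != null && !earlierPhases.contains(target)) {
            throw new IllegalStateException(
                "Phase " + phase.getId() + " time-travels to unknown phase " + target);
          }
        }
      }
      earlierPhases.add(phase.getId());
    }
  }
}
```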
diff --git a/src/test/java/com/microsoft/lst_bench/input/ParserTest.java b/src/test/java/com/microsoft/lst_bench/input/ParserTest.java
index 9926c33d..bdd818de 100644
--- a/src/test/java/com/microsoft/lst_bench/input/ParserTest.java
+++ b/src/test/java/com/microsoft/lst_bench/input/ParserTest.java
@@ -175,9 +175,9 @@ public void testParseTaskLibrary() throws IOException {
 
   @Test
   public void testParseW0Delta() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-delta.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-delta-2.2.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
-    Assertions.assertEquals("w0_tpcds_delta", workload.getId());
+    Assertions.assertEquals("w0_tpcds", workload.getId());
     Assertions.assertEquals(9, workload.getPhases().size());
     for (Phase phase : workload.getPhases()) {
       switch (phase.getId()) {
@@ -235,9 +235,9 @@ public void testParseW0Delta() throws IOException {
 
   @Test
   public void testParseW0Hudi() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-hudi.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-hudi-0.12.2.yaml");
     Assertions.assertEquals(1, workload.getVersion());
-    Assertions.assertEquals("w0_tpcds_hudi", workload.getId());
+    Assertions.assertEquals("w0_tpcds", workload.getId());
     Assertions.assertEquals(9, workload.getPhases().size());
     for (Phase phase : workload.getPhases()) {
       switch (phase.getId()) {
@@ -310,9 +310,9 @@ public void testParseW0Hudi() throws IOException {
 
   @Test
   public void testParseW0Iceberg() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-iceberg.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "w0_tpcds-iceberg-1.1.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
-    Assertions.assertEquals("w0_tpcds_iceberg", workload.getId());
+    Assertions.assertEquals("w0_tpcds", workload.getId());
     Assertions.assertEquals(9, workload.getPhases().size());
     for (Phase phase : workload.getPhases()) {
       switch (phase.getId()) {
@@ -370,22 +370,20 @@ public void testParseW0Iceberg() throws IOException {
 
   @Test
   public void testParseWP1Longevity() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp1_longevity.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp1_longevity-delta-2.2.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
     Assertions.assertEquals("wp1_longevity", workload.getId());
-    Assertions.assertEquals(15, workload.getPhases().size());
+    Assertions.assertEquals(13, workload.getPhases().size());
   }
 
   @Test
   public void testParseWP2Resilience() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp2_resilience.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp2_resilience-delta-2.2.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
     Assertions.assertEquals("wp2_resilience", workload.getId());
-    Assertions.assertEquals(17, workload.getPhases().size());
+    Assertions.assertEquals(15, workload.getPhases().size());
     for (Phase phase : workload.getPhases()) {
       switch (phase.getId()) {
-        case "setup":
-        case "setup_data_maintenance":
         case "init":
         case "build":
         case "single_user_1":
@@ -411,10 +409,10 @@ public void testParseWP2Resilience() throws IOException {
 
   @Test
   public void testParseWP3RWConcurrency() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency-delta-2.2.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
     Assertions.assertEquals("wp3_rw_concurrency", workload.getId());
-    Assertions.assertEquals(10, workload.getPhases().size());
+    Assertions.assertEquals(8, workload.getPhases().size());
     for (Phase phase : workload.getPhases()) {
       switch (phase.getId()) {
         case "single_user_1_data_maintenance_1":
@@ -458,8 +456,6 @@ public void testParseWP3RWConcurrency() throws IOException {
             Assertions.assertNull(taskO.getTimeTravelPhaseId());
           }
           break;
-        case "setup":
-        case "setup_data_maintenance":
         case "init":
         case "build":
         case "single_user_2o_data_maintenance_2":
@@ -476,7 +472,8 @@ public void testParseWP3RWConcurrency() throws IOException {
 
   @Test
   public void testParseWP3RWConcurrencyMulti() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency_multi.yaml");
+    Workload workload =
+        FileParser.loadWorkload(TPCDS_PATH + "wp3_rw_concurrency_multi-delta-2.2.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
     Assertions.assertEquals("wp3_rw_concurrency_multi", workload.getId());
     Assertions.assertEquals(10, workload.getPhases().size());
@@ -518,10 +515,10 @@ public void testParseWP3RWConcurrencyMulti() throws IOException {
 
   @Test
   public void testParseWP4TimeTravel() throws IOException {
-    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp4_time_travel.yaml");
+    Workload workload = FileParser.loadWorkload(TPCDS_PATH + "wp4_time_travel-delta-2.2.0.yaml");
     Assertions.assertEquals(1, workload.getVersion());
     Assertions.assertEquals("wp4_time_travel", workload.getId());
-    Assertions.assertEquals(18, workload.getPhases().size());
+    Assertions.assertEquals(16, workload.getPhases().size());
     for (Phase phase : workload.getPhases()) {
       switch (phase.getId()) {
         case "single_user_2_0":
@@ -546,15 +543,6 @@ public void testParseWP4TimeTravel() throws IOException {
             Assertions.assertNotNull(task.getTimeTravelPhaseId());
           }
           break;
-        case "setup_data_maintenance":
-          {
-            List<Session> sessions = phase.getSessions();
-            Assertions.assertEquals(1, sessions.size());
-            List<Task> tasks = sessions.get(0).getTasks();
-            Assertions.assertEquals(8, tasks.size());
-          }
-          break;
-        case "setup":
         case "init":
         case "build":
         case "data_maintenance_1":
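The parser tests above only exercise the Delta variants of the renamed workloads. A sketch of how the new Hudi and Iceberg WP3 files could get the same coverage, assuming the method lives next to the tests above in ParserTest (reusing its TPCDS_PATH constant and existing imports, plus JUnit's @ParameterizedTest/@ValueSource as used in ValidationTest below):

```java
  @ParameterizedTest
  @ValueSource(
      strings = {"wp3_rw_concurrency-hudi-0.12.2.yaml", "wp3_rw_concurrency-iceberg-1.1.0.yaml"})
  public void testParseWP3RWConcurrencyVariants(String fileName) throws IOException {
    // Both variants keep the workload id and the same eight-phase structure as the Delta file.
    Workload workload = FileParser.loadWorkload(TPCDS_PATH + fileName);
    Assertions.assertEquals(1, workload.getVersion());
    Assertions.assertEquals("wp3_rw_concurrency", workload.getId());
    Assertions.assertEquals(8, workload.getPhases().size());
  }
```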
diff --git a/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java b/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java
index a4c463c8..d25b9adf 100644
--- a/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java
+++ b/src/test/java/com/microsoft/lst_bench/input/ValidationTest.java
@@ -186,14 +186,14 @@ private void testValidationLibrary(String libraryPath) throws IOException {
   @EnabledOnOs({OS.LINUX, OS.MAC})
   @ValueSource(
       strings = {
-        "run/spark-3.3.1/config/tpcds/w0_tpcds-delta.yaml",
-        "run/spark-3.3.1/config/tpcds/w0_tpcds-hudi.yaml",
-        "run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg.yaml",
-        "run/spark-3.3.1/config/tpcds/wp1_longevity.yaml",
-        "run/spark-3.3.1/config/tpcds/wp2_resilience.yaml",
-        "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency.yaml",
-        "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi.yaml",
-        "run/spark-3.3.1/config/tpcds/wp4_time_travel.yaml",
+        "run/spark-3.3.1/config/tpcds/w0_tpcds-delta-2.2.0.yaml",
+        "run/spark-3.3.1/config/tpcds/w0_tpcds-hudi-0.12.2.yaml",
+        "run/spark-3.3.1/config/tpcds/w0_tpcds-iceberg-1.1.0.yaml",
+        "run/spark-3.3.1/config/tpcds/wp1_longevity-delta-2.2.0.yaml",
+        "run/spark-3.3.1/config/tpcds/wp2_resilience-delta-2.2.0.yaml",
+        "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency-delta-2.2.0.yaml",
+        "run/spark-3.3.1/config/tpcds/wp3_rw_concurrency_multi-delta-2.2.0.yaml",
+        "run/spark-3.3.1/config/tpcds/wp4_time_travel-delta-2.2.0.yaml",
         "run/trino-420/config/tpcds/w0_tpcds.yaml",
         "run/trino-420/config/tpcds/wp1_longevity.yaml",
         "run/trino-420/config/tpcds/wp2_resilience.yaml",
@@ -210,14 +210,14 @@ public void testValidationWorkloadUnix(String workloadFilePath) throws IOExcepti
   @EnabledOnOs({OS.WINDOWS})
   @ValueSource(
       strings = {
-        "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-delta.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-hudi.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-iceberg.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\wp1_longevity.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\wp2_resilience.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency_multi.yaml",
-        "run\\spark-3.3.1\\config\\tpcds\\wp4_time_travel.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-delta-2.2.0.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-hudi-0.12.2.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\w0_tpcds-iceberg-1.1.0.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\wp1_longevity-delta-2.2.0.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\wp2_resilience-delta-2.2.0.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency-delta-2.2.0.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\wp3_rw_concurrency_multi-delta-2.2.0.yaml",
+        "run\\spark-3.3.1\\config\\tpcds\\wp4_time_travel-delta-2.2.0.yaml",
         "run\\trino-420\\config\\tpcds\\w0_tpcds.yaml",
         "run\\trino-420\\config\\tpcds\\wp1_longevity.yaml",
         "run\\trino-420\\config\\tpcds\\wp2_resilience.yaml",