From 79ee8417631a878a43d1a1fa5be3134a8b7e58c1 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 17 Jan 2023 09:12:08 -0800
Subject: [PATCH] Doc: Taurus (ZIH) A100 (#3611)

Document how to use the AMD Rome + A100 GPU nodes on Taurus at ZIH (TU Dresden).
---
 Docs/source/install/hpc.rst                   |  1 +
 Docs/source/install/hpc/taurus.rst            | 86 +++++++++++++++++++
 Tools/machines/taurus-zih/taurus.sbatch       | 27 ++++++
 .../taurus-zih/taurus_warpx.profile.example   | 39 +++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 Docs/source/install/hpc/taurus.rst
 create mode 100644 Tools/machines/taurus-zih/taurus.sbatch
 create mode 100644 Tools/machines/taurus-zih/taurus_warpx.profile.example

diff --git a/Docs/source/install/hpc.rst b/Docs/source/install/hpc.rst
index 643ab604c86..8a693e5a825 100644
--- a/Docs/source/install/hpc.rst
+++ b/Docs/source/install/hpc.rst
@@ -37,6 +37,7 @@ HPC Systems
    hpc/ookami
    hpc/lxplus
    hpc/lumi
+   hpc/taurus
 
 .. tip::
diff --git a/Docs/source/install/hpc/taurus.rst b/Docs/source/install/hpc/taurus.rst
new file mode 100644
index 00000000000..49d883efb15
--- /dev/null
+++ b/Docs/source/install/hpc/taurus.rst
@@ -0,0 +1,86 @@
+.. _building-taurus:
+
+Taurus (ZIH)
+============
+
+The `Taurus cluster `_ is located at `ZIH (TU Dresden) `__.
+
+The cluster has multiple partitions; this section describes how to use the `AMD Rome CPUs + NVIDIA A100 `__ nodes.
+
+Introduction
+------------
+
+If you are new to this system, **please see the following resources**:
+
+* `ZIH user guide `__
+* Batch system: `Slurm `__
+* Jupyter service: Missing?
+* `Production directories `__:
+
+  * ``$PSCRATCH``: per-user production directory, purged every 30 days (TB)
+  * ``/global/cscratch1/sd/m3239``: shared production directory for users in the project ``m3239``, purged every 30 days (50TB)
+  * ``/global/cfs/cdirs/m3239/``: community file system for users in the project ``m3239`` (100TB)
+
+
+Installation
+------------
+
+Use the following command to download the WarpX source code:
+
+.. code-block:: bash
+
+   git clone https://github.com/ECP-WarpX/WarpX.git $HOME/src/warpx
+
+We use the following modules and environments on the system (``$HOME/taurus_warpx.profile``).
+
+.. literalinclude:: ../../../../Tools/machines/taurus-zih/taurus_warpx.profile.example
+   :language: bash
+   :caption: You can copy this file from ``Tools/machines/taurus-zih/taurus_warpx.profile.example``.
+
+We recommend storing the above lines in a file, such as ``$HOME/taurus_warpx.profile``, and loading it into your shell after a login:
+
+.. code-block:: bash
+
+   source $HOME/taurus_warpx.profile
+
+Then, ``cd`` into the directory ``$HOME/src/warpx`` and use the following commands to compile:
+
+.. code-block:: bash
+
+   cd $HOME/src/warpx
+   rm -rf build
+
+   cmake -S . -B build -DWarpX_DIMS=3 -DWarpX_COMPUTE=CUDA
+   cmake --build build -j 16
+
+The general :ref:`cmake compile-time options ` apply as usual.
+
+
+.. _running-cpp-taurus:
+
+Running
+-------
+
+.. _running-cpp-taurus-A100-GPUs:
+
+A100 GPUs (40 GB)
+^^^^^^^^^^^^^^^^^
+
+The ``alpha`` partition has 34 nodes, each with 8 x NVIDIA A100-SXM4 Tensor Core GPUs and 2 x AMD EPYC 7352 CPUs (24 cores each) @ 2.3 GHz (multithreading disabled).
+
+The batch script below can be used to run a WarpX simulation on multiple nodes (change ``-N`` accordingly).
+Replace descriptions between chevrons ``<>`` with relevant values, for instance ``<input file>`` could be ``plasma_mirror_inputs``.
+Note that we run one MPI rank per GPU.
+
+
+.. literalinclude:: ../../../../Tools/machines/taurus-zih/taurus.sbatch
+   :language: bash
+   :caption: You can copy this file from ``Tools/machines/taurus-zih/taurus.sbatch``.
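+
+As an untested sketch (the values below are inferred from the node layout described above, not taken from a validated configuration), a full-node request that runs one MPI rank on each of the eight GPUs could swap the single-GPU resource lines for something like:
+
+.. code-block:: bash
+
+   #SBATCH -N 2                  # number of nodes
+   #SBATCH --ntasks-per-node=8   # one MPI rank per GPU
+   #SBATCH --cpus-per-task=6     # 48 CPU cores per node / 8 GPUs
+   #SBATCH --gres=gpu:8          # all eight A100s of a node
+   #SBATCH --gpu-bind=single:1   # bind each rank to one GPU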
+
+To run a simulation, copy the batch script above to a file ``taurus.sbatch`` and run
+
+.. code-block:: bash
+
+   sbatch taurus.sbatch
+
+to submit the job.
diff --git a/Tools/machines/taurus-zih/taurus.sbatch b/Tools/machines/taurus-zih/taurus.sbatch
new file mode 100644
index 00000000000..b50bd627361
--- /dev/null
+++ b/Tools/machines/taurus-zih/taurus.sbatch
@@ -0,0 +1,27 @@
+#!/bin/bash -l
+
+# Copyright 2023 Axel Huebl, Thomas Miethlinger
+#
+# This file is part of WarpX.
+#
+# License: BSD-3-Clause-LBNL
+
+#SBATCH -t 00:10:00
+#SBATCH -N 1
+#SBATCH -J WarpX
+#SBATCH -p alpha
+#SBATCH --exclusive
+#SBATCH --cpus-per-task=6
+#SBATCH --mem-per-cpu=2048
+#SBATCH --gres=gpu:1
+#SBATCH --gpu-bind=single:1
+#SBATCH -o WarpX.o%j
+#SBATCH -e WarpX.e%j
+
+# executable & inputs file or python interpreter & PICMI script here
+EXE=./warpx
+INPUTS=inputs_small
+
+# run
+srun ${EXE} ${INPUTS} \
+  > output.txt
diff --git a/Tools/machines/taurus-zih/taurus_warpx.profile.example b/Tools/machines/taurus-zih/taurus_warpx.profile.example
new file mode 100644
index 00000000000..f564f696c4a
--- /dev/null
+++ b/Tools/machines/taurus-zih/taurus_warpx.profile.example
@@ -0,0 +1,39 @@
+# please set your project account
+#export proj=""  # change me
+
+# required dependencies
+module load modenv/hiera
+module load foss/2021b
+module load CUDA/11.8.0
+module load CMake/3.22.1
+
+# optional: for QED support with detailed tables
+#module load Boost  # TODO
+
+# optional: for openPMD and PSATD+RZ support
+module load HDF5/1.13.1
+
+# optional: for Python bindings or libEnsemble
+#module load python  # TODO
+#
+#if [ -d "$HOME/sw/taurus/venvs/warpx" ]
+#then
+#  source $HOME/sw/taurus/venvs/warpx/bin/activate
+#fi
+
+# an alias to request an interactive batch node for two hours
+#   for parallel execution, start on the batch node: srun <command>
+alias getNode="salloc --time=2:00:00 -N1 -n1 --cpus-per-task=6 --mem-per-cpu=2048 --gres=gpu:1 --gpu-bind=single:1 -p alpha-interactive --pty bash"
+# an alias to run a command on a batch node for up to two hours
+#   usage: runNode <command>
+alias runNode="srun --time=2:00:00 -N1 -n1 --cpus-per-task=6 --mem-per-cpu=2048 --gres=gpu:1 --gpu-bind=single:1 -p alpha-interactive"
+
+# optimize CUDA compilation for A100
+export AMREX_CUDA_ARCH=8.0
+
+# compiler environment hints
+#export CC=$(which gcc)
+#export CXX=$(which g++)
+#export FC=$(which gfortran)
+#export CUDACXX=$(which nvcc)
+#export CUDAHOSTCXX=${CXX}