diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d39bdea..3944d03 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -add_subdirectory(circuit) -add_subdirectory(pennant) -add_subdirectory(snap) +# add_subdirectory(circuit) +# add_subdirectory(pennant) +# add_subdirectory(snap) add_subdirectory(08_multiple_partitions) diff --git a/experiment/frontier/sbatch_circuit.sh b/experiment/frontier/sbatch_circuit.sh index 2d2d5a8..62ef339 100644 --- a/experiment/frontier/sbatch_circuit.sh +++ b/experiment/frontier/sbatch_circuit.sh @@ -1,8 +1,6 @@ #!/bin/bash #SBATCH -A CMB103 #SBATCH --partition=batch -#SBATCH --dependency=singleton -#SBATCH --job-name=circuit_test #SBATCH --time=01:00:00 #SBATCH --mail-type=ALL #SBATCH -C nvme @@ -21,6 +19,11 @@ export GASNET_OFI_DEVICE_3=cxi0 export GASNET_OFI_DEVICE_TYPE=Node export GASNET_OFI_NUM_RECEIVE_BUFFS=32M +export GASNET_OFI_SPAWNER=mpi + +export MPICH_VERSION_DISPLAY=1 +export MPICH_ENV_DISPLAY=1 + ulimit -S -c 0 # disable core dumps slurm_flags= @@ -41,6 +44,8 @@ for n in $SLURM_JOB_NUM_NODES; do slug="${n}x${ranks_per_node}_f${freq}_r${r}" echo "Running $slug" checkpoint_dir="$SCRATCH/$experiment_name/$slug" + export MPICH_OFI_CXI_COUNTER_REPORT=5 + export MPICH_OFI_CXI_COUNTER_REPORT_FILE="counter_${slug}" set -x mkdir -p "$checkpoint_dir" srun -n $(( n * ranks_per_node )) -N $n --ntasks-per-node $ranks_per_node --cpus-per-task $(( 56 / ranks_per_node )) --gpus-per-task $(( 8 / ranks_per_node )) --cpu_bind cores $slurm_flags "$root_dir/circuit.checkpoint" -npp 5000 -wpp 20000 -l 300 -p $(( n * ranks_per_node * 10 )) -pps 10 -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 2 -ll:csize 15000 -ll:fsize 15000 -ll:zsize 15000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 10 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:prefix "$checkpoint_dir" -checkpoint:auto_steps $freq | tee out_"$slug".out @@ -57,19 +62,19 @@ done popd -if [[ ! -d no_checkpoint ]]; then mkdir no_checkpoint; fi -pushd no_checkpoint +# if [[ ! -d no_checkpoint ]]; then mkdir no_checkpoint; fi +# pushd no_checkpoint -for n in $SLURM_JOB_NUM_NODES; do - for r in 0 1 2 3 4; do - freq=0 - slug="${n}x${ranks_per_node}_f${freq}_r${r}" - echo "Running $slug" - set -x - srun -n $(( n * ranks_per_node )) -N $n --ntasks-per-node $ranks_per_node --cpus-per-task $(( 56 / ranks_per_node )) --gpus-per-task $(( 8 / ranks_per_node )) --cpu_bind cores $slurm_flags "$root_dir/circuit.checkpoint" -npp 5000 -wpp 20000 -l 300 -p $(( n * ranks_per_node * 10 )) -pps 10 -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 2 -ll:csize 15000 -ll:fsize 15000 -ll:zsize 15000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 10 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:disable | tee out_"$slug".out - # -dm:memoize -lg:parallel_replay 2 - { set +x; } 2>/dev/null - done -done +# for n in $SLURM_JOB_NUM_NODES; do +# for r in 0 1 2 3 4; do +# freq=0 +# slug="${n}x${ranks_per_node}_f${freq}_r${r}" +# echo "Running $slug" +# set -x +# srun -n $(( n * ranks_per_node )) -N $n --ntasks-per-node $ranks_per_node --cpus-per-task $(( 56 / ranks_per_node )) --gpus-per-task $(( 8 / ranks_per_node )) --cpu_bind cores $slurm_flags "$root_dir/circuit.checkpoint" -npp 5000 -wpp 20000 -l 300 -p $(( n * ranks_per_node * 10 )) -pps 10 -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 2 -ll:csize 15000 -ll:fsize 15000 -ll:zsize 15000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 10 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:disable | tee out_"$slug".out +# # -dm:memoize -lg:parallel_replay 2 +# { set +x; } 2>/dev/null +# done +# done -popd +# popd