Skip to content

Commit

Permalink
Configuration for reproducing freeze.
Browse files (browse the repository at this point in the history)
  • Loading branch information
elliottslaughter committed Jul 23, 2024
1 parent 3212294 commit 784625d
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 19 deletions.
6 changes: 3 additions & 3 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
add_subdirectory(circuit)
add_subdirectory(pennant)
add_subdirectory(snap)
# add_subdirectory(circuit)
# add_subdirectory(pennant)
# add_subdirectory(snap)
add_subdirectory(08_multiple_partitions)
37 changes: 21 additions & 16 deletions experiment/frontier/sbatch_circuit.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
#!/bin/bash
#SBATCH -A CMB103
#SBATCH --partition=batch
#SBATCH --dependency=singleton
#SBATCH --job-name=circuit_test
#SBATCH --time=01:00:00
#SBATCH --mail-type=ALL
#SBATCH -C nvme
Expand All @@ -21,6 +19,11 @@ export GASNET_OFI_DEVICE_3=cxi0
export GASNET_OFI_DEVICE_TYPE=Node
export GASNET_OFI_NUM_RECEIVE_BUFFS=32M

# NOTE(review): presumably selects MPI as the job spawner for GASNet's OFI
# conduit -- confirm against the GASNet ofi-conduit documentation.
export GASNET_OFI_SPAWNER=mpi

# NOTE(review): presumably make Cray MPICH print its version and its effective
# environment settings at startup so they land in the job output -- confirm
# against the Cray MPICH man page.
export MPICH_VERSION_DISPLAY=1
export MPICH_ENV_DISPLAY=1

ulimit -S -c 0 # disable core dumps

slurm_flags=
Expand All @@ -41,6 +44,8 @@ for n in $SLURM_JOB_NUM_NODES; do
slug="${n}x${ranks_per_node}_f${freq}_r${r}"
echo "Running $slug"
checkpoint_dir="$SCRATCH/$experiment_name/$slug"
# NOTE(review): presumably turns on Cray MPICH's CXI network counter report at
# level 5 -- confirm the level semantics against the Cray MPICH man page.
export MPICH_OFI_CXI_COUNTER_REPORT=5
# The report file value embeds $slug, so each run configuration gets its own
# distinct counter file name (or name prefix -- TODO confirm which).
export MPICH_OFI_CXI_COUNTER_REPORT_FILE="counter_${slug}"
set -x
mkdir -p "$checkpoint_dir"
srun -n $(( n * ranks_per_node )) -N $n --ntasks-per-node $ranks_per_node --cpus-per-task $(( 56 / ranks_per_node )) --gpus-per-task $(( 8 / ranks_per_node )) --cpu_bind cores $slurm_flags "$root_dir/circuit.checkpoint" -npp 5000 -wpp 20000 -l 300 -p $(( n * ranks_per_node * 10 )) -pps 10 -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 2 -ll:csize 15000 -ll:fsize 15000 -ll:zsize 15000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 10 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:prefix "$checkpoint_dir" -checkpoint:auto_steps $freq | tee out_"$slug".out
Expand All @@ -57,19 +62,19 @@ done

popd

if [[ ! -d no_checkpoint ]]; then mkdir no_checkpoint; fi
pushd no_checkpoint
# if [[ ! -d no_checkpoint ]]; then mkdir no_checkpoint; fi
# pushd no_checkpoint

for n in $SLURM_JOB_NUM_NODES; do
for r in 0 1 2 3 4; do
freq=0
slug="${n}x${ranks_per_node}_f${freq}_r${r}"
echo "Running $slug"
set -x
srun -n $(( n * ranks_per_node )) -N $n --ntasks-per-node $ranks_per_node --cpus-per-task $(( 56 / ranks_per_node )) --gpus-per-task $(( 8 / ranks_per_node )) --cpu_bind cores $slurm_flags "$root_dir/circuit.checkpoint" -npp 5000 -wpp 20000 -l 300 -p $(( n * ranks_per_node * 10 )) -pps 10 -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 2 -ll:csize 15000 -ll:fsize 15000 -ll:zsize 15000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 10 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:disable | tee out_"$slug".out
# -dm:memoize -lg:parallel_replay 2
{ set +x; } 2>/dev/null
done
done
# for n in $SLURM_JOB_NUM_NODES; do
# for r in 0 1 2 3 4; do
# freq=0
# slug="${n}x${ranks_per_node}_f${freq}_r${r}"
# echo "Running $slug"
# set -x
# srun -n $(( n * ranks_per_node )) -N $n --ntasks-per-node $ranks_per_node --cpus-per-task $(( 56 / ranks_per_node )) --gpus-per-task $(( 8 / ranks_per_node )) --cpu_bind cores $slurm_flags "$root_dir/circuit.checkpoint" -npp 5000 -wpp 20000 -l 300 -p $(( n * ranks_per_node * 10 )) -pps 10 -prune 30 -hl:sched 1024 -ll:gpu 1 -ll:io 1 -ll:util 2 -ll:bgwork 2 -ll:csize 15000 -ll:fsize 15000 -ll:zsize 15000 -ll:rsize 0 -ll:gsize 0 -lg:eager_alloc_percentage 10 -lg:no_tracing -level 3 -logfile log_"$slug"_%.log -checkpoint:disable | tee out_"$slug".out
# # -dm:memoize -lg:parallel_replay 2
# { set +x; } 2>/dev/null
# done
# done

popd
# popd

0 comments on commit 784625d

Please sign in to comment.