
AthenaPK scaling instructions


Prerequisites

  • Assumes a Power9 node with 4x NVIDIA V100 GPUs
  • Recommended environment: Spectrum MPI and a GCC host compiler (a possible module setup is sketched below)
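The exact environment setup differs between systems. As a rough sketch (module names and versions are assumptions and must be adapted to the local software stack), loading the recommended toolchain could look like this:

# hypothetical module set for a Power9 + V100 system; adjust to the local stack
module load gcc cuda spectrum-mpi cmake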
Building on a generic Power9 node

# get source
git clone https://gitlab.com/theias/hpc/jmstone/athena-parthenon/athenapk.git athenaPK
cd athenaPK

# change to branch for scaling test
git checkout pgrete/pack-in-one

# get submodules (mainly Kokkos and Parthenon)
git submodule init
git submodule update

# Configure and build, reusing the Summit machine file (same architecture)
mkdir build-cuda-mpi && cd build-cuda-mpi
cmake -DMACHINE_CFG=$(pwd)/../external/parthenon/cmake/machinecfg/Summit.cmake ..
make -j8 athenaPK
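
Optionally, run a quick single-rank smoke test from the build directory before scaling out. The line below is a sketch (the jsrun resource-set options are assumptions and depend on the node configuration):

# hypothetical smoke test: one rank, one GPU, a single cycle
jsrun -n 1 -a 1 -g 1 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/time/nlim=1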

Building on RZAnsel

# get source
git clone https://gitlab.com/theias/hpc/jmstone/athena-parthenon/athenapk.git athenaPK
cd athenaPK

# change to branch for scaling test
git checkout pgrete/pack-in-one

# get submodules (mainly Kokkos and Parthenon)
git submodule init
git submodule update

cmake -S. -B build -DCMAKE_TOOLCHAIN_FILE=$(pwd)/external/parthenon/cmake/machinecfg/RZAnsel.cmake
cmake --build build
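
As above, a short single-rank check can be run before the scaling tests. The line below is a sketch that mirrors the lrun options used by the Python script further down (the binary location and input path are assumptions based on this build layout):

# hypothetical smoke test on RZAnsel: one node, one task, one GPU
lrun -N 1 -T 1 -g 1 ./build/src/athenaPK -i inputs/advection_3d.in parthenon/time/nlim=1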

Scaling instructions

Static, uniform mesh scaling (basic instructions - don't use; see the Python script below instead)

  • For static meshes we'll use a workload of 256^3 cells per GPU
  • Adjust the launch command as needed (e.g., use the -M "-gpu" option of jsrun instead of the MY_SPECTRUM_OPTIONS environment variable); see the sketch after this list
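For example, with jsrun the first single-GPU run below could be launched roughly like this (a sketch; the resource-set options are assumptions and need to match the node layout):

# hypothetical jsrun launch with CUDA-aware MPI enabled via -M "-gpu" (shorthand for --smpiargs)
jsrun -n 1 -a 1 -g 1 -M "-gpu" ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=256 parthenon/meshblock/nx2=256 parthenon/meshblock/nx3=256 parthenon/time/nlim=10 parthenon/mesh/nx1=256 parthenon/mesh/nx2=256 parthenon/mesh/nx3=256 parthenon/mesh/refinement=none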
# enable CUDA-aware MPI
export MY_SPECTRUM_OPTIONS="--gpu"
# make Kokkos pick GPUs round robin
export KOKKOS_NUM_DEVICES=4

cd build-cuda-mpi

# mesh dimensions
export MB=256
export MX=256
export MY=256
export MZ=256


ibrun -n 1 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 2.2e8 zone-cycles/wsec_step

export MX=512
ibrun -n 2 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 4.4e8 zone-cycles/wsec_step

export MY=512
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 8.6e8 zone-cycles/wsec_step

# Test with overdecomposition
export MB=128
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 9.5e8 zone-cycles/wsec_step

# And much more overdecomposition
export MB=32
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 2.2e8 zone-cycles/wsec_step

# And now with process<->GPU overdecomposition (requires MPS): using 32 processes on a single host for 4 GPUs
ibrun -n 32 ./src/athenaPK -i ../inputs/advection_3d.in parthenon/meshblock/nx1=$MB parthenon/meshblock/nx2=$MB parthenon/meshblock/nx3=$MB parthenon/time/nlim=10 parthenon/mesh/nx1=$MX parthenon/mesh/nx2=$MY parthenon/mesh/nx3=$MZ parthenon/mesh/refinement=none
# should be about 3.2e8 zone-cycles/wsec_step
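
Each of the quoted rates comes from the performance summary printed at the end of a run. Assuming the output is captured in a log file, the relevant line can be pulled out along these lines (the exact label, "zone-cycles", is an assumption and should be checked against the actual output):

# hypothetical post-processing: capture the run output and grep the performance summary
ibrun -n 4 ./src/athenaPK -i ../inputs/advection_3d.in ... | tee run.log
grep -i "zone-cycles" run.log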

Python script for weak scaling

# default
mb = 256  # MeshBlock size, used for x,y,z
mx = 256  # x Mesh size 
my = 256  # y Mesh size 
mz = 256  # z Mesh size 
nlim = 10  # max number of cycles for sim
refinement = 'none' # static, uniform mesh
nodes = 1 # number of nodes to be used
max_nodes = 2048 # scale up to this number of nodes

cmd_str = " ./src/athenaPK "

input_str = " -i ../inputs/advection_3d.in "

param_str = lambda: f' parthenon/meshblock/nx1={mb:d} parthenon/meshblock/nx2={mb:d} parthenon/meshblock/nx3={mb:d} parthenon/time/nlim={nlim:d} parthenon/mesh/nx1={mx:d} parthenon/mesh/nx2={my:d} parthenon/mesh/nx3={mz:d} parthenon/mesh/refinement={refinement:s} '

machine = 'RZAnsel'

if machine == 'Summit_4_GPUs_per_node':
  print("Configuration for Summit using 4 GPUs per node to mirror Sierra")
  print("############## ENVIRONMENT ###############")
  print("export KOKKOS_NUM_DEVICES=2")
  tasks_per_gpu = 1
  init_mx = 512
  init_my = 512
  init_mz = 256
  gpus_per_node = 4
  launch_str = lambda: f"jsrun --nrs {2*nodes} --tasks_per_rs {2*tasks_per_gpu} --cpu_per_rs 21 --gpu_per_rs 3 --rs_per_host 2 --smpiargs=-gpu"

elif machine == 'RZAnsel':
  print("Configuration for RZAnsel using 4 GPUs per node")
  print("############## ENVIRONMENT ###############")
  print("export KOKKOS_NUM_DEVICES=1")
  init_mx = 512
  init_my = 512
  init_mz = 256
  gpus_per_node = 4
  launch_str = lambda: f"lrun -N {nodes} -T {gpus_per_node*tasks_per_gpu} -g 1 -M \"-gpu\""


for tasks_per_gpu in [1]:
  for mb in [128, 256]:
    mx = init_mx
    my = init_my
    mz = init_mz
    nodes = 1

    # ensure that there's at least one meshblock per task
    if tasks_per_gpu*gpus_per_node > (init_mx/mb)*(init_my/mb)*(init_mz/mb):
      continue
    print("##########################################")
    print(f"\n# Weak static uniform grid scaling with {mb}^3 meshblocks (256^3 cells per device). MPS oversub. = {tasks_per_gpu}.")
    log_str = lambda: f' |tee weak_static.out.nodes_{nodes}-mb_{mb}-mps_{tasks_per_gpu}\n' 
    while True:
      print(launch_str() + cmd_str + input_str + param_str() + log_str())
      mx *= 2
      nodes *= 2
      if nodes > max_nodes:
        break
      print(launch_str() + cmd_str + input_str + param_str() + log_str())
      my *= 2
      nodes *= 2
      if nodes > max_nodes:
        break
      print(launch_str() + cmd_str + input_str + param_str() + log_str())
      mz *= 2
      nodes *= 2
      if nodes > max_nodes:
        break
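
The script only prints the launch commands (plus a few informational lines); one way to use it is to redirect its output into a shell script that can then be reviewed and submitted. The file names here are placeholders:

# hypothetical usage: generate the run commands into a shell script, then review/edit it before running
python3 gen_weak_scaling.py > weak_scaling_runs.sh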

To be continued...
