Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
#!/bin/bash
#
# Slurm batch script: runs the DDL-TensorFlow MNIST example on 2 nodes,
# 4 tasks per node, launched through ddlrun.
#
# Usage: sbatch <this-script>
# Requires: scontrol, ddlrun and python on PATH inside the job.

#SBATCH --job-name="ddl_mnist"
#SBATCH --output="ddl_mnist.%j.%N.out"
#SBATCH --error="ddl_mnist.%j.%N.err"
# NOTE(review): the partition name was rendered as "batchgpu", which looks
# like a removed "batch" fused with an added "gpu" — confirm with `sinfo`.
#SBATCH --partition=gpu
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=36
#SBATCH --gres=gpu:v100:4
#SBATCH --export=ALL
#SBATCH -t 1:00:00

# Abort instead of launching with an empty or partial host list.
set -euo pipefail

# Expand the compact Slurm node list (e.g. "node[01-02]") into one hostname
# per line, then join the lines with commas: "node01,node02".
NODE_LIST=$(scontrol show hostname "$SLURM_JOB_NODELIST" | paste -sd, -)
printf '%s\n' "$NODE_LIST"

# "[example_dir]" is a placeholder: replace it with the directory that holds
# the ddl-tensorflow examples. "-mode b:4x2" matches 4 tasks/node x 2 nodes.
# NOTE(review): --mpiarg appears after the script path, so it may be handed to
# mnist-init.py instead of being parsed by ddlrun — confirm against ddlrun usage.
ddlrun --no_ddloptions -cores 18 -H "$NODE_LIST" \
  python ~/[example_dir]/ddl-tensorflow/examples/mnist/mnist-init.py \
  --ddl_options="-mode b:4x2" \
  --mpiarg "--mca btl_openib_allow_ib 1 --mca orte_base_help_aggregate 1"

...

Code Block
#!/bin/bash
#
# Slurm batch script: runs tensorflow_mnist.py on 2 nodes, 4 tasks per node,
# launched through mpirun with one slot per task on every node.
#
# Usage: sbatch <this-script>   (from the directory containing
#        tensorflow_mnist.py)
# Requires: scontrol, mpirun and python on PATH inside the job.

#SBATCH --job-name="hvd_tutorial"
#SBATCH --output="hvd_tutorial.%j.%N.out"
#SBATCH --error="hvd_tutorial.%j.%N.err"
# NOTE(review): the partition was rendered as "batch gpu" (invalid: contains a
# space), which looks like a removed "batch" next to an added "gpu" — confirm
# the partition name with `sinfo`.
#SBATCH --partition=gpu
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=36
#SBATCH --gres=gpu:v100:4
#SBATCH --export=ALL
#SBATCH -t 1:00:00

# Abort instead of launching with an empty or partial host list.
set -euo pipefail

# Slots advertised per host; follows --ntasks-per-node, defaulting to 4.
slots_per_node=${SLURM_NTASKS_PER_NODE:-4}

# Expand the compact Slurm node list into one hostname per line, append
# ":<slots>" to each, and join with commas: "node01:4,node02:4".
NODE_LIST=$(
  scontrol show hostname "$SLURM_JOB_NODELIST" \
    | sed "s/\$/:${slots_per_node}/" \
    | paste -sd, -
)
printf '%s\n' "$NODE_LIST"

# -x forwards the named environment variables to the remote ranks.
# The *_if_include settings restrict MPI TCP traffic to 192.168.0.0/16.
mpirun -np "$SLURM_NTASKS" -H "$NODE_LIST" \
  -bind-to none -map-by slot \
  -x NCCL_DEBUG=INFO -x NCCL_SOCKET_IFNAME=^lo -x LD_LIBRARY_PATH -x PATH \
  -mca pml ob1 -mca btl ^openib -mca btl_openib_verbose 1 \
  -mca btl_tcp_if_include 192.168.0.0/16 \
  -mca oob_tcp_if_include 192.168.0.0/16 \
  python tensorflow_mnist.py

...