#!/usr/bin/env bash
#SBATCH -N 2                # nodes requested
#SBATCH -n 2                # tasks requested
#SBATCH -p gpu_lowpriority	# use the preemptible GPU partition
#SBATCH -c 24               # cores requested
#SBATCH --gres=gpu:v100:2	# select 2 V100 GPUs
#SBATCH -o outfile-%j       # send stdout to outfile
#SBATCH -e errfile-%j       # send stderr to errfile

SUBNET=10.3.19

#######################################
# Print the name of the interface that owns an address on a subnet.
# Arguments: $1 - subnet prefix, matched as a literal string (dots are not
#                 regex wildcards)
# Inputs:    `ifconfig`-style text on stdin
# Outputs:   the text before the first ':' on the line that precedes the
#            first matching line (the "eth0: flags=..." header in that
#            layout); prints nothing when no line matches
#######################################
iface_for_subnet() {
    grep -F -B1 -- "$1" | head -n1 | awk -F':' '{ print $1 }'
}

# Tell the backend to use the 10GbE.
# The variable has to be exported: a plain assignment stays private to this
# shell and is never placed in the environment of the processes started below.
NCCL_SOCKET_IFNAME=$(ifconfig | iface_for_subnet "$SUBNET")
export NCCL_SOCKET_IFNAME

if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then
    printf 'warning: no interface found on subnet %s; NCCL_SOCKET_IFNAME is empty\n' \
        "$SUBNET" >&2
fi

# Figure out which node is going to be king.
# Rank 0 publishes its rendezvous URL in a file named after the job ID
# (relative to the current directory); every other rank polls for that file
# and reads the URL back from it.
# NOTE(review): SLURM_PROCID only differs between processes when this script
# is started once per task (e.g. through srun) - confirm that is how the job
# is launched, and that the working directory is visible from every node.
# NOTE(review): the file name depends on the job ID only, so a requeued job
# could find a stale file from its previous run - confirm whether cleanup is
# needed on this partition.

#######################################
# Print the first address found on a subnet in `ifconfig`-style text.
# Arguments: $1 - subnet prefix, matched as a literal string
# Inputs:    `ifconfig`-style text on stdin
# Outputs:   the second whitespace-separated field of the first line that
#            contains the prefix; nothing when no line matches or the
#            prefix is empty
#######################################
first_addr_on_subnet() {
    awk -v subnet="$1" 'subnet != "" && index($0, subnet) { print $2; exit }'
}

MASTER_FILE="${SLURM_JOBID}.master"
# Upper bound, in seconds, on how long a non-zero rank waits for the file.
MASTER_WAIT_SECS="${MASTER_WAIT_SECS:-300}"

# An unset SLURM_PROCID is treated as rank 0 instead of tripping a test
# syntax error and dropping into the wait loop below.
if [[ "${SLURM_PROCID:-0}" -eq 0 ]]; then
    IP=$(ifconfig | first_addr_on_subnet "$SUBNET")
    if [[ -z "$IP" ]]; then
        printf 'warning: no address found on subnet %s\n' "$SUBNET" >&2
    fi
    printf 'Master IP address is %s\n' "$IP"
    MASTER="tcp://$IP:12345"

    # Write to a scratch name and rename into place, so a polling rank can
    # never observe the file while it is still empty.
    master_tmp="${MASTER_FILE}.tmp.$$"
    if ! printf '%s\n' "$MASTER" > "$master_tmp" \
        || ! mv -- "$master_tmp" "$MASTER_FILE"; then
        printf 'error: cannot write %s\n' "$MASTER_FILE" >&2
        exit 1
    fi
else
    waited=0
    until [[ -f "$MASTER_FILE" ]]; do
        if (( waited >= MASTER_WAIT_SECS )); then
            printf 'error: %s did not appear within %s seconds\n' \
                "$MASTER_FILE" "$MASTER_WAIT_SECS" >&2
            exit 1
        fi
        sleep 1
        waited=$(( waited + 1 ))
    done
    MASTER=$(<"$MASTER_FILE")
fi

# Start the training script, pointing it at the rendezvous URL held in MASTER.
# Abort with a clear message if MASTER is empty; otherwise --dist-url would
# swallow the next option as its value.
: "${MASTER:?MASTER is empty; the rendezvous URL was not resolved}"

# NOTE(review): no --world-size / --rank are passed here; presumably the
# training script derives them itself - confirm.
train_args=(
    --arch alexnet
    --batch-size 512
    --dist-url "$MASTER"
    --dist-backend nccl
    --multiprocessing-distributed
    /scratch4/imagenet2012/Data/CLS-LOC
)
python imagenet_distributed_training.py "${train_args[@]}"
