Skip to content
Snippets Groups Projects
Commit 0182be26 authored by sfux's avatar sfux
Browse files

Merge branch 'add_slurm' into 'master'

Add slurm

See merge request !6
parents ac1410ff 733a9651
No related branches found
No related tags found
No related merge requests found
......@@ -6,12 +6,13 @@
# connect it with a local browser to it #
# #
# Main author : Samuel Fux #
# Contributions : Andreas Lugmayr, Mike Boss #
# Contributions : Andreas Lugmayr, Mike Boss, Nadia Marounina #
# Date : October 2021 #
# Location : ETH Zurich #
# Version : 0.1 #
# Version : 0.2 #
# Change history : #
# #
# 24.10.2022 Added Slurm support #
# 19.05.2022 JOBID is now saved to reconnect_info file #
# 28.10.2021 Initial version of the script based on Jupyter script #
# #
......@@ -22,7 +23,7 @@
###############################################################################
# Version
VSC_VERSION="0.1"
VSC_VERSION="0.2"
# Script directory
VSC_SCRIPTDIR=$(pwd)
......@@ -59,6 +60,9 @@ VSC_WAITING_INTERVAL=60
# SSH key location default : no default
VSC_SSH_KEY_PATH=""
# Batch system : Slurm
VSC_BATCH_SYSTEM="SLURM"
###############################################################################
# Usage instructions #
###############################################################################
......@@ -75,6 +79,7 @@ Options:
-n | --numcores NUM_CPU Number of CPU cores to be used on the cluster
-W | --runtime RUN_TIME Run time limit for the code-server in hours and minutes HH:MM
-m | --memory MEM_PER_CORE Memory limit in MB per core
-b | --batchsys BATCH_SYS Batch system to use (LSF or SLURM)
Optional arguments:
......@@ -85,11 +90,12 @@ Optional arguments:
-k | --key SSH_KEY_PATH Path to SSH key with non-standard name
-v | --version Display version of the script and exit
Examples:
./start_vscode.sh -u sfux -n 4 -W 04:00 -m 2048
./start_vscode.sh -u sfux -b SLURM -n 4 -W 04:00 -m 2048
./start_vscode.sh --username sfux --numcores 2 --runtime 01:30 --memory 2048
./start_vscode.sh --username sfux --batchsys SLURM --numcores 2 --runtime 01:30 --memory 2048
./start_vscode.sh -c $HOME/.vsc_config
......@@ -102,6 +108,7 @@ VSC_RUN_TIME="01:00" # Run time limit for the code-server in hours and mi
VSC_MEM_PER_CPU_CORE=1024 # Memory limit in MB per core
VSC_WAITING_INTERVAL=60 # Time interval to check if the job on the cluster already started
VSC_SSH_KEY_PATH="" # Path to SSH key with non-standard name
VSC_BATCH_SYSTEM="SLURM" # Batch system to use (SLURM or LSF)
EOF
exit 1
......@@ -161,6 +168,11 @@ do
shift
shift
;;
-b|--batchsys)
# keep VSC_BATCH_SYSTEM (read by the validation case) and BATCH_SYS
# (read by the submission case) in sync, otherwise -b is silently ignored
# by the validation step
VSC_BATCH_SYSTEM=$2
BATCH_SYS=$2
shift
shift
;;
*)
echo -e "Warning: ignoring unknown option $1 \n"
shift
......@@ -268,6 +280,20 @@ else
echo -e "Using SSH key $VSC_SSH_KEY_PATH"
fi
# check if VSC_BATCH_SYSTEM is set to SLURM or LSF; abort on anything else,
# since continuing with an unknown scheduler would submit a malformed job
case "$VSC_BATCH_SYSTEM" in
LSF)
echo -e "Using LSF batch system"
;;
SLURM)
echo -e "Using Slurm batch system"
;;
*)
echo -e "Error: Unknown batch system $VSC_BATCH_SYSTEM. Please either specify LSF or SLURM as batch system"
# exit was missing here: without it the script went on to run an
# empty submission case and hung waiting for a job that never started
exit 1
;;
esac
# put together string for SSH options
VSC_SSH_OPT="$VSC_SKPATH $VSC_USERNAME@$VSC_HOSTNAME"
......@@ -297,9 +323,29 @@ ENDSSH
###############################################################################
# run the code-server job on Euler and save the ip of the compute node in the file vscip in the home directory of the user on Euler
echo -e "Connecting to $VSC_HOSTNAME to start the code-server in a batch job"
echo -e "Connecting to $VSC_HOSTNAME to start the code-server in a $BATCH_SYS batch job"
# submit the code-server job with the scheduler selected via -b/--batchsys;
# fall back to the script default when the option was not given (otherwise
# an unset BATCH_SYS matches neither arm and no job is ever submitted)
case "${BATCH_SYS:-$VSC_BATCH_SYSTEM}" in
"LSF")
VSC_BJOB_OUT=$(ssh $VSC_SSH_OPT bsub -n $VSC_NUM_CPU -W $VSC_RUN_TIME -R "rusage[mem=$VSC_MEM_PER_CPU_CORE]" $VSC_SNUM_GPU<<ENDBSUB
module load $VSC_MODULE_COMMAND
export XDG_RUNTIME_DIR="\$HOME/vsc_runtime"
VSC_IP_REMOTE="\$(hostname -i)"
echo "Remote IP:\$VSC_IP_REMOTE" >> /cluster/home/$VSC_USERNAME/vscip
code-server --bind-addr=\${VSC_IP_REMOTE}:8899
ENDBSUB
)
;;
"SLURM")
# Slurm's --time expects HH:MM:SS; append seconds to the HH:MM the user gave
# (the original line had broken quoting: VSC_RUN_TIME="${VSC_RUN_TIME}":00" ")
VSC_RUN_TIME="${VSC_RUN_TIME}:00"
# request GPUs only when the user asked for at least one
if [ "$VSC_NUM_GPU" -gt "0" ]; then
VSC_SNUM_GPU="-G $VSC_NUM_GPU"
fi
VSC_BJOB_OUT=$(ssh $VSC_SSH_OPT sbatch -n $VSC_NUM_CPU "--time=$VSC_RUN_TIME" "--mem-per-cpu=$VSC_MEM_PER_CPU_CORE" -e "error.dat" $VSC_SNUM_GPU<<ENDBSUB
#!/bin/bash
module load $VSC_MODULE_COMMAND
export XDG_RUNTIME_DIR="\$HOME/vsc_runtime"
VSC_IP_REMOTE="\$(hostname -i)"
echo "Remote IP:\$VSC_IP_REMOTE" >> /cluster/home/$VSC_USERNAME/vscip
code-server --bind-addr=\${VSC_IP_REMOTE}:8899
ENDBSUB
)
;;
esac
# NOTE(review): this awk pattern only matches LSF's "Job <id> is submitted"
# message; presumably Slurm's sbatch prints a different confirmation line,
# so no job id would be captured for SLURM submissions — TODO confirm
VSC_BJOB_ID=$(echo $VSC_BJOB_OUT | awk '/is submitted/{print substr($2, 2, length($2)-2);}')
# TODO: get jobid for both cases (LSF/Slurm)
# store jobid in a variable
# VSC_BJOB_ID=$(echo $VSC_BJOB_OUT | awk '/is submitted/{print substr($2, 2, length($2)-2);}')
# wait until batch job has started, poll every $VSC_WAITING_INTERVAL seconds to check if /cluster/home/$VSC_USERNAME/vscip exists
# once the file exists and is not empty the batch job has started
......@@ -351,6 +402,10 @@ VSC_LOCAL_PORT=$((3 * 2**14 + RANDOM % 2**14))
echo -e "Using local port: $VSC_LOCAL_PORT"
# write reconnect_info file
#
# FIXME: add jobid
# BJOB ID : $VSC_BJOB_ID
cat <<EOF > $VSC_SCRIPTDIR/reconnect_info
Restart file
Remote IP address : $VSC_REMOTE_IP
......@@ -358,7 +413,6 @@ Remote port : $VSC_REMOTE_PORT
Local port : $VSC_LOCAL_PORT
SSH tunnel : ssh $VSC_SSH_OPT -L $VSC_LOCAL_PORT:$VSC_REMOTE_IP:$VSC_REMOTE_PORT -N &
URL : http://localhost:$VSC_LOCAL_PORT
BJOB ID : $VSC_BJOB_ID
EOF
# setup SSH tunnel from local computer to compute node via login node
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment