From 733a9651e90be66b1165a17fcd90365b87f973a0 Mon Sep 17 00:00:00 2001 From: sfux <samuel.fux@id.ethz.ch> Date: Thu, 10 Nov 2022 07:24:11 +0000 Subject: [PATCH] Update start_vscode.sh --- start_vscode.sh | 74 +++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/start_vscode.sh b/start_vscode.sh index be32a1c..9e609f2 100755 --- a/start_vscode.sh +++ b/start_vscode.sh @@ -6,12 +6,13 @@ # connect it with a local browser to it # # # # Main author : Samuel Fux # -# Contributions : Andreas Lugmayr, Mike Boss # +# Contributions : Andreas Lugmayr, Mike Boss, Nadia Marounina # # Date : October 2021 # # Location : ETH Zurich # -# Version : 0.1 # +# Version : 0.2 # # Change history : # # # +# 24.10.2022 Added Slurm support # # 19.05.2022 JOBID is now saved to reconnect_info file # # 28.10.2021 Initial version of the script based on Jupyter script # # # @@ -22,7 +23,7 @@ ############################################################################### # Version -VSC_VERSION="0.1" +VSC_VERSION="0.2" # Script directory VSC_SCRIPTDIR=$(pwd) @@ -59,6 +60,9 @@ VSC_WAITING_INTERVAL=60 # SSH key location default : no default VSC_SSH_KEY_PATH="" +# Batch system : Slurm +VSC_BATCH_SYSTEM="SLURM" + ############################################################################### # Usage instructions # ############################################################################### @@ -75,7 +79,7 @@ Options: -n | --numcores NUM_CPU Number of CPU cores to be used on the cluster -W | --runtime RUN_TIME Run time limit for the code-server in hours and minutes HH:MM -m | --memory MEM_PER_CORE Memory limit in MB per core - -b | --batchsys BATCH_SYS Batch system to use for the submission of this job to Euler (LSF/SLURM) + -b | --batchsys BATCH_SYS Batch system to use (LSF or SLURM) Optional arguments: @@ -84,15 +88,14 @@ Optional arguments: -h | --help Display help for this script and quit -i | --interval INTERVAL Time interval for checking if the job on the cluster already started -k | --key SSH_KEY_PATH Path to SSH key with non-standard name - -s | --shareholdergr SHAREHOLDERGR Shareholder group, mandatory when requesting GPUs with SLURM batch system -v | --version Display version of the script and exit Examples: - ./start_vscode.sh -u sfux -n 4 -W 04:00 -m 2048 + ./start_vscode.sh -u sfux -b SLURM -n 4 -W 04:00 -m 2048 - ./start_vscode.sh --username sfux --numcores 2 --runtime 01:30 --memory 2048 + ./start_vscode.sh --username sfux --batchsys SLURM --numcores 2 --runtime 01:30 --memory 2048 ./start_vscode.sh -c $HOME/.vsc_config @@ -105,6 +108,7 @@ VSC_RUN_TIME="01:00" # Run time limit for the code-server in hours and mi VSC_MEM_PER_CPU_CORE=1024 # Memory limit in MB per core VSC_WAITING_INTERVAL=60 # Time interval to check if the job on the cluster already started VSC_SSH_KEY_PATH="" # Path to SSH key with non-standard name +VSC_BATCH_SYSTEM="SLURM" # Batch system to use (SLURM or LSF) EOF exit 1 @@ -169,11 +173,6 @@ do shift shift ;; - -s|--shareholdergr) - SHAREHOLDERGR=$2 - shift - shift - ;; *) echo -e "Warning: ignoring unknown option $1 \n" shift @@ -281,17 +280,19 @@ else echo -e "Using SSH key $VSC_SSH_KEY_PATH" fi -#check in the case where GPUs are requested with SLURM whether the shareholder group is provided -if [[ "$VSC_NUM_GPU" > "0" && $BATCH_SYS = "SLURM" && $SHAREHOLDERGR = "" ]]; then - echo -e "Please provide the shareholder group if requesting GPUs with SLURM" - display_help -fi - -#if batch system has not been provided, default to LSF: -if [ -z "$BATCH_SYS" ]; then - BATCH_SYS="LSF" -fi +# check if VSC_BATCH_SYSTEM is set to SLURM or LSF +case $VSC_BATCH_SYSTEM in + LSF) + echo -e "Using LSF batch system" + ;; + SLURM) + echo -e "Using Slurm batch system" + ;; + *) + echo -e "Error: Unknown batch system $VSC_BATCH_SYSTEM. Please either specify LSF or SLURM as batch system" + ;; +esac # put together string for SSH options VSC_SSH_OPT="$VSC_SKPATH $VSC_USERNAME@$VSC_HOSTNAME" @@ -324,7 +325,7 @@ ENDSSH # run the code-server job on Euler and save the ip of the compute node in the file vscip in the home directory of the user on Euler echo -e "Connecting to $VSC_HOSTNAME to start the code-server in a $BATCH_SYS batch job" case $BATCH_SYS in - "LSF" ) + "LSF") VSC_BJOB_OUT=$(ssh $VSC_SSH_OPT bsub -n $VSC_NUM_CPU -W $VSC_RUN_TIME -R "rusage[mem=$VSC_MEM_PER_CPU_CORE]" $VSC_SNUM_GPU<<ENDBSUB module load $VSC_MODULE_COMMAND export XDG_RUNTIME_DIR="\$HOME/vsc_runtime" @@ -332,16 +333,14 @@ case $BATCH_SYS in echo "Remote IP:\$VSC_IP_REMOTE" >> /cluster/home/$VSC_USERNAME/vscip code-server --bind-addr=\${VSC_IP_REMOTE}:8899 ENDBSUB - ) - ;; - - "SLURM" ) +) ;; + "SLURM") VSC_RUN_TIME="${VSC_RUN_TIME}":00" " if [ "$VSC_NUM_GPU" -gt "0" ]; then - VSC_SNUM_GPU="-G $VSC_NUM_GPU -A $SHAREHOLDERGR" + VSC_SNUM_GPU="-G $VSC_NUM_GPU" fi @@ -355,18 +354,12 @@ code-server --bind-addr=\${VSC_IP_REMOTE}:8899 ENDBSUB ) ;; - *) - echo - echo "Please specify either LSF or SLURM as your choice of the batch system" - echo - display_help - ;; -esac - - +esac -VSC_BJOB_ID=$(echo $VSC_BJOB_OUT | awk '/is submitted/{print substr($2, 2, length($2)-2);}') +# TODO: get jobid for both cases (LSF/Slurm) +# store jobid in a variable +# VSC_BJOB_ID=$(echo $VSC_BJOB_OUT | awk '/is submitted/{print substr($2, 2, length($2)-2);}') # wait until batch job has started, poll every $VSC_WAITING_INTERVAL seconds to check if /cluster/home/$VSC_USERNAME/vscip exists # once the file exists and is not empty the batch job has started @@ -409,6 +402,10 @@ VSC_LOCAL_PORT=$((3 * 2**14 + RANDOM % 2**14)) echo -e "Using local port: $VSC_LOCAL_PORT" # write reconnect_info file +# +# FIXME: add jobid +# BJOB ID : $VSC_BJOB_ID + cat <<EOF > $VSC_SCRIPTDIR/reconnect_info Restart file Remote IP address : $VSC_REMOTE_IP @@ -416,7 +413,6 @@ Remote port : $VSC_REMOTE_PORT Local port : $VSC_LOCAL_PORT SSH tunnel : ssh $VSC_SSH_OPT -L $VSC_LOCAL_PORT:$VSC_REMOTE_IP:$VSC_REMOTE_PORT -N & URL : http://localhost:$VSC_LOCAL_PORT -BJOB ID : $VSC_BJOB_ID EOF # setup SSH tunnel from local computer to compute node via login node -- GitLab