Commit f7197d23 authored by Jarunan Panyasantisuk's avatar Jarunan Panyasantisuk
Browse files

all the scripts

parent 62c34122
# alphafold_on_euler
This project is to create a script to help users estimate the computing resources required for AlphaFold jobs and automatically output a run script ready to be submitted.
\ No newline at end of file
This project provides a script that estimates the computing resources required for AlphaFold jobs and automatically generates a run script that is ready to be submitted.
Usage:
```
./setup_alphafold_run_script.sh -f [Fasta file] -w [work directory] --max_template_date yyyy-mm-dd
```
```
[jarunanp@eu-login-20 alphafold_on_euler]$ ./setup_alphafold_run_script.sh -f ../../fastafiles/IFGSC_6mer.fasta
Reading /cluster/work/sis/cdss/jarunanp/21_12_alphafold_benchmark/fastafiles/IFGSC_6mer.fasta
Protein name: IFGSC_6mer
Number of sequences: 6
Protein type: multimer
Number of amino acids:
sum: 1246
max: 242
Estimate required resources:
Run time:
Number of CPUs:
Total CPU memory:
Number of GPUs:
Total GPU memory:
Total scratch space:
Output an LSF run script for AlphaFold2: /cluster/work/sis/cdss/jarunanp/21_12_alphafold_benchmark/scripts/alphafold_on_euler/run_alphafold.bsub
```
#!/usr/bin/bash
# LSF job script for an AlphaFold 2.1.1 multimer run (example output of
# setup_alphafold_run_script.sh for IFGSC_6mer.fasta).
#
# -n: number of cores, -W: run-time limit (hh:mm).
# The rusage values for mem and scratch are given per requested core.
#BSUB -n 12
#BSUB -W 24:00
#BSUB -R "rusage[mem=10000, scratch=10000]"
#BSUB -R "rusage[ngpus_excl_p=1] select[gpu_mtotal0>=10240]"
#BSUB -R "span[hosts=1]"
#BSUB -J alphafold
source /cluster/apps/local/env2lmod.sh
module load gcc/6.3.0 openmpi/4.0.2 alphafold/2.1.1
source /cluster/apps/nss/alphafold/venv_alphafold/bin/activate
# Define paths to the databases and the output directory.
# Results are written to the node-local scratch ($TMPDIR) and copied back
# at the end of the job.
DATA_DIR=/cluster/project/alphafold
OUTPUT_DIR=${TMPDIR}/output
# Unified memory settings
export TF_FORCE_UNIFIED_MEMORY=0
export XLA_PYTHON_CLIENT_MEM_FRACTION=1.0
python /cluster/apps/nss/alphafold/alphafold-2.1.1/run_alphafold.py \
--data_dir=$DATA_DIR \
--output_dir=$OUTPUT_DIR \
--max_template_date="2022-01-28" \
--bfd_database_path=$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--uniref90_database_path=$DATA_DIR/uniref90/uniref90.fasta \
--uniclust30_database_path=$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--mgnify_database_path=$DATA_DIR/mgnify/mgy_clusters_2018_12.fa \
--template_mmcif_dir=$DATA_DIR/pdb_mmcif/mmcif_files \
--obsolete_pdbs_path=$DATA_DIR/pdb_mmcif/obsolete.dat \
--model_preset=multimer --pdb_seqres_database_path=$DATA_DIR/pdb_seqres/pdb_seqres.txt --uniprot_database_path=$DATA_DIR/uniprot/uniprot.fasta \
--fasta_paths=/cluster/work/sis/cdss/jarunanp/21_12_alphafold_benchmark/fastafiles/IFGSC_6mer.fasta
# Copy the results from the local scratch back to the submission directory
mkdir -p output/IFGSC_6mer
rsync -av $TMPDIR/output/IFGSC_6mer ./output/IFGSC_6mer
#!/usr/bin/bash
#
# Estimate the computing resources needed for an AlphaFold job from a FASTA
# file and write an LSF run script (run_alphafold.bsub) to the work directory.
#
# Usage:
#   ./setup_alphafold_run_script.sh -f FASTA [-w WORKDIR] [--max_template_date YYYY-MM-DD]

# Defaults; overridden by the command-line options parsed below
FASTAFILE="undefined.fasta"
WORKDIR=$PWD
MAX_TEMPLATE_DATE=$(date +'%Y-%m-%d')

# Print a short usage message to stdout.
usage() {
  echo "Usage: ${0##*/} -f [Fasta file] -w [work directory] --max_template_date yyyy-mm-dd"
}

# Print an error message and the usage to stderr, then abort.
die() {
  echo "Error: $*" >&2
  usage >&2
  exit 1
}

#######################################
# Parse the command-line options.
# Globals:   FASTAFILE, fastaname, PROTEIN, WORKDIR, MAX_TEMPLATE_DATE (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   the FASTA path and the protein name on stdout
# Exits:     1 on an unknown option, a missing option value or a missing
#            FASTA file; 0 after printing the usage for -h/--help
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -f|--fastafile)
        (( $# >= 2 )) || die "option $1 requires an argument"
        [[ -f "$2" ]] || die "FASTA file not found: $2"
        # Get absolute path
        FASTAFILE=$(readlink -m -- "$2")
        # Get the protein name: file name without its last extension
        fastaname=$(basename -- "$FASTAFILE")
        PROTEIN="${fastaname%.*}"
        echo " Reading $FASTAFILE"
        echo " Protein name: $PROTEIN"
        shift 2
        ;;
      -w|--workdir)
        # Users can specify a work directory, e.g., $SCRATCH/alphafold_tests
        # Otherwise the current directory is used as the work directory
        (( $# >= 2 )) || die "option $1 requires an argument"
        WORKDIR="$2"
        shift 2
        ;;
      --max_template_date)
        # The max template date of the databases to use for pair representation
        # This could affect the accuracy of the outcome
        (( $# >= 2 )) || die "option $1 requires an argument"
        MAX_TEMPLATE_DATE="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        # Without this arm an unrecognised argument is never shifted away
        # and the loop would spin forever.
        die "unknown option: $1"
        ;;
    esac
  done
}

parse_args "$@"
#######################################
# Count the sequences in a FASTA file and choose the AlphaFold options that
# match the protein type:
#   1 sequence  => monomer
#   >1 sequences => multimer
# Globals:   n_seqs, OPTIONS (written)
# Arguments: $1 - path to the FASTA file
# Outputs:   sequence count and protein type on stdout
# Returns:   1 if the FASTA file cannot be read
#######################################
select_model_options() {
  local fasta=$1
  if [[ ! -f "$fasta" || ! -r "$fasta" ]]; then
    echo "Error: cannot read FASTA file '$fasta'" >&2
    return 1
  fi

  # Every FASTA record starts with a '>' header line, so counting headers
  # gives the number of sequences even when a sequence is wrapped over
  # several lines or the file contains blank lines.
  n_seqs=$(grep -c '^>' -- "$fasta")
  n_seqs=${n_seqs:-0}
  echo " Number of sequences: $n_seqs"

  # OPTIONS is pasted verbatim into the generated run script:
  #  - \$DATA_DIR must stay literal so that it is expanded by the run script,
  #    not here (where DATA_DIR is not defined);
  #  - the trailing backslash must be the last character of the line,
  #    otherwise it no longer continues the python command line.
  if (( n_seqs <= 1 )); then
    echo " Protein type: monomer"
    OPTIONS="--pdb70_database_path=\$DATA_DIR/pdb70/pdb70 \\"
  else
    echo " Protein type: multimer"
    OPTIONS="--model_preset=multimer --pdb_seqres_database_path=\$DATA_DIR/pdb_seqres/pdb_seqres.txt --uniprot_database_path=\$DATA_DIR/uniprot/uniprot.fasta \\"
  fi
}

select_model_options "$FASTAFILE"
#######################################
# Determine the sequence lengths in a FASTA file.
# The required total GPU mem depends on the sum of the number of amino acids
# The required total CPU mem depends on the max of the number of amino acids
# Globals:   sum_aa, max_aa (written)
# Arguments: $1 - path to the FASTA file
# Outputs:   the sum and the max on stdout
# Returns:   1 if the FASTA file cannot be read
#######################################
count_amino_acids() {
  local fasta=$1
  if [[ ! -f "$fasta" || ! -r "$fasta" ]]; then
    echo "Error: cannot read FASTA file '$fasta'" >&2
    return 1
  fi

  # Single pass over the file: a '>' header closes the previous record, every
  # other line adds to the length of the current one. This also handles
  # sequences wrapped over several lines, blank lines and CRLF line endings.
  read -r sum_aa max_aa < <(awk '
    function close_record() {
      sum += len
      if (len > max) max = len
      len = 0
    }
    { sub(/\r$/, "") }
    /^>/ { close_record(); next }
    { gsub(/[[:space:]]/, ""); len += length($0) }
    END { close_record(); printf "%d %d\n", sum, max }
  ' "$fasta")

  echo " Number of amino acids:"
  echo " sum: $sum_aa"
  echo " max: $max_aa"
}

count_amino_acids "$FASTAFILE"
#######################################
# Estimate the required computing resources from the total number of amino
# acids.
# For simplicity, the two types of GPUs users could select are RTX 2080 Ti
# with 11GB GPU mem (GPU_MEM_MB >= 10240) and TITAN RTX with 24GB GPU mem
# (GPU_MEM_MB >= 20480)
# Globals:   RUNTIME, NCPUS, NGPUS, GPU_MEM_MB, TOTAL_GPU_MEM_MB,
#            TOTAL_CPU_MEM_MB, TOTAL_SCRATCH_MB, ENABLE_UNIFIED_MEMORY,
#            MEM_FRACTION (written)
# Arguments: $1 - sum of the number of amino acids over all sequences
# Returns:   1 if $1 is not a non-negative integer
#######################################
estimate_resources() {
  local n_aa=$1
  if [[ ! "$n_aa" =~ ^[0-9]+$ ]]; then
    echo "Error: cannot estimate resources, invalid number of amino acids: '$n_aa'" >&2
    return 1
  fi
  # Force base 10 so that a value with leading zeros is not read as octal
  n_aa=$((10#$n_aa))

  # Every tier uses a single GPU
  NGPUS=1

  # The tiers are checked in ascending order, so each branch only needs its
  # upper bound.
  if (( n_aa < 800 )); then
    RUNTIME="04:00"
    NCPUS=12
    GPU_MEM_MB=10240
    TOTAL_GPU_MEM_MB=10240
    TOTAL_CPU_MEM_MB=120000
    TOTAL_SCRATCH_MB=120000
  elif (( n_aa < 1500 )); then
    RUNTIME="24:00"
    NCPUS=12
    GPU_MEM_MB=10240
    TOTAL_GPU_MEM_MB=20480
    TOTAL_CPU_MEM_MB=120000
    TOTAL_SCRATCH_MB=120000
  elif (( n_aa < 2500 )); then
    RUNTIME="24:00"
    NCPUS=24
    GPU_MEM_MB=20480
    TOTAL_GPU_MEM_MB=81920
    TOTAL_CPU_MEM_MB=240000
    TOTAL_SCRATCH_MB=240000
  elif (( n_aa < 3500 )); then
    RUNTIME="48:00"
    NCPUS=48
    GPU_MEM_MB=20480
    TOTAL_GPU_MEM_MB=81920
    TOTAL_CPU_MEM_MB=480000
    TOTAL_SCRATCH_MB=240000
  else
    RUNTIME="120:00"
    NCPUS=64
    GPU_MEM_MB=20480
    TOTAL_GPU_MEM_MB=163840
    TOTAL_CPU_MEM_MB=640000
    TOTAL_SCRATCH_MB=320000
  fi

  # Below 1500 amino acids unified memory stays off and the memory fraction
  # is 1; from 1500 on it is switched on and the fraction is the ratio of the
  # total GPU memory to the memory of the selected GPU.
  if (( n_aa < 1500 )); then
    ENABLE_UNIFIED_MEMORY=0
    MEM_FRACTION=1
  else
    ENABLE_UNIFIED_MEMORY=1
    MEM_FRACTION=$((TOTAL_GPU_MEM_MB/GPU_MEM_MB))
  fi
}

estimate_resources "$sum_aa"

echo -e " Estimate required resources:"
echo -e " Run time: $RUNTIME"
echo -e " Number of CPUs: $NCPUS"
echo -e " Total CPU memory: $TOTAL_CPU_MEM_MB MB"
echo -e " Number of GPUs: $NGPUS"
echo -e " Total GPU memory: $TOTAL_GPU_MEM_MB MB"
echo -e " Total scratch space: $TOTAL_SCRATCH_MB MB"
########################################
# Output an LSF run script for AlphaFold
########################################
# The here-document below is expanded by THIS script:
#   - plain $VAR / $((...)) are filled in now with the estimated resources;
#   - \$VAR and \\ are written out as $VAR and \ so that they are evaluated
#     later, when the generated run script is executed.
# LSF takes the number of cores with -n and the run-time limit with -W.
# The rusage values for mem and scratch are given per requested core, hence
# the division of the totals by NCPUS.
mkdir -p -- "$WORKDIR"
RUNSCRIPT="$WORKDIR/run_alphafold.bsub"
echo -e " Output an LSF run script for AlphaFold2: $RUNSCRIPT"
cat <<EOF > "$RUNSCRIPT"
#!/usr/bin/bash
#BSUB -n $NCPUS
#BSUB -W $RUNTIME
#BSUB -R "rusage[mem=$((TOTAL_CPU_MEM_MB/NCPUS)), scratch=$((TOTAL_SCRATCH_MB/NCPUS))]"
#BSUB -R "rusage[ngpus_excl_p=$NGPUS] select[gpu_mtotal0>=$GPU_MEM_MB]"
#BSUB -R "span[hosts=1]"
#BSUB -J alphafold
source /cluster/apps/local/env2lmod.sh
module load gcc/6.3.0 openmpi/4.0.2 alphafold/2.1.1
source /cluster/apps/nss/alphafold/venv_alphafold/bin/activate
# Define paths to databases and out put directory
DATA_DIR=/cluster/project/alphafold
OUTPUT_DIR=\${TMPDIR}/output
# Activate unified memory
export TF_FORCE_UNIFIED_MEMORY=$ENABLE_UNIFIED_MEMORY
export XLA_PYTHON_CLIENT_MEM_FRACTION=${MEM_FRACTION}.0
python /cluster/apps/nss/alphafold/alphafold-2.1.1/run_alphafold.py \\
--data_dir=\$DATA_DIR \\
--output_dir=\$OUTPUT_DIR \\
--max_template_date="$MAX_TEMPLATE_DATE" \\
--bfd_database_path=\$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \\
--uniref90_database_path=\$DATA_DIR/uniref90/uniref90.fasta \\
--uniclust30_database_path=\$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \\
--mgnify_database_path=\$DATA_DIR/mgnify/mgy_clusters_2018_12.fa \\
--template_mmcif_dir=\$DATA_DIR/pdb_mmcif/mmcif_files \\
--obsolete_pdbs_path=\$DATA_DIR/pdb_mmcif/obsolete.dat \\
$OPTIONS
--fasta_paths=$FASTAFILE
mkdir -p output/$PROTEIN
rsync -av \$TMPDIR/output/$PROTEIN ./output/$PROTEIN
EOF
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment