Tested on Wahab cluster.
master
parent
3aa1688f8e
commit
dfb9db6a60
1 changed files with 41 additions and 0 deletions
@ -0,0 +1,41 @@ |
||||
#!/bin/bash
#
# check-gpu-utilization.sh
#
# Given a SLURM cluster, report how its GPU partition(s) are being used:
#   1. list the jobs that squeue reports for the GPU partition(s),
#   2. run nvidia-smi (over ssh) on every GPU node whose state is "mix" or
#      "alloc", i.e. partially or fully allocated; idle and down nodes are
#      skipped,
#   3. print the "scontrol show job" specification of every listed job.
#
# Created: 2023-04-06

# -e: abort when a command fails; -u: abort on use of an unset variable;
# pipefail: a pipeline fails when any of its stages fails, so that e.g. a
# failing squeue is not hidden behind a succeeding awk.
set -euo pipefail

# Regular expression selecting the GPU partition name(s).  It is matched
# against the PARTITION column of the sinfo and squeue output.
# Must be a valid regex
readonly GPU_PARTITIONS='^gpu$'
||||
|
||||
# list GPU nodes being utilized (partially/fully)

#######################################
# Print the names of the GPU nodes that are currently in use, one per line.
# Reads the "NODELIST NODES PARTITION STATE" rows of `sinfo -N` and keeps the
# nodes whose partition matches GPU_PARTITIONS and whose state is exactly
# "mix" or "alloc".
# Globals:   GPU_PARTITIONS (read)
# Outputs:   node names on stdout, each node at most once
#######################################
list_active_gpu_nodes() {
  # The regex is handed to awk through the environment instead of being
  # spliced into the program text, so characters such as "/" cannot break
  # the awk program.
  sinfo -N | GPU_PARTITIONS_RE="$GPU_PARTITIONS" awk '
    {
      partition = $3
      # sinfo appends "*" to the name of the default partition; drop it so
      # an anchored regex still matches that partition.
      sub(/\*$/, "", partition)
    }
    # sinfo -N prints one row per node/partition pair; "seen" keeps a node
    # from being reported more than once.
    (partition ~ ENVIRON["GPU_PARTITIONS_RE"]) && ($4 ~ /^(mix|alloc)$/) && !seen[$1]++ {
      print $1
    }
  '
}

# Capture first, then split on newlines only: no word splitting or globbing
# is applied to the node names, and an empty result yields an empty array.
ACTIVE_GPU_NODES=$(list_active_gpu_nodes)
LIST_GPU_NODES=()
if [[ -n "$ACTIVE_GPU_NODES" ]]; then
  mapfile -t LIST_GPU_NODES <<< "$ACTIVE_GPU_NODES"
fi
||||
|
||||
# Report banner: the name this script was invoked as, then the current
# date and time.
printf '%s\n' "$0"
date
||||
|
||||
# list all the jobs:

#######################################
# Print the squeue rows whose PARTITION column (2nd field) matches the
# GPU_PARTITIONS regex.  Matching rows are printed unmodified.
# Globals:   GPU_PARTITIONS (read)
# Outputs:   matching squeue lines on stdout
#######################################
list_gpu_jobs() {
  # The regex is handed to awk through the environment instead of being
  # spliced into the program text, so characters such as "/" cannot break
  # the awk program.
  squeue | GPU_PARTITIONS_RE="$GPU_PARTITIONS" awk '$2 ~ ENVIRON["GPU_PARTITIONS_RE"]'
}

echo "=== LISTING OF ALL GPU JOBS ==="
# Kept in a variable because the job IDs are needed again further down.
LIST_GPU_JOBS=$(list_gpu_jobs)
echo "$LIST_GPU_JOBS"
echo
||||
|
||||
#######################################
# Run nvidia-smi over ssh on each given node and print its output below a
# " :: node: NAME" heading.  A node on which ssh/nvidia-smi fails is reported
# on stderr and skipped; the remaining nodes are still processed.
# Arguments: node names (zero or more)
# Outputs:   per-node report on stdout, warnings on stderr
#######################################
show_gpu_utilization() {
  local node
  # "for node; do" iterates over the positional parameters without expanding
  # "$@", which keeps it safe under `set -u` when no node was passed.
  for node; do
    echo " :: node: $node"
    # -n: do not let ssh read (and swallow) this script's stdin.
    # The || branch keeps `set -e` from aborting the whole report because a
    # single node is unreachable.
    ssh -n "$node" nvidia-smi \
      || echo "WARNING: could not run nvidia-smi on $node (exit status $?)" >&2
    echo
  done
}

echo "=== LISTING OF GPU UTILIZATIONS PER NODE ==="
# The ${arr[@]+...} form expands to nothing for an empty array; bash older
# than 4.4 otherwise aborts with "unbound variable" under `set -u`.
show_gpu_utilization ${LIST_GPU_NODES[@]+"${LIST_GPU_NODES[@]}"}
echo
||||
|
||||
#######################################
# Print `scontrol show job` for every job in a squeue-style listing.
# The job ID is the first whitespace-separated field of each line; blank
# lines and the "JOBID" header line are skipped.  A job for which scontrol
# fails is reported on stderr and skipped.
# Arguments: $1 - the job listing (multi-line string)
# Outputs:   job specifications on stdout, warnings on stderr
#######################################
show_gpu_job_specs() {
  local job_listing=$1
  local job_id _rest
  # Reading line by line (instead of word-splitting a command substitution)
  # keeps array-job IDs such as 123_[1-4] from being glob-expanded against
  # the files in the current directory.
  while read -r job_id _rest; do
    if [[ -z "$job_id" || "$job_id" == "JOBID" ]]; then
      continue
    fi
    # stdin is redirected so scontrol cannot consume the rest of the listing.
    # The || branch keeps `set -e` from aborting when a job has left the
    # queue since it was listed.
    scontrol show job "$job_id" < /dev/null \
      || echo "WARNING: cannot show job $job_id (it may have finished)" >&2
  done <<< "$job_listing"
}

echo "=== LISTING OF GPU JOB SPECIFICATIONS ==="
show_gpu_job_specs "$LIST_GPU_JOBS"
||||
|
||||
#squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }' |
Loading…
Reference in new issue