diff --git a/sge/node-slot-status.sh b/sge/node-slot-status.sh index d848026..71a96c3 100755 --- a/sge/node-slot-status.sh +++ b/sge/node-slot-status.sh @@ -1,49 +1,109 @@ #!/bin/bash # 20151028 +# +# Note: original extraction command on turing: +# +# qstat -f | grep -ve '^[-# ]' -e '^queuename' | less +# + +: ${optShowDisabledNodes=0} +: ${optPrintRaw=0} + + +function node_slot_stats_raw() +# Prints the node stats from `qstat -f' in raw format: +# - not printing disabled nodes +# - not showing the computational jobs that are running on these nodes +{ + qstat -f \ + | gawk -v optShowDisabledNodes="$optShowDisabledNodes" \ + ' +BEGIN { + STDERR = "/dev/stderr" +} +FNR == 1 && $1 == "queuename" { print; next; } + +# Valid host status field +($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) && (optShowDisabledNodes!=0 || ($6 !~ /d/)) { + print +} +' +} function node_slot_stats_per_machine_type() -# Original extraction command on turing: +# Prints status of slot availability per machine type (defined as +# host with the same base hostname (e.g. "c6-", or "c8-"). +# Originally implemented based on the naming of hosts on Turing cluster. # -# qstat -f | grep -ve '^[-# ]' -e '^queuename' | less +# Example output: (changes depending on what's disabled and the load of the cluster) +# +# MACHTYPE NODE CORES used free resv +# c6 15 240 77 163 0 +# c8 40 768 569 199 0 +# cr 74 1480 988 492 0 +# crhimem 3 96 0 96 0 +# crphi 10 200 48 152 0 +# d430 49 1568 1292 276 0 +# d730 10 280 10 270 0 # # FIXME: If a machine is covered by more than one queue, this will cause the counts -# to be overestimated. +# to be overestimated. Must register if a machine has been encountered and not +# re-account that machine. { - qstat -f \ - | gawk ' + qstat -f | _Process_node_slot_stats_per_machine_type +} + +function _Process_node_slot_stats_per_machine_type() +# Processing part of the routine above. +{ + gawk \ + -v optShowDisabledNodes="$optShowDisabledNodes" \ + -v optPrintRaw="$optPrintRaw" \ + '#### BEGIN { STDERR = "/dev/stderr" + hostnames_seen[-1234] = 0 } + FNR == 1 && $1 == "queuename" { next; } # Valid host status field ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) { - #print($0) queue_node = $1 core_usage_combo = $3 states = $6 # if any # skip disabled hosts - if (states ~ /d/) next; + if (states ~ /d/ && (optShowDisabledNodes==0)) next; + + if (optPrintRaw != 0) print($0) # gawk extension of match: - if (! match(queue_node, /^([^@]+)@([^-]+)-(.*)$/, Strs)) - { - print("Invalid queue/host combo: " queue_node) > STDERR - next - } - else + if (match(queue_node, /^([^@]+)@([^-]+)-(.*)$/, Strs)) { queue = Strs[1] hostkind = Strs[2] hostnum = Strs[3] + hostname = hostkind "-" hostnum + } + else + { + print("Invalid queue/host combo: " queue_node) > STDERR + next } split(core_usage_combo, Strs, "/") slots_resv = Strs[1] slots_used = Strs[2] slots_tot = Strs[3] + # Avoiding double counting: + if (hostname in hostname_seen) + { + print("Host already seen: " hostname) > STDERR + next + } + mach_node_count[hostkind] = mach_node_count[hostkind] + 1 mach_node_slot_count[hostkind] = slots_tot # assume homogenous! This DOES NOT work with c8-type nodes! mach_slots_tot[hostkind] = mach_slots_tot[hostkind] + slots_tot @@ -76,7 +136,36 @@ function report_node_stats() END { report_node_stats() } -' +' \ + "$@" } -node_slot_stats_per_machine_type +function node_slot_stats_per_machine_type_f() +{ + _Process_node_slot_stats_per_machine_type "$1" +} + + +case "$1" in +(--raw|raw) + node_slot_stats_raw + ;; +(--stats|stats|"") + if [ "$2" ]; then + node_slot_stats_per_machine_type_f "$2" + else + node_slot_stats_per_machine_type + fi + ;; +(--stats-with-disabled|stats-with-disabled) + if [ "$2" ]; then + optShowDisabledNodes=1 node_slot_stats_per_machine_type_f "$2" + else + optShowDisabledNodes=1 node_slot_stats_per_machine_type + fi + ;; +(*) + echo "Unknown action: $1" >&2 + exit 2 + ;; +esac