* show-node-status.py: A toolbox to analyze node status returned by SGE.

master
Wirawan Purwanto 8 years ago
parent acfb11e010
commit f06803ba6c
  1. 232
      sge/show-node-status.py

@ -20,10 +20,16 @@ import re
import subprocess import subprocess
import sys import sys
class ParseError(RuntimeError):
    """Raised when a line of `qstat -f` output cannot be parsed
    (e.g. an invalid queue/host combination)."""
    pass
class ProgramError(RuntimeError):
    """Raised on an internal inconsistency of this program, such as a
    command that was accepted but has no implementation."""
    pass
#----------------------- UNDER CONSTRUCTION ----------------------- #----------------------- UNDER CONSTRUCTION -----------------------
#Nothing was done yet #Nothing was done yet
def node_slot_stats_raw(qstat_f, show_disabled_nodes=True): def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
"""Prints the node stats from `qstat -f' in raw format: """Prints the node stats from `qstat -f' in raw format:
- not printing disabled nodes - not printing disabled nodes
- not showing the computational jobs that are running on these nodes - not showing the computational jobs that are running on these nodes
@ -44,6 +50,180 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
print(L) print(L)
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
    """Prints status of slot availability per machine type.

    A machine type is defined as the set of hosts with the same base
    hostname, i.e. the part before the first dash (e.g. "c6" or "c8").
    Originally implemented based on the naming of hosts on Turing cluster.
    In SGE terminology, "slot" means a CPU core.

    Arguments:
      qstat_f -- iterable of text lines from the output of `qstat -f`.
      show_disabled_nodes -- passed on to `collect_host_stats`; when
          false, disabled queue instances are left out of the counts.

    Example output (the trailing `load` and `load/used` columns are
    omitted here):

        MACHTYPE         NODES CORES  used  free  resv
        c6                  15   240    77   163     0
        c8                  40   768   569   199     0
        cr                  74  1480   988   492     0
        crhimem              3    96     0    96     0
        crphi               10   200    48   152     0
        d430                49  1568  1292   276     0
        d730                10   280    10   270     0

    (changes depending on what's disabled and the load of the cluster)

    FIXME: If a machine is covered by more than one queue, naive
    summation causes the counts to be overestimated.  One fix would be
    to register each machine once it has been encountered and not
    account for it again.

    However this may not be the best approach as queues are overlapping
    on machines.  Since on Turing the practice is not to further split
    a machine among multiple queues (i.e. a 32-core node has all 32
    cores assignable to both main and timed-main queues, rather than
    dedicating 16 for main and 16 for timed-main), we use a particular
    way to avoid the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up
    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up
    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.

    Had the nodes been split-dedicated to particular queues, we would
    have to take the sum of the values instead.
    """
    # Three stages: per-host records -> per-machine-type totals -> table.
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    hosttype_stats = summarize_hosttype_stats(host_stats)
    print_hosttype_stats(hosttype_stats)
def collect_host_stats(qstat_f, show_disabled_nodes=None):
    """Internal routine to collect node stats from `qstat -f` by
    combining the node status lines that were printed for each
    `queue@hostname` combination.

    Arguments:
      qstat_f -- iterable of text lines from the output of `qstat -f`.
      show_disabled_nodes -- if false (the default), queue instances
          whose state field contains "d" are skipped.

    Returns a dict with hostname as the key.  Each value is a dict with:
      'slots_resv' -- reserved slots, summed over the host's queues
      'slots_used' -- used slots, summed over the host's queues
      'slots_tot'  -- largest total slot count seen for the host
      'queues'     -- queue names seen for the host, in input order
      'os_arch', 'queue_type', 'node_load' -- values from the last
          status line seen for the host

    Raises ParseError if the queue/host field does not have the form
    `queue@hostkind-rest`.
    """
    host_stats = {}

    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()

        # A blank line carries no data (and has no fields[0] to test).
        if not fields:
            continue
        # Column header on the very first line
        if lineno == 1 and fields[0] == "queuename":
            continue
        # A valid host status line starts with a letter and has 5 or 6
        # fields (the 6th, the state flags, is optional), like this:
        #   main@c8-014.cm.cluster BIP 0/10/16 9.98 linux-x64 d
        #   ^ queue & node name
        if not (re.search(r'^[A-Za-z]', line) and len(fields) in (5, 6)):
            continue

        queue_node, queue_type, core_usage_combo, node_load, os_arch \
            = fields[0:5]
        status_flags = fields[5] if len(fields) > 5 else ""

        # The load field may be non-numeric; count such a load as zero.
        try:
            node_load = float(node_load)
        except ValueError:
            node_load = 0

        # skip disabled hosts
        if ("d" in status_flags) and not show_disabled_nodes:
            continue

        # Extract more useful info
        m = re.search(r'^([^@]+)@([^-]+)-(.*)$', queue_node)
        if not m:
            raise ParseError(
                "Invalid queue/host combo on line %d: %s" % (lineno, queue_node))
        queue, hostkind, hostnum = m.groups()
        hostname = hostkind + "-" + hostnum
        slots_resv, slots_used, slots_tot \
            = [int(n) for n in core_usage_combo.split("/")]

        # Fetch the record of this host, creating it on first encounter
        hoststat = host_stats.get(hostname)
        if hoststat is None:
            hoststat = host_stats[hostname] = {
                'slots_resv': 0,
                'slots_used': 0,
                'slots_tot': 0,
                'queues': [],
            }

        hoststat['slots_resv'] += slots_resv
        hoststat['slots_used'] += slots_used
        # FIXME assume same across queues; fix if not correct:
        hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
        hoststat['os_arch'] = os_arch
        # FIXME we assume all of same queue type; fix if not correct:
        hoststat['queue_type'] = queue_type
        hoststat['queues'].append(queue)
        # FIXME we assume all have same load; fix if not correct:
        hoststat['node_load'] = node_load

    return host_stats
def summarize_hosttype_stats(host_stats):
    """Further summarize the host stats by the host type (denoted by the
    prefix of the hostname before the dash character, i.e. "c8" for
    "c8-003").

    Arguments:
      host_stats -- dict keyed by hostname; each value must provide the
          keys 'slots_resv', 'slots_tot', 'slots_used', 'node_load' and
          'os_arch'.

    Returns a dict keyed by host type.  Each value is a dict with:
      'hosts'      -- list of the hostnames of that type
      'host_count' -- number of such hosts
      'slots_resv', 'slots_tot', 'slots_used', 'node_load'
                   -- sums of the per-host values
      'os_arch'    -- the 'os_arch' of the first host of that type
    """
    hosttype_stats = {}

    # Group the hostnames by their type prefix
    for hostname in host_stats:
        hosttype = hostname.split('-')[0]
        hosttype_stats.setdefault(hosttype, {'hosts': []})['hosts'].append(hostname)

    # Accumulate the per-host numbers for each type
    for hts in hosttype_stats.values():
        hosts = hts['hosts']
        hts['host_count'] = len(hosts)
        for key in ('slots_resv', 'slots_tot', 'slots_used', 'node_load'):
            hts[key] = sum(host_stats[h][key] for h in hosts)
        # FIXME we assume all hosts of a type share the architecture
        hts['os_arch'] = host_stats[hosts[0]]['os_arch']

    return hosttype_stats
def print_hosttype_stats(hosttype_stats):
    """Print a table of the per-machine-type statistics, one row per
    machine type, sorted by the type name.

    Arguments:
      hosttype_stats -- dict keyed by machine type; each value must
          provide 'host_count', 'slots_tot', 'slots_used', 'slots_resv'
          and 'node_load'.

    The "free" column is total minus used minus reserved slots.  The
    "load/used" column is the load divided by the used slots; when no
    slot is used it is 0 if the load is below 0.75 and NaN otherwise.
    """
    print("%-16s %5s %5s %5s %5s %5s %7s %9s"
          % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv", "load", "load/used"))

    for machtype in sorted(hosttype_stats):
        rec = hosttype_stats[machtype]
        nodes = rec['host_count']
        total = rec['slots_tot']
        used = rec['slots_used']
        resv = rec['slots_resv']
        load = rec['node_load']

        if used != 0:
            load_per_used = load / used
        elif load < 0.75:
            load_per_used = 0.0
        else:
            # There is load although no slot is in use: ratio undefined
            load_per_used = float('nan')

        print("%-16s %5d %5d %5d %5d %5d %7.2f %9.3f"
              % (machtype, nodes, total, used, total - used - resv, resv,
                 load, load_per_used))
def help(): def help():
@ -65,13 +245,14 @@ stats
""" """
def main_default(argv, save_qstat=True): def main_default(argv, save_qstat=None):
"""Main default function: """Main default function:
- By default we invoke qstat -f and prints the analysis. - By default we invoke qstat -f and prints the analysis.
- If argv[1] is given, then we read in the file and - If argv[1] is given, then we read in the file and
use that for the analysis. use that for the analysis.
""" """
from time import localtime, strftime from time import localtime, strftime
from getopt import getopt
dtime = localtime() dtime = localtime()
dtimestr = strftime("%Y%m%d-%H%M", dtime) dtimestr = strftime("%Y%m%d-%H%M", dtime)
@ -87,10 +268,23 @@ def main_default(argv, save_qstat=True):
raise ValueError, "Unknown action: "+argv[1] raise ValueError, "Unknown action: "+argv[1]
# Skip program name and first command: # Skip program name and first command:
cmdargs = argv[2:] cmdargs_in = argv[2:]
cmdopts, cmdargs = getopt(cmdargs_in,
"ds",
["show-disabled-nodes=",
"include-disabled-nodes=",
"save",
])
# Default options # Default options
show_disabled_nodes = False show_disabled_nodes = False
for o,a in cmdopts:
if o in ('-d',):
show_disabled_nodes = True
elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
show_disabled_nodes = parse_int_or_bool(a)
elif o in ('-s', '--save'):
save_qstat = True
if len(cmdargs) > 0: if len(cmdargs) > 0:
qstat_f_current = open(cmdargs[0], "r").read().splitlines() qstat_f_current = open(cmdargs[0], "r").read().splitlines()
@ -106,11 +300,11 @@ def main_default(argv, save_qstat=True):
show_disabled_nodes=show_disabled_nodes, show_disabled_nodes=show_disabled_nodes,
) )
elif cmd == "stats": elif cmd == "stats":
node_slots_stats_per_node_type(qstat_f_current, node_slot_stats_per_machine_type(qstat_f_current,
show_disabled_nodes=show_disabled_nodes, show_disabled_nodes=show_disabled_nodes,
) )
else: else:
raise "Missing support for command: "+cmd raise ProgramError, "Missing support for command: "+cmd
@ -157,6 +351,34 @@ def str_fmt_heading(fmt):
return _str_fmt_heading_rx.sub(r'\1s', fmt) return _str_fmt_heading_rx.sub(r'\1s', fmt)
def parse_int_or_bool(S):
    """Interpret a string as an integer or, failing that, as a boolean.

    Arguments:
      S -- the value to interpret.  A non-string value is returned
           unchanged.

    Returns an int if the stripped string is a valid integer literal;
    True for 'true', 't', 'yes', 'y', 'on'; False for 'false', 'f',
    'no', 'n', 'off', '-' and the empty string (case-insensitive).

    Raises ValueError for any other string.
    """
    # `basestring` exists only on Python 2; fall back to `str` elsewhere
    # so the function works on both major versions.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(S, string_types):
        return S

    S = S.strip().lower()
    try:
        return int(S)
    except ValueError:
        pass  # not an integer; try the boolean words below

    if S in ('true', 't', 'yes', 'y', 'on'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)
def parse_bool(S):
    """Interpret a string as a boolean value.

    Arguments:
      S -- the value to interpret.  A non-string value is returned
           unchanged.

    Returns True for 'true', 't', 'yes', 'y', 'on', '1' and False for
    'false', 'f', 'no', 'n', 'off', '0', '-' and the empty string
    (case-insensitive, surrounding whitespace ignored).

    Raises ValueError for any other string.
    """
    # `basestring` exists only on Python 2; fall back to `str` elsewhere
    # so the function works on both major versions.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(S, string_types):
        return S

    S = S.strip().lower()
    if S in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)
# stub main code # stub main code

Loading…
Cancel
Save