|
|
@ -20,10 +20,16 @@ import re |
|
|
|
import subprocess |
|
|
|
import subprocess |
|
|
|
import sys |
|
|
|
import sys |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ParseError(RuntimeError):
    """Raised when a line of `qstat -f` output cannot be parsed."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ProgramError(RuntimeError):
    """Raised for a fault in the program itself, e.g. a command that is
    accepted but has no handler."""
|
|
|
|
|
|
|
|
|
|
|
#----------------------- UNDER CONSTRUCTION ----------------------- |
|
|
|
#----------------------- UNDER CONSTRUCTION ----------------------- |
|
|
|
#Nothing was done yet |
|
|
|
#Nothing was done yet |
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats_raw(qstat_f, show_disabled_nodes=True): |
|
|
|
def node_slot_stats_raw(qstat_f, show_disabled_nodes=False): |
|
|
|
"""Prints the node stats from `qstat -f' in raw format: |
|
|
|
"""Prints the node stats from `qstat -f' in raw format: |
|
|
|
- not printing disabled nodes |
|
|
|
- not printing disabled nodes |
|
|
|
- not showing the computational jobs that are running on these nodes |
|
|
|
- not showing the computational jobs that are running on these nodes |
|
|
@ -44,6 +50,180 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=True): |
|
|
|
print(L) |
|
|
|
print(L) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
    """Print the status of slot availability per machine type.

    A machine type is the set of hosts sharing the same base hostname,
    i.e. the part before the first dash (e.g. "c6" or "c8").
    Originally implemented based on the naming of hosts on Turing cluster.
    In SGE terminology, "slot" means a CPU core.

    Args:
        qstat_f: Iterable of text lines of `qstat -f` output.
        show_disabled_nodes: If true, disabled nodes are included in the
            counts; by default they are left out.

    Example output (abridged -- the real table also carries the `load`
    and `load/used` columns):

        MACHTYPE          NODE CORES  used  free  resv
        c6                  15   240    77   163     0
        c8                  40   768   569   199     0
        cr                  74  1480   988   492     0
        crhimem              3    96     0    96     0
        crphi               10   200    48   152     0
        d430                49  1568  1292   276     0
        d730                10   280    10   270     0

    (changes depending on what's disabled and the load of the cluster)

    FIXME: A machine covered by more than one queue is listed once per
    queue by `qstat -f`, so naively adding everything up would
    overestimate the counts.  On Turing the practice is not to split a
    machine between queues (i.e. a 32-core node has all 32 cores
    assignable to both the main and timed-main queues, rather than
    dedicating 16 to main and 16 to timed-main), so double counting is
    avoided in this particular way:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e., occupied
      by jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.
      Had the nodes been split-dedicated to particular queues, we would
      have to take the sum of the values instead.
    """
    # Three stages: per-host records -> per-type totals -> printed table.
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    hosttype_stats = summarize_hosttype_stats(host_stats)
    print_hosttype_stats(hosttype_stats)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def collect_host_stats(qstat_f, show_disabled_nodes=None):
    """Internal routine to collect node stats from `qstat -f`.

    `qstat -f` prints one status line per `queue@hostname` combination,
    so a host that belongs to several queues shows up several times.
    This routine folds those lines into a single record per host.

    Args:
        qstat_f: Iterable of text lines of `qstat -f` output.
        show_disabled_nodes: When false (the default), status lines whose
            state flags contain "d" are ignored.

    Returns:
        A dict with the hostname as the key.  Each value is a dict with:
          'slots_resv' -- reserved slots, summed over the host's queues
          'slots_used' -- used slots, summed over the host's queues
          'slots_tot'  -- total slots, the maximum seen over the queues
          'queues'     -- queue names, in order of appearance
          'os_arch', 'queue_type', 'node_load' -- values of the last
                          status line seen for that host

    Raises:
        ParseError: If the first field of a status line does not look
            like `queue@hosttype-rest`.
        ValueError: If the resv/used/tot field is not three integers
            separated by slashes.
    """
    host_stats = {}

    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()

        # A blank line carries no data; skipping it here also keeps the
        # header test below from indexing into an empty list.
        if not fields:
            continue

        # Column header.  A real status line always has an '@' in its
        # first field, so this can never hide a host record.
        if fields[0] == "queuename":
            continue

        # A host status line starts in column one with a letter and has
        # five fields, plus an optional sixth one holding the state flags:
        #   main@c8-014.cm.cluster  BIP  0/10/16  9.98  linux-x64  d
        #   ^ queue & node name
        if not (re.search(r'^[A-Za-z]', line) and len(fields) in (5, 6)):
            continue

        queue_node, queue_type, core_usage_combo, node_load, os_arch = fields[:5]
        status_flags = fields[5] if len(fields) > 5 else ""

        # skip disabled hosts
        if "d" in status_flags and not show_disabled_nodes:
            continue

        try:
            node_load = float(node_load)
        except ValueError:
            # A non-numeric load field is counted as zero load.
            node_load = 0.0

        m = re.search(r'^([^@]+)@([^-]+)-(.*)$', queue_node)
        if not m:
            raise ParseError(
                "Invalid queue/host combo on line %d: %s" % (lineno, queue_node))
        queue, hostkind, hostnum = m.groups()
        hostname = hostkind + "-" + hostnum

        slots_resv, slots_used, slots_tot = [
            int(count) for count in core_usage_combo.split("/")
        ]

        if hostname not in host_stats:
            host_stats[hostname] = {
                'slots_resv': 0,
                'slots_used': 0,
                'slots_tot': 0,
                'queues': [],
            }
        hoststat = host_stats[hostname]

        hoststat['slots_resv'] += slots_resv
        hoststat['slots_used'] += slots_used
        # FIXME assume same across queues; fix if not correct:
        hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
        hoststat['os_arch'] = os_arch
        # FIXME we assume all of same queue type; fix if not correct:
        hoststat['queue_type'] = queue_type
        hoststat['queues'].append(queue)
        # FIXME we assume all have same load; fix if not correct:
        hoststat['node_load'] = node_load

    return host_stats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def summarize_hosttype_stats(host_stats):
    """Further summarize the host stats by the host type (denoted by the
    prefix of the hostname before the dash character, i.e. "c8" for
    "c8-003").

    Args:
        host_stats: Dict keyed by hostname; each value is a dict holding
            at least 'slots_resv', 'slots_tot', 'slots_used', 'node_load'
            and 'os_arch'.

    Returns:
        A dict keyed by host type.  Each value is a dict with:
          'hosts'      -- names of the hosts of that type
          'host_count' -- number of such hosts
          'slots_resv', 'slots_tot', 'slots_used', 'node_load'
                       -- sums over those hosts
          'os_arch'    -- the 'os_arch' of the first host in 'hosts'
    """
    hosttype_stats = {}

    # Pass 1: bucket the hostnames by their type prefix.
    for hostname in host_stats:
        hosttype = hostname.split('-')[0]
        if hosttype not in hosttype_stats:
            hosttype_stats[hosttype] = {'hosts': []}
        hosttype_stats[hosttype]['hosts'].append(hostname)

    # Pass 2: add up the per-host figures of every bucket.
    for hts in hosttype_stats.values():
        hosts = hts['hosts']
        hts['host_count'] = len(hosts)
        for key in ('slots_resv', 'slots_tot', 'slots_used', 'node_load'):
            hts[key] = sum(host_stats[h][key] for h in hosts)
        # The architecture is read from the first host of the bucket
        # (this used to pick up 'slots_used' by mistake).
        hts['os_arch'] = host_stats[hosts[0]]['os_arch']

    return hosttype_stats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_hosttype_stats(hosttype_stats):
    """Print a table of slot usage with one row per host type.

    Rows are sorted by host type name.  The columns are the type name,
    the number of hosts, the total / used / free / reserved slots, the
    load and the load per used slot.  Free slots are total minus used
    minus reserved.

    Args:
        hosttype_stats: Dict keyed by host type; each value is a dict
            with 'host_count', 'slots_tot', 'slots_used', 'slots_resv'
            and 'node_load'.
    """
    heading = ("MACHTYPE", "NODES", "CORES", "used", "free", "resv",
               "load", "load/used")
    print("%-16s %5s %5s %5s %5s %5s %7s %9s" % heading)

    for machtype in sorted(hosttype_stats.keys()):
        rec = hosttype_stats[machtype]
        total = rec['slots_tot']
        used = rec['slots_used']
        resv = rec['slots_resv']
        load = rec['node_load']

        if used != 0:
            load_per_used = load / used
        elif load < 0.75:
            # No slot in use and hardly any load: report a plain zero.
            load_per_used = 0.0
        else:
            # Load without any used slot has no meaningful ratio.
            load_per_used = float('nan')

        row = (machtype, rec['host_count'], total, used,
               total - used - resv, resv, load, load_per_used)
        print("%-16s %5d %5d %5d %5d %5d %7.2f %9.3f" % row)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def help(): |
|
|
|
def help(): |
|
|
@ -65,13 +245,14 @@ stats |
|
|
|
""" |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main_default(argv, save_qstat=True): |
|
|
|
def main_default(argv, save_qstat=None): |
|
|
|
"""Main default function: |
|
|
|
"""Main default function: |
|
|
|
- By default we invoke qstat -f and prints the analysis. |
|
|
|
- By default we invoke qstat -f and prints the analysis. |
|
|
|
- If argv[1] is given, then we read in the file and |
|
|
|
- If argv[1] is given, then we read in the file and |
|
|
|
use that for the analysis. |
|
|
|
use that for the analysis. |
|
|
|
""" |
|
|
|
""" |
|
|
|
from time import localtime, strftime |
|
|
|
from time import localtime, strftime |
|
|
|
|
|
|
|
from getopt import getopt |
|
|
|
|
|
|
|
|
|
|
|
dtime = localtime() |
|
|
|
dtime = localtime() |
|
|
|
dtimestr = strftime("%Y%m%d-%H%M", dtime) |
|
|
|
dtimestr = strftime("%Y%m%d-%H%M", dtime) |
|
|
@ -87,10 +268,23 @@ def main_default(argv, save_qstat=True): |
|
|
|
raise ValueError, "Unknown action: "+argv[1] |
|
|
|
raise ValueError, "Unknown action: "+argv[1] |
|
|
|
|
|
|
|
|
|
|
|
# Skip program name and first command: |
|
|
|
# Skip program name and first command: |
|
|
|
cmdargs = argv[2:] |
|
|
|
cmdargs_in = argv[2:] |
|
|
|
|
|
|
|
cmdopts, cmdargs = getopt(cmdargs_in, |
|
|
|
|
|
|
|
"ds", |
|
|
|
|
|
|
|
["show-disabled-nodes=", |
|
|
|
|
|
|
|
"include-disabled-nodes=", |
|
|
|
|
|
|
|
"save", |
|
|
|
|
|
|
|
]) |
|
|
|
|
|
|
|
|
|
|
|
# Default options |
|
|
|
# Default options |
|
|
|
show_disabled_nodes = False |
|
|
|
show_disabled_nodes = False |
|
|
|
|
|
|
|
for o,a in cmdopts: |
|
|
|
|
|
|
|
if o in ('-d',): |
|
|
|
|
|
|
|
show_disabled_nodes = True |
|
|
|
|
|
|
|
elif o in ('--show-disabled-nodes', '--include-disabled-nodes'): |
|
|
|
|
|
|
|
show_disabled_nodes = parse_int_or_bool(a) |
|
|
|
|
|
|
|
elif o in ('-s', '--save'): |
|
|
|
|
|
|
|
save_qstat = True |
|
|
|
|
|
|
|
|
|
|
|
if len(cmdargs) > 0: |
|
|
|
if len(cmdargs) > 0: |
|
|
|
qstat_f_current = open(cmdargs[0], "r").read().splitlines() |
|
|
|
qstat_f_current = open(cmdargs[0], "r").read().splitlines() |
|
|
@ -106,11 +300,11 @@ def main_default(argv, save_qstat=True): |
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
) |
|
|
|
) |
|
|
|
elif cmd == "stats": |
|
|
|
elif cmd == "stats": |
|
|
|
node_slots_stats_per_node_type(qstat_f_current, |
|
|
|
node_slot_stats_per_machine_type(qstat_f_current, |
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
) |
|
|
|
) |
|
|
|
else: |
|
|
|
else: |
|
|
|
raise "Missing support for command: "+cmd |
|
|
|
raise ProgramError, "Missing support for command: "+cmd |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -157,6 +351,34 @@ def str_fmt_heading(fmt): |
|
|
|
return _str_fmt_heading_rx.sub(r'\1s', fmt) |
|
|
|
return _str_fmt_heading_rx.sub(r'\1s', fmt) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_int_or_bool(S):
    """Interpret a string as an integer or, failing that, as a boolean.

    Args:
        S: The value to interpret.  Anything that is not a string is
            returned unchanged.

    Returns:
        An int if the stripped, lower-cased string is an integer literal;
        True for 'true', 't', 'yes', 'y', 'on'; False for 'false', 'f',
        'no', 'n', 'off', '-' and the empty string.

    Raises:
        ValueError: If the string is neither an integer nor one of the
            recognized boolean words.
    """
    # `basestring` only exists on Python 2, where it covers both `str`
    # and `unicode`; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(S, string_types):
        return S

    S = S.strip().lower()
    try:
        return int(S)
    except ValueError:
        pass

    if S in ('true', 't', 'yes', 'y', 'on'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_bool(S):
    """Interpret a string as a boolean.

    Args:
        S: The value to interpret.  Anything that is not a string is
            returned unchanged.

    Returns:
        True if the stripped, lower-cased string is one of 'true', 't',
        'yes', 'y', 'on', '1'; False if it is one of 'false', 'f', 'no',
        'n', 'off', '0', '-' or the empty string.

    Raises:
        ValueError: If the string is not one of the recognized words.
    """
    # `basestring` only exists on Python 2, where it covers both `str`
    # and `unicode`; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(S, string_types):
        return S

    S = S.strip().lower()
    if S in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# stub main code |
|
|
|
# stub main code |
|
|
|
|
|
|
|
|
|
|
|