@ -50,6 +50,52 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
print ( L )
def node_slot_stats(qstat_f, show_disabled_nodes=False):
    """Print the status of slot availability for each individual host.

    In SGE terminology, "slot" means a CPU core.

    The per-host statistics are gathered with `collect_host_stats` and
    then written out with `print_host_stats`, which emits one table row
    per host under the columns:

        HOST  CORES  used  free  resv  load  load/used

    (The actual rows change depending on what's disabled and the load of
    the cluster.)

    Args:
        qstat_f: passed unchanged to `collect_host_stats`
            (presumably the `qstat -f` data -- TODO confirm).
        show_disabled_nodes: passed unchanged to `collect_host_stats`.

    Notes on machines served by more than one queue (kept from the
    original design notes; NOTE(review): the accounting itself is not
    done in this function -- presumably it lives in
    `collect_host_stats`, confirm there):

    FIXME: If a machine is covered by more than one queue, this will
    cause the counts to be overestimated.  Must register if a machine
    has been encountered and not re-account that machine.

    However this may not be the best approach as queues are overlapping
    on machines.  Since on Turing the practice is not to further split a
    machine to multiple queues (i.e. a 32-core node has all the 32 cores
    assignable to both main and timed-main queues, rather than
    dedicating 16 for main and 16 for timed-main), we use a particular
    way to avoid the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up
    - slots_used: total number of slots currently used (i.e., occupied
      by jobs) -- sum them up
    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.

    Had the nodes been split-dedicated to a particular queue, we would
    have to take the sum of the values instead.
    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    print_host_stats(host_stats)
def node_slot_stats_per_machine_type ( qstat_f , show_disabled_nodes = False ) :
""" Prints status of slot availability per machine type (defined as
host with the same base hostname ( e . g . " c6- " , or " c8- " ) .
@ -174,6 +220,35 @@ def collect_host_stats(qstat_f, show_disabled_nodes=None):
return host_stats
def node_load_ratio(node_load, slots_used):
    """Return the node load divided by the number of slots in use.

    The value should be close to one if the jobs use their CPUs
    efficiently, or near zero if most jobs are interactive (i.e. lots of
    idling).

    When no slot is in use the division is undefined, so:
    - a load below 0.75 yields 0.0;
    - any other load yields NaN.
    """
    # Normal case: at least one slot is claimed, so the ratio is defined.
    if slots_used != 0:
        return node_load / slots_used

    # No slot is claimed: a small load maps to zero, anything else to NaN.
    if node_load < 0.75:
        return 0.0
    return float("nan")
def print_host_stats(host_stats):
    """Print the per-host statistics gathered by `collect_host_stats`.

    A header line is printed first, followed by one row per host in
    sorted hostname order.  Each row shows the total slots, used slots,
    free slots (total - used - reserved), reserved slots, the node load,
    and the load-to-used ratio computed by `node_load_ratio`.

    Args:
        host_stats: mapping of hostname to a per-host record.  Each
            record is indexed with the keys 'slots_tot', 'slots_used',
            'slots_resv' and 'node_load'.
    """
    header_fmt = "%-16s %5s %5s %5s %5s %7s %9s"
    row_fmt = "%-16s %5d %5d %5d %5d %7.2f %9.3f"

    print(header_fmt
          % ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))

    # Iterating a dict yields its keys, so sorted() needs no .keys() call.
    for host in sorted(host_stats):
        record = host_stats[host]
        total = record['slots_tot']
        used = record['slots_used']
        reserved = record['slots_resv']
        load = record['node_load']
        # Reserved slots are not available to jobs, so they are excluded
        # from the free count along with the slots already in use.
        free = total - used - reserved
        print(row_fmt
              % (host, total, used, free, reserved, load,
                 node_load_ratio(load, used)))
def summarize_hosttype_stats ( host_stats ) :
""" Further summarize the host stats by the host type (denoted by the
prefix of the hostname before the dash character , i . e . " c8 " for
@ -226,6 +301,7 @@ def print_hosttype_stats(hosttype_stats):
def help ( ) :
msg = """ \
% ( CMD ) s - Shows node status from SGE information
@ -263,7 +339,9 @@ def main_default(argv):
elif argv [ 1 ] in ( ' --raw ' , ' raw ' ) :
cmd = " raw "
elif argv [ 1 ] in ( ' --stats ' , ' stats ' , ' stat ' ) :
cmd = " stats "
cmd = " stats " # old stats, a.k.a. hosttype_stats
elif re . search ( r ' ^(--)?host-?stat ' , argv [ 1 ] ) :
cmd = " hoststats "
elif argv [ 1 ] in ( ' --help ' , ' help ' , ' -h ' ) :
help ( )
return 0
@ -311,6 +389,10 @@ def main_default(argv):
node_slot_stats_raw ( qstat_f_current ,
show_disabled_nodes = show_disabled_nodes ,
)
elif cmd == " hoststats " :
node_slot_stats ( qstat_f_current ,
show_disabled_nodes = show_disabled_nodes ,
)
elif cmd == " stats " :
node_slot_stats_per_machine_type ( qstat_f_current ,
show_disabled_nodes = show_disabled_nodes ,