|
|
@@ -62,9 +62,12 @@ class sh(object):
         return 0


-globals().setdefault("NODE_LIST", [])
-globals().setdefault("NODE_BAD_LIST", set())
+_g = globals()
+_g.setdefault("NODE_LIST", [])
+#_g.setdefault("NODE_BAD_LIST", set())
+_g.setdefault("NODE_BAD_LIST", [])
+_g.setdefault("NODE_GOOD_LIST", [])
+_g.setdefault("ROOT_DIR", "cluster-info")


 def get_node_list():
@@ -95,14 +98,17 @@ def rhost_run(host, cmdline):
     return rslt


-def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir="cluster-info"):
+def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir=None):
     """Executes cmdline on each remote host (the list is given in and
     """
+    global ROOT_DIR
     from os.path import dirname, join, isdir
     path_join = join
     Verb = 100
     if hosts is None:
         hosts = node_list()
+    if rootdir is None:
+        rootdir = ROOT_DIR
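+        # (Callers may now omit rootdir; output then lands under the
+        # module-wide ROOT_DIR, which defaults to "cluster-info".)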
     for H in hosts:
         host_base = H.split(".")[0]
         outfname = path_join(rootdir, host_base, filename)
@@ -137,12 +143,69 @@ def test_accessible_hosts(hosts=None):
     return good_hosts, bad_hosts


+def cpuinfo_extract_processor_names(fn, ht=False):
+    # REFS:
+    # https://access.redhat.com/discussions/480953
+    """Extracts the names of processors from /proc/cpuinfo.
+    Returns them as a list of processor names.
+
+    WARNING: Hyperthreading is detected with a lame methodology,
+    and only half of the number of cores are reported (i.e. only
+    physical cores)."""
+    A = []
+    siblings_on_socket = None
+    cores_on_socket = None
+    with open(fn, "r") as F:
+        for L in F:
+            if L.startswith("model name"):
+                modelname = L.split(":", 1)[1].strip()
+                A.append(modelname)
+            elif L.startswith("siblings"):
+                siblings_on_socket = int(L.split(":", 1)[1].strip())
+            elif L.startswith("cpu cores"):
+                cores_on_socket = int(L.split(":", 1)[1].strip())
+
+    #print "siblings: ", siblings_on_socket
+    #print "cores: ", cores_on_socket
+
+    # FIXME: Quick-and-dirty solution for hyperthreading;
+    # see Red Hat site above; not 100% reliable if there are several
+    # kinds of CPU models, which I don't think I'll ever encounter.
+    if (not ht) \
+       and siblings_on_socket is not None \
+       and cores_on_socket is not None \
+       and siblings_on_socket != cores_on_socket:
+        assert cores_on_socket*2 == siblings_on_socket
+        # ^^otherwise it's not Hyperthreading, the code has to be fixed!
+
+        A = A[0:len(A)//2] ### HACK!!!
+        print("Warning: hyperthreading detected in %s" % fn)
+
+    return A
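
+# Example input for cpuinfo_extract_processor_names() (field values are
+# only illustrative):
+#     model name : Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+#     siblings   : 28
+#     cpu cores  : 14
+# A stanza where "siblings" differs from "cpu cores" triggers the halving
+# above (the assert insists the ratio is exactly 2).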


+def agg_count_names(namelist):
+    """Aggregates the names in namelist to names->count mapping, as a dict.
+    Useful, e.g. for counting number of unique elements in a list.
+    """
+    A = {}
+    for C in namelist:
+        try:
+            A[C] = A[C] + 1
+        except KeyError:
+            A[C] = 1
+    return A
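
+# (For example, agg_count_names(["A", "B", "A"]) returns {"A": 2, "B": 1}.)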


 # Below are the main gather tools

 def gather_cpuinfo(hosts=None):
     """Gather tool: for cpuinfo"""
     rhosts_pipe_out(("cat", "/proc/cpuinfo"), "cpuinfo.txt", hosts=hosts)


+def gather_lscpu(hosts=None):
+    """Gather tool: for lscpu"""
+    rhosts_pipe_out(("lscpu"), "lscpu.txt", hosts=hosts)


 def gather_lspci(hosts=None):
     """Gather tool: for lspci"""
@@ -152,5 +215,95 @@ def gather_free(hosts=None):
     """Gather tool: for free"""
     rhosts_pipe_out(("free"), "free.txt", hosts=hosts)


+def gather_uname_a(hosts=None):
+    """Gather tool: for uname -a"""
+    rhosts_pipe_out(("uname", "-a"), "uname-a.txt", hosts=hosts)

+#def dict_str_sorted(d):
+#    return "{" + ", ".


+def summarize_cpu(hosts=None):
+    from pprint import pformat
+    global ROOT_DIR
+    hosts_base = [ H.split(".")[0] for H in hosts ]
+    getfile = lambda H, bn: os.path.join(ROOT_DIR, H, bn)
+    cpu_info = []
+
+    px_hosts_by_type = {}
+
+    for H in hosts_base:
+        px_names = cpuinfo_extract_processor_names(getfile(H, "cpuinfo.txt"))
+        px_group = agg_count_names(px_names)
+        #print("%s : %s" % (H, px_group))
+
+        px_group_key = pformat(px_group) # use pretty representation
+
+        try:
+            px_hosts_by_type[px_group_key]["hosts"] += [ H ]
+        except KeyError:
+            px_hosts_by_type[px_group_key] = {
+                "cpu_count": px_group,
+                "hosts": [ H ]
+            }
+
+    return px_hosts_by_type
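
+# summarize_cpu() returns a dict keyed by the pformat()'ed cpu-count mapping;
+# each value is {"cpu_count": <model name -> count>, "hosts": [<short names>]}.
+# print_summarize_cpu() below renders that structure.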


+def print_summarize_cpu(summary):
+    host_types = sorted(summary.keys())
+    nproc_grand_total = 0
+    nnode_grand_total = 0
+    for T in host_types:
+        rec = summary[T]
+        nproc_per_node = sum(rec["cpu_count"].values())
+        print("%s:: %d hosts, %d procs/node, total %d procs" \
+              % (T,
+                 len(rec["hosts"]),
+                 nproc_per_node,
+                 len(rec["hosts"]) * nproc_per_node,
+                ))
+        print("")
+        print(" " + " ".join(sorted(rec["hosts"])))
+        print("")
+        nproc_grand_total += len(rec["hosts"]) * nproc_per_node
+        nnode_grand_total += len(rec["hosts"])
+
+    print("Grand total %d procs" % nproc_grand_total)
+    print("Grand total %d nodes" % nnode_grand_total)
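
+    # Illustrative output (hostnames and counts are made up):
+    #   {'Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz': 28}:: 4 hosts, 28 procs/node, total 112 procs
+    #
+    #    node01 node02 node03 node04
+    #
+    #   Grand total 112 procs
+    #   Grand total 4 nodes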


+def tally_summarize_cpu(summary):
+    """Tallies up the total number of processors
+    """


+def analyze_cpu_composition():
+    summ = summarize_cpu(NODE_GOOD_LIST)
+    print_summarize_cpu(summ)


+def Gather_all():
+    """Master gathering routine, to gather everything all at once.
+
+    It will take some time to gather every bit of information.
+    """
+    global NODE_GOOD_LIST, NODE_BAD_LIST, NODE_LIST
+    print("Testing node accessibility...")
+    NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts()
+
+    print("\nGathering cpuinfo...")
+    gather_cpuinfo(NODE_GOOD_LIST)
+
+    print("\nGathering lscpu...")
+    gather_lscpu(NODE_GOOD_LIST)
+
+    print("\nGathering lspci...")
+    gather_lspci(NODE_GOOD_LIST)
+
+    print("\nGathering free mem...")
+    gather_free(NODE_GOOD_LIST)
+
+    print("\nGathering uname...")
+    gather_uname_a(NODE_GOOD_LIST)
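

+# Typical interactive use (a sketch): Gather_all() pulls the raw files for the
+# reachable nodes into ROOT_DIR, then analyze_cpu_composition() prints the CPU
+# breakdown, e.g.:
+#
+#     >>> Gather_all()
+#     >>> analyze_cpu_composition()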
|
|
|