|
|
|
@ -62,9 +62,12 @@ class sh(object): |
|
|
|
|
return 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level state initialization.  setdefault() is used so that
# re-executing this file (e.g. in an interactive session) does not wipe
# previously gathered node lists.
globals().setdefault("NODE_LIST", [])

# NOTE(review): this initializes NODE_BAD_LIST as a *set*, but the later
# _g.setdefault("NODE_BAD_LIST", []) expects a list.  Since setdefault keeps
# the first value, NODE_BAD_LIST stays a set -- this looks like leftover from
# an older revision (see the commented-out line below); confirm which type
# is intended.
globals().setdefault("NODE_BAD_LIST", set())

# Shorthand handle on the module's global namespace.
_g = globals()

_g.setdefault("NODE_LIST", [])        # all known node hostnames

#_g.setdefault("NODE_BAD_LIST", set())

_g.setdefault("NODE_BAD_LIST", [])    # nodes that failed the access test

_g.setdefault("NODE_GOOD_LIST", [])   # nodes that passed the access test

# Root directory under which per-host output files are collected.
_g.setdefault("ROOT_DIR", "cluster-info")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_node_list(): |
|
|
|
@ -95,14 +98,17 @@ def rhost_run(host, cmdline): |
|
|
|
|
return rslt |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir="cluster-info"): |
|
|
|
|
def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir=None): |
|
|
|
|
"""Executes cmdline on each remote host (the list is given in and |
|
|
|
|
""" |
|
|
|
|
global ROOT_DIR |
|
|
|
|
from os.path import dirname, join, isdir |
|
|
|
|
path_join = join |
|
|
|
|
Verb = 100 |
|
|
|
|
if hosts is None: |
|
|
|
|
hosts = node_list() |
|
|
|
|
if rootdir is None: |
|
|
|
|
rootdir = ROOT_DIR |
|
|
|
|
for H in hosts: |
|
|
|
|
host_base = H.split(".")[0] |
|
|
|
|
outfname = path_join(rootdir, host_base, filename) |
|
|
|
@ -137,12 +143,69 @@ def test_accessible_hosts(hosts=None): |
|
|
|
|
return good_hosts, bad_hosts |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cpuinfo_extract_processor_names(fn, ht=False):
    # REFS:
    # https://access.redhat.com/discussions/480953
    """Extracts the names of processors from /proc/cpuinfo.

    Parses the file named by `fn` (a saved copy of /proc/cpuinfo) and
    returns a list of processor model names, one entry per logical CPU
    seen in the file.

    Parameters:
        fn -- path of the cpuinfo file to parse.
        ht -- if True, report all logical CPUs even when hyperthreading
              is detected (i.e. skip the halving hack below).

    WARNING: Hyperthreading is detected with a lame methodology,
    and only half of the number of cores are reported (i.e. only
    physical cores)."""
    A = []
    siblings_on_socket = None
    cores_on_socket = None
    with open(fn, "r") as F:
        for L in F:
            if L.startswith("model name"):
                modelname = L.split(":", 1)[1].strip()
                A.append(modelname)
            elif L.startswith("siblings"):
                # logical CPUs per physical socket
                siblings_on_socket = int(L.split(":", 1)[1].strip())
            elif L.startswith("cpu cores"):
                # physical cores per socket
                cores_on_socket = int(L.split(":", 1)[1].strip())

    # FIXME: Quick-and-dirty solution for hyperthreading;
    # see Red Hat site above; not 100% reliable if there are several
    # kinds of CPU models, which I don't think I'll ever encountered.
    if (not ht) \
       and siblings_on_socket is not None \
       and cores_on_socket is not None \
       and siblings_on_socket != cores_on_socket:
        assert cores_on_socket * 2 == siblings_on_socket
        # ^^otherwise it's not Hyperthreading, the code has to be fixed!

        # BUG FIX: len(A)/2 is a float under Python 3 and made this slice
        # raise TypeError; use floor division.
        A = A[0:len(A) // 2]  ### HACK!!!
        print("Warning: hyperthreading detected in %s" % fn)

    return A
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def agg_count_names(namelist):
    """Aggregates the names in namelist to names->count mapping, as a dict.

    Useful, e.g. for counting number of unique elements in a list.
    """
    counts = {}
    for name in namelist:
        counts[name] = counts.get(name, 0) + 1
    return counts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Below are the main gather tools |
|
|
|
|
|
|
|
|
|
def gather_cpuinfo(hosts=None):
    """Gather tool: for cpuinfo"""
    cmd = ("cat", "/proc/cpuinfo")
    rhosts_pipe_out(cmd, "cpuinfo.txt", hosts=hosts)
|
|
|
|
|
|
|
|
|
def gather_lscpu(hosts=None):
    """Gather tool: for lscpu"""
    # BUG FIX: ("lscpu") is just the string "lscpu", not a 1-tuple; pass a
    # real tuple so the command line is a sequence of argv words, consistent
    # with gather_cpuinfo and gather_uname_a.
    rhosts_pipe_out(("lscpu",), "lscpu.txt", hosts=hosts)
|
|
|
|
|
|
|
|
|
def gather_lspci(hosts=None): |
|
|
|
|
"""Gather tool: for lspci""" |
|
|
|
@ -152,5 +215,95 @@ def gather_free(hosts=None): |
|
|
|
|
"""Gather tool: for free""" |
|
|
|
|
rhosts_pipe_out(("free"), "free.txt", hosts=hosts) |
|
|
|
|
|
|
|
|
|
def gather_uname_a(hosts=None):
    """Gather tool: for ``uname -a`` (kernel/OS identification).

    (Docstring fixed: it previously said "for free" -- a copy-paste slip
    from gather_free.)"""
    rhosts_pipe_out(("uname", "-a"), "uname-a.txt", hosts=hosts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#def dict_str_sorted(d): |
|
|
|
|
# return "{" + ", ". |
|
|
|
|
|
|
|
|
|
def summarize_cpu(hosts=None):
    """Groups hosts by their CPU composition.

    Reads each host's previously gathered ``cpuinfo.txt`` (under
    ROOT_DIR/<host>/), counts processor models per host, and groups hosts
    that share the same composition.

    Parameters:
        hosts -- iterable of hostnames; defaults to NODE_GOOD_LIST
                 (previously, passing None crashed with a TypeError).

    Returns a dict keyed by a pretty-printed representation of the
    model->count mapping; each value is a dict with keys "cpu_count"
    (the mapping itself) and "hosts" (list of short hostnames).
    """
    from pprint import pformat
    global ROOT_DIR
    # BUG FIX: the old code iterated over `hosts` unconditionally, so the
    # documented default of None raised TypeError.  Fall back to the
    # accessible-node list, matching how analyze_cpu_composition calls us.
    if hosts is None:
        hosts = NODE_GOOD_LIST
    # Use the short (unqualified) host names, as the gather tools do.
    hosts_base = [H.split(".")[0] for H in hosts]
    getfile = lambda H, bn: os.path.join(ROOT_DIR, H, bn)

    px_hosts_by_type = {}

    for H in hosts_base:
        px_names = cpuinfo_extract_processor_names(getfile(H, "cpuinfo.txt"))
        px_group = agg_count_names(px_names)
        #print("%s : %s" % (H, px_group))

        px_group_key = pformat(px_group)  # use pretty representation

        try:
            px_hosts_by_type[px_group_key]["hosts"] += [H]
        except KeyError:
            px_hosts_by_type[px_group_key] = {
                "cpu_count": px_group,
                "hosts": [H]
            }

    return px_hosts_by_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_summarize_cpu(summary):
    """Pretty-prints a CPU composition summary produced by summarize_cpu().

    For every host type, prints the host count, processors per node, the
    per-type processor total and the member host names; finishes with the
    grand totals across all types.
    """
    total_procs = 0
    total_nodes = 0
    for type_key in sorted(summary.keys()):
        info = summary[type_key]
        node_count = len(info["hosts"])
        procs_per_node = sum(info["cpu_count"].values())
        print("%s:: %d hosts, %d procs/node, total %d procs" \
              % (type_key,
                 node_count,
                 procs_per_node,
                 node_count * procs_per_node,
                 ))
        print("")
        print(" " + " ".join(sorted(info["hosts"])))
        print("")
        total_procs += node_count * procs_per_node
        total_nodes += node_count

    print("Grand total %d procs" % total_procs)
    print("Grand total %d nodes" % total_nodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tally_summarize_cpu(summary):
    """Tallies up the total number of processors
    """
    # NOTE(review): stub -- the body is only this docstring, so the function
    # currently does nothing and returns None.  TODO: implement (the grand
    # totals are computed inline in print_summarize_cpu in the meantime).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_cpu_composition():
    """Summarizes and prints the CPU composition of the accessible nodes."""
    cpu_summary = summarize_cpu(NODE_GOOD_LIST)
    print_summarize_cpu(cpu_summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def Gather_all():
    """Master gathering routine, to gather everything all at once.

    It will take some time to gather every bit of information.
    Updates the module-level NODE_GOOD_LIST / NODE_BAD_LIST globals with
    the results of the accessibility test, then runs every gather tool
    against the accessible nodes.
    """
    global NODE_GOOD_LIST, NODE_BAD_LIST, NODE_LIST
    print("Testing node accesibility...")
    # BUG FIX: the callee is spelled test_accessible_hosts (double "ss");
    # the old call to test_accesible_hosts raised NameError at runtime.
    NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts()

    print("\nGathering cpuinfo...")
    gather_cpuinfo(NODE_GOOD_LIST)

    print("\nGathering lscpu...")
    gather_lscpu(NODE_GOOD_LIST)

    print("\nGathering lspci...")
    gather_lspci(NODE_GOOD_LIST)

    print("\nGathering free mem...")
    gather_free(NODE_GOOD_LIST)

    print("\nGathering uname...")
    gather_uname_a(NODE_GOOD_LIST)
|
|
|
|
|
|
|
|
|
|
|
|
|
|