run, dump the process trees, etc.master
parent
8ae0841ca6
commit
8b99995409
1 changed files with 158 additions and 0 deletions
@ -0,0 +1,158 @@ |
|||||||
|
#!/bin/bash
#
# 20160713
# Wirawan Purwanto
#
# Locates where a batch job runs (by parsing qstat output) and can dump
# the job owner's process trees on the nodes involved (through ssh).

# Enable the extended pattern-matching operators, e.g. +(...), for the
# rest of the script.
shopt -s extglob
||||||
|
|
||||||
|
function find_run_hosts()
# Find where a job runs.
# Arg: <jobnumber>
# Output: whatever _Find_run_hosts extracts from "qstat -f" for that job.
# Returns 2 (with a message on stderr) when no job number is given.
{
  # Deliberately kept under this name: the variable is also visible to
  # _Find_run_hosts through bash dynamic scoping, in addition to being
  # passed explicitly below.
  local optJobNumber="$1"

  if [ -z "$optJobNumber" ]; then
    echo "find_run_hosts: job number required" >&2
    return 2
  fi

  qstat -f | _Find_run_hosts "$optJobNumber"
}
||||||
|
|
||||||
|
function _Find_run_hosts()
# Processing part of find_run_hosts.
# Takes qstat -f output on stdin.
# Arg: [<jobnumber>]  When omitted, falls back to the optJobNumber
#      variable inherited from the calling function.
# Output: for every line whose first field equals the job number, the
#      most recently seen host status line followed by that job line.
{
  local jobNumber="${1:-${optJobNumber:-}}"

  awk -v optJobNumber="$jobNumber" '
    # A line starting with "#" ends the per-host part of the listing
    # (presumably the banner of the pending-jobs section; confirm against
    # the local qstat).  Forget the current host so that jobs listed
    # after it are not attributed to the last host seen.
    /^#/ {
      host_info = ""
      next
    }

    # Valid host status field: starts with a letter and has 5 fields,
    # or 6 when a states column is present.
    ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) {
      host_info = $0
      next
    }

    # Job line for the requested job, listed under a known host.
    # The emptiness test keeps blank lines from matching when no job
    # number was supplied.
    (optJobNumber != "") && (host_info != "") && ($1 == optJobNumber) {
      print host_info
      print $0
    }
  ' # end awk script
}
||||||
|
|
||||||
|
|
||||||
|
function find_job_owner()
# Arg: <jobnumber>
# Output: the second field of the first "owner:" line printed by
#         "qstat -j <jobnumber>"; nothing if there is no such line.
# Returns 2 (with a message on stderr) when no job number is given.
{
  if [ -z "$1" ]; then
    echo "find_job_owner: job number required" >&2
    return 2
  fi

  qstat -j "$1" \
  | awk '$1 == "owner:" { print $2; exit }'
}
||||||
|
|
||||||
|
|
||||||
|
function find_job_master_node()
# Arg: <jobnumber>
# Finds the "master" node of the job, i.e. where the batch job script
# was first launched.
# Unfortunately qstat -j doesn't return the desired info, so we have to
# use the plain qstat listing to get it.
# Output: the host part (text after the last "@") of field 8 of the first
#         matching job line; nothing if the job has no such line.
# Returns 2 (with a message on stderr) when no job number is given.
{
  if [ -z "$1" ]; then
    echo "find_job_master_node: job number required" >&2
    return 2
  fi

  qstat \
  | awk -v optJobNumber="$1" '
    # Field 8 is only a queue instance when it looks like queue@host.
    # On a job line without a queue column (e.g. a job that is not
    # running) another column shifts into field 8; skip such lines
    # instead of printing that value as a host name.
    ($1 == optJobNumber) && ($8 ~ /@/) {
      queueHost = $8
      sub(/^.*@/, "", queueHost)
      print queueHost
      exit
    }' # end awk script
}
||||||
|
|
||||||
|
|
||||||
|
function list_job_nodes()
# Arg: <jobnumber>
# Output: one host name per line, taken from the host status lines that
#         find_run_hosts prints for the job (queue name and "@" removed).
#         Each host is listed once, in order of first appearance.
# Returns 2 (with a message on stderr) when no job number is given.
{
  if [ -z "$1" ]; then
    echo "list_job_nodes: job number required" >&2
    return 2
  fi

  find_run_hosts "$1" \
  | awk '
    # Same host-line test as used when parsing qstat -f output.
    ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) {
      host = $1
      sub(/^.*@/, "", host)
      # The same host can show up under more than one host status line;
      # report it only the first time.
      if (!(host in seen)) {
        seen[host] = 1
        print host
      }
    }' # end awk script
}
||||||
|
|
||||||
|
function dump_process_tree1()
# Arg: <host> <user>
# Runs "ps ux --forest -u <user>" on <host> through ssh and prints the
# header line plus the lines whose first column is <user>.
# Returns 2 (with a message on stderr) when host or user is empty.
{
  local HOST="$1" JOB_OWNER="$2"

  if [ -z "$HOST" ] || [ -z "$JOB_OWNER" ]; then
    echo "dump_process_tree1: host and user required" >&2
    return 2
  fi

  # Note: the awk filter is a workaround; the ps invocation also printed
  # other users' processes, notably those of the user running this script.
  ssh "$HOST" ps ux --forest -u "$JOB_OWNER" \
  | awk -v JobOwner="$JOB_OWNER" '($1 == "USER") || ($1 == JobOwner) { print }'
}
||||||
|
|
||||||
|
|
||||||
|
function dump_process_trees()
# Arg: <jobnumber>
# Prints "job_number:" and "master_host:" lines, then the job owner's
# process tree on the master host, then a "host:" line and process tree
# for every other node of the job.
# Returns 1 (with a message on stderr) when the job owner cannot be
# determined.
{
  local SGE_JOB_ID="$1"
  local MASTER_HOST HOST
  local JOB_OWNER
  local SEEN_HOSTS
  local -a ALL_HOSTS
  ALL_HOSTS=()

  echo "job_number: $SGE_JOB_ID"
  MASTER_HOST=$(find_job_master_node "$SGE_JOB_ID")
  JOB_OWNER=$(find_job_owner "$SGE_JOB_ID")

  # Read the node list line by line so that no word splitting or
  # pathname expansion is applied to it.
  while IFS= read -r HOST; do
    if [ -n "$HOST" ]; then
      ALL_HOSTS+=( "$HOST" )
    fi
  done < <(list_job_nodes "$SGE_JOB_ID")

  if [ -z "$JOB_OWNER" ]; then
    echo "dump_process_trees: cannot determine owner of job $SGE_JOB_ID" >&2
    return 1
  fi

  echo "master_host: $MASTER_HOST"
  if [ -n "$MASTER_HOST" ]; then
    dump_process_tree1 "$MASTER_HOST" "$JOB_OWNER"
  else
    # Without a master host there is nothing to ssh into.
    echo "dump_process_trees: no master host found for job $SGE_JOB_ID" >&2
  fi

  # Space-delimited list of hosts already handled; the master host is
  # seeded so that it is not dumped a second time.
  SEEN_HOSTS=" $MASTER_HOST "
  for HOST in "${ALL_HOSTS[@]}"; do
    case "$SEEN_HOSTS" in
      (*" $HOST "*) continue ;;
    esac
    SEEN_HOSTS="$SEEN_HOSTS$HOST "
    echo "host: $HOST"
    dump_process_tree1 "$HOST" "$JOB_OWNER"
  done
}
||||||
|
|
||||||
|
|
||||||
|
# Main program switchboard

function _require_job_id()
# Arg: <jobnumber-or-empty>
# Terminates the script with status 2 when the job ID (taken from the
# second command-line argument) is empty.
{
  if [ -z "$1" ]; then
    echo "Job ID required as arg 2" >&2
    exit 2
  fi
}

function main()
# Args: the script's command-line arguments.
#   <jobnumber>            print the host status and job lines of the job
#   process|proc|px ... <jobnumber>   dump the owner's process trees
#   head|master ...     <jobnumber>   print the master node of the job
#   list|node|nodes ... <jobnumber>   list all nodes used by the job
# Any other first argument (including none) is reported as an unknown
# action and the script exits with status 2.
{
  case "$1" in
    (--process*|process*|proc|procs|px)
      # Dumps the job owner's process trees on the master node and on
      # every other node of the job.
      _require_job_id "$2"
      SGE_JOB_ID="$2"
      dump_process_trees "$SGE_JOB_ID"
      ;;
    (--head-node|--head|head|headnode|head-node|--master-node|--master|master|masternode|master-node)
      # Prints where the head node of the job is (i.e. the master node
      # of the job), where the job script was first executing.
      _require_job_id "$2"
      SGE_JOB_ID="$2"
      find_job_master_node "$SGE_JOB_ID"
      ;;
    (--list-node*|--list|list|listnode*|list-node*|node|nodes)
      # Lists all the compute nodes being used by this job.
      _require_job_id "$2"
      SGE_JOB_ID="$2"
      list_job_nodes "$SGE_JOB_ID"
      ;;
    (""|*[!0-9]*)
      # Empty, or contains a character that is not a digit, and matched
      # none of the action names above.
      echo "Unknown action: $1" >&2
      exit 2
      ;;
    (*)
      # Only digits remain at this point: a bare job number.  Written
      # with plain patterns so that the test does not depend on extglob
      # being enabled when this statement is parsed.
      SGE_JOB_ID="$1"
      find_run_hosts "$SGE_JOB_ID"
      ;;
  esac
}

main "$@"
||||||
|
|
||||||
|
#SGE_JOB_ID="$1" |
||||||
|
#find_run_hosts "$SGE_JOB_ID" |
||||||
|
#find_job_owner "$SGE_JOB_ID" |
||||||
|
#find_job_head_node "$SGE_JOB_ID" |
||||||
|
|
Loading…
Reference in new issue