Commit 96114711 authored by Azat Khuziyakhmetov
Browse files

updated nvidia plugin

parent 08130e49
......@@ -4,6 +4,8 @@
# Created by: Khuziyakhmetov, Azat <azat.khuziyakhmetov@gwdg.de>
# date: November 13, 2017
# Root directory of the CUDA installation.
CUDA_HOME="/cm/local/apps/cuda/libs/current"
# Short host name as reported by `hostname -s`.
HOSTNAME=$(hostname -s)
the_right_host_among() {
......@@ -28,7 +30,8 @@ the_right_host_among() {
# exit 0
#fi
# Append the CUDA library directory to the loader search path. The previous
# value is only prepended when it is non-empty: a leading ':' would create an
# empty path entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CUDA_HOME}/lib64"
# Path of the nvidia-smi binary inside the CUDA installation.
smi="${CUDA_HOME}/bin/nvidia-smi"
# Comma-separated list of fields passed to `nvidia-smi --query-gpu`.
gpu_params="index,gpu_name,gpu_bus_id,fan.speed,pstate,memory.total,memory.used,utilization.gpu,utilization.memory,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,ecc.errors.corrected.volatile.total,ecc.errors.corrected.aggregate.total,ecc.errors.uncorrected.volatile.total,ecc.errors.uncorrected.aggregate.total,retired_pages.sbe,retired_pages.dbe,temperature.gpu,power.draw,power.limit,enforced.power.limit,clocks.gr,clocks.sm,clocks.mem,clocks.video"
# Query the GPUs; output is requested as CSV without header line or units.
gpu_stats=$("$smi" --format=csv,noheader,nounits --query-gpu="$gpu_params")
......@@ -65,7 +68,7 @@ while read -r line; do
# Split the comma-separated $line into fields and store each one in proc_vals
# under the parameter name found at the same position in proc_param_idx.
# $cnt is the running field index; it is expected to be initialised by the
# surrounding code before this loop starts.
declare -A proc_vals
while read -r val; do
  # The placeholders "[Not Supported]" and "[Unknown Error]" are stored as -1
  # so that every field carries a usable value.
  if [[ "$val" == "[Not Supported]" || "$val" == "[Unknown Error]" ]]; then
    val=-1
  fi
  proc_vals[${proc_param_idx[$cnt]}]=$val
  cnt=$((cnt+1))
done <<< "$(printf '%s\n' "$line" | tr "," "\n")"
......@@ -73,6 +76,8 @@ while read -r line; do
# Skip rows whose pid was replaced by the -1 placeholder.
if [[ "${proc_vals[pid]}" == "-1" ]]; then continue; fi

# Batch job id of the process, read from SLURM_JOBID or LSF_JOBID in its
# environment. The regex has to be quoted: unquoted '(' and '|' are shell
# syntax and make the command substitution fail, leaving JOBID always empty.
# The pattern is anchored to the variable name and only the first match is
# kept, so JOBID is at most one line. It stays empty when neither variable is
# set or the environ file cannot be read.
JOBID=$(strings "/proc/${proc_vals[pid]}/environ" \
  | grep -E '^(SLURM|LSF)_JOBID=' \
  | head -n 1 \
  | cut -d= -f2)

# CPU-side statistics of the same process, in the column order of $cpu_params.
# NOTE(review): $cpu_params is left unquoted on purpose because its definition
# is not visible here and it may rely on word splitting - confirm.
cpu_stats=$(ps --no-headers -o $cpu_params -p "${proc_vals[pid]}")
# Reset the field counter for the loop that follows.
cnt=0
......@@ -88,7 +93,7 @@ while read -r line; do
# Backslash-escape spaces in the GPU name so it can be used as a tag value.
gpu_name=${proc_vals[gpu_name]// /\\ }
# Second ':'-separated field of the GPU bus id.
bus=$(cut -d':' -f2 <<< "${proc_vals[gpu_bus_id]}")
# Emit exactly one nvidia_proc record per process in InfluxDB-style line
# protocol (tags, then fields, then timestamp). The JOBID tag is only added
# when a job id was found, because a tag with an empty value is not valid
# line protocol.
printf '%s\n' "nvidia_proc,host=${HOSTNAME},gpu_name=${gpu_name},bus=${bus},username=${cpu_vals[uname]}${JOBID:+,JOBID=${JOBID}} pid=${proc_vals[pid]},name=\"${proc_vals[name]}\",used_memory=${proc_vals[used_memory]},cpu_pcpu=${cpu_vals[pcpu]},cpu_pmem=${cpu_vals[pmem]},cpu_maj_flt=${cpu_vals[maj_flt]},cpu_min_flt=${cpu_vals[min_flt]},cpu_nlwp=${cpu_vals[nlwp]},cpu_rss=${cpu_vals[rss]},cpu_vsz=${cpu_vals[vsz]} $timestamp"
# Advance the timestamp so the next record does not reuse the same value.
# NOTE(review): presumably nanoseconds, i.e. a one-second step - confirm.
timestamp=$((timestamp+1000000000))
done <<< "$proc_stats"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment