diff options
author | Volker Hoffmann <volker@cheleb.net> | 2016-02-07 11:44:57 +0100 |
---|---|---|
committer | Volker Hoffmann <volker@cheleb.net> | 2016-02-07 11:44:57 +0100 |
commit | 61ce5110a013552a875ac81ccca897c108e61ca7 (patch) | |
tree | c40b51b687dbedc6c2697c48c1130c94b1f9f23f | |
parent | d77f9c7470ab40d32dcf00d7018d85530c5ec426 (diff) |
feat: add gpu stats reporting
-rw-r--r-- | Common/nvidia.py | 74 | ||||
-rw-r--r-- | ticker.py | 18 |
2 files changed, 92 insertions, 0 deletions
# File: Common/nvidia.py (new file in this commit)
"""
NVIDIA Interaction.
"""

import glob
import os

import numpy as np
import pandas as pd


# Directory the GPU nodes dump their nvidia-smi CSV files into (production).
# For development, pass basedir='Test/' to read_gpu_stats() instead.
DEFAULT_BASEDIR = '/home/ics/volker/TmpDash/'

# Column names, in the order of the nvidia-smi --query-gpu fields.
_COLUMN_NAMES = ['gpu_id', 'uuid', 'gpu_name', 'gpu_temperature',
                 'gpu_utilization', 'memory_utilization', 'power_draw']

# Columns that nvidia-smi reports with a trailing unit ("45 %", "55.31 W").
_UNIT_COLUMNS = ['power_draw', 'gpu_utilization', 'memory_utilization']


def _epoch_from_filename(path):
    """Return the integer epoch encoded in "<node>_<epoch>.csv", or None."""
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return int(stem.rsplit('_', 1)[-1])
    except ValueError:
        return None


def _leading_number(value):
    """Return the number before the first space of value as float, or NaN."""
    try:
        return float(str(value).split(' ')[0])
    except ValueError:
        # E.g. "[Not Supported]" / "[N/A]" for fields the GPU cannot report.
        return np.nan


def read_gpu_stats(node='vesta1', basedir=DEFAULT_BASEDIR):
    """
    Reads stats from the newest CSV file that was dumped on a GPU node.

    This query generates the CSV file. Run on GPU node.
    nvidia-smi --query-gpu=index,uuid,name,temperature.gpu,utilization.gpu,utilization.memory,power.draw --format=csv,noheader > vesta1_`date '+%s'`.csv

    The CSV file is named "$node_$unix_epoch_in_seconds.csv".
    For example, "vesta1_1454748497.csv".

    The newest file is the one with the largest epoch in its name (compared
    numerically). Files whose suffix is not an integer are ignored. Unit
    suffixes are dropped from power_draw, gpu_utilization and
    memory_utilization, which are returned as float64; readings that are not
    numbers become NaN.

    @param: node - GPU node, used to construct filename [String]
    @param: basedir - Directory holding the CSV dumps [String]
    @return: df - GPU Stats, None on failure [Pandas Dataframe]
    @return: epoch - Time (Seconds since 01-01-1970), None on failure [Integer]
    @return: success - Did we get data? [Boolean]
    """

    # Collect (epoch, filename) for every dump of this node.
    candidates = []
    for path in glob.glob(os.path.join(basedir, "%s_*.csv" % node)):
        epoch = _epoch_from_filename(path)
        if epoch is not None:
            candidates.append((epoch, path))

    # Is there anybody out there?
    if not candidates:
        return None, None, False

    # Numeric comparison, so "_1000" is newer than "_999".
    epoch, fname = max(candidates)

    try:
        df = pd.read_csv(fname, sep=',', skipinitialspace=True,
                         header=None, names=_COLUMN_NAMES)
    except pd.errors.EmptyDataError:
        # The shell redirect creates the file before nvidia-smi fills it.
        return None, None, False

    if df.empty:
        return None, None, False

    # Drop Units, Fix Types
    for column in _UNIT_COLUMNS:
        df[column] = df[column].apply(_leading_number).astype(np.float64)

    # Add Node Column
    df['node'] = str(node)

    return df, epoch, True


# ---------------------------------------------------------------------------
# File: ticker.py (hunks from the same commit, kept here for reference only;
# the rest of that script is not visible).
#
# NOTE(review): as committed, the loop ignored the (misspelled) `sucess` flag
# and called df.iterrows() on None whenever a node had no dump, which raises
# AttributeError. The guarded form below checks the flag first.
#
#   import Common.nvidia as nvidia
#
#   # GPU Stats
#   for gpu_node in [ 'vesta1', 'vesta2' ]:
#       df, gpu_epoch, success = nvidia.read_gpu_stats(node=gpu_node)
#       if not success:
#           continue
#       for index, row in df.iterrows():
#           line_01 = "gpu_temperature,node=%s,uuid=%s value=%.2f %i" % \
#               (row.node, row.uuid, row.gpu_temperature, gpu_epoch)
#           line_02 = "gpu_power_draw,node=%s,uuid=%s value=%.2f %i" % \
#               (row.node, row.uuid, row.power_draw, gpu_epoch)
#           line_03 = "gpu_utilization,node=%s,uuid=%s value=%.2f %i" % \
#               (row.node, row.uuid, row.gpu_utilization, gpu_epoch)
#           line_04 = "gpu_memory_utilization,node=%s,uuid=%s value=%.2f %i" % \
#               (row.node, row.uuid, row.memory_utilization, gpu_epoch)
#           lines.append(line_01)
#           lines.append(line_02)
#           lines.append(line_03)
#           lines.append(line_04)
#
#   # Join
#   data = "\n".join(lines)
# ---------------------------------------------------------------------------