diff options
author | Volker Hoffmann <volker@cheleb.net> | 2016-11-01 09:54:51 +0100 |
---|---|---|
committer | Volker Hoffmann <volker@cheleb.net> | 2016-11-01 09:54:51 +0100 |
commit | e41d81ff279691756151384c8e296257dc9372ab (patch) | |
tree | 53f9f5304029fa612552d73f23da630f1dcd5c8c | |
parent | 8178da6f07177376064c4c4c31830d2cb37cb82f (diff) |
feat: deal with "[Unknown Error]" in GPU power draw
-rw-r--r-- | Common/nvidia.py | 6 | ||||
-rw-r--r-- | ticker.py | 9 |
2 files changed, 12 insertions, 3 deletions
```diff
diff --git a/Common/nvidia.py b/Common/nvidia.py
index 3579c7b..e7d6188 100644
--- a/Common/nvidia.py
+++ b/Common/nvidia.py
@@ -60,9 +60,13 @@ def read_gpu_stats(node='vesta1', type='tesla'):
     df = pd.read_csv(fname, sep=', ', header=None, names=names_cols, \
                      engine='python')
 
+    # Deal With Errors
+    df.ix[df.power_draw == '[Unknown Error]', 'power_draw' ] = np.nan
+
     # Drop Units
     df.power_draw = \
-        df.power_draw.apply(lambda a: a.split(' ')[0])
+        df.ix[~pd.isnull(df.power_draw), \
+              'power_draw'].apply(lambda a: a.split(' ')[0])
     df.gpu_utilization = \
         df.gpu_utilization.apply(lambda a: a.split(' ')[0])
     df.memory_utilization = \
diff --git a/ticker.py b/ticker.py
--- a/ticker.py
+++ b/ticker.py
@@ -8,6 +8,7 @@
 import Common.influx as influx
 import Common.derived as derived
 import Common.nvidia as nvidia
 import time
+import numpy as np
 
 # #############################################################################
@@ -76,13 +77,17 @@ line = "room_temperature,room=zbox_room value=%.2f %i" % \
 lines.append(line)
 
 # GPU Stats, Tesla Cards
+# NB: InfluxDB cannot deal with NaN values. We thus skip submitting the row.
+# This is not ideal. We should think of a way to record such failures.
+# https://github.com/influxdata/influxdb/issues/4089
 for gpu_node in [ 'vesta1', 'vesta2' ]:
     df, gpu_epoch, sucess = nvidia.read_gpu_stats(node=gpu_node, type='tesla')
     for irow, [ index, row ] in enumerate(df.iterrows()):
         lines.append("gpu_temperature,node=%s,uuid=%s value=%.2f %i" % \
             (row.node, row.uuid, row.gpu_temperature, gpu_epoch))
-        lines.append("gpu_power_draw,node=%s,uuid=%s value=%.2f %i" % \
-            (row.node, row.uuid, row.power_draw, gpu_epoch))
+        if ~np.isnan(row.power_draw):
+            lines.append("gpu_power_draw,node=%s,uuid=%s value=%.2f %i" % \
+                (row.node, row.uuid, row.power_draw, gpu_epoch))
         lines.append("gpu_utilization,node=%s,uuid=%s value=%.2f %i" % \
             (row.node, row.uuid, row.gpu_utilization, gpu_epoch))
         lines.append("gpu_memory_utilization,node=%s,uuid=%s value=%.2f %i" % \
```