"""
Derived Quantities.
"""

# Hardware layout per partition: (number of servers, slots per server).
#   cpu   - 192 servers, 2 sockets x 8 cores (no hyperthreading) = 16 cores
#   tasna -   5 servers, 4 GTX 590 boards x 2 GPUs               =  8 slots
#   vesta -   2 servers, 8 K80 boards x 2 GPUs                   = 16 slots
_PARTITION_LAYOUT = {
    'cpu': (192, 16),
    'tasna': (5, 8),
    'vesta': (2, 16),
}

# Partitions whose slots are pooled into the aggregate 'gpu' figure.
_GPU_PARTITIONS = ('tasna', 'vesta')


def _available_slots(partition, number_of_nodes_down):
    """Slots still usable in a partition once down servers are removed."""
    servers, slots_per_server = _PARTITION_LAYOUT[partition]
    servers_down = number_of_nodes_down.get(partition, 0)
    # A down server takes *all* of its slots with it, so scale by the
    # per-server capacity. Clamp at zero in case more nodes are reported
    # down than the layout table knows about.
    return max((servers - servers_down) * slots_per_server, 0)


def _allocated_slots(partition, number_of_jobs_by_partition):
    """Slots currently in use in a partition (0 if nothing is reported)."""
    return number_of_jobs_by_partition.get(partition, {}).get('running', 0)


def _safe_ratio(allocated, available):
    """Return allocated / available as a float; 0.0 if nothing is available."""
    if available <= 0:
        return 0.0
    return float(allocated) / float(available)


def compute_utilization(number_of_nodes_down,
                        number_of_jobs_by_partition):
    """
    Compute Cluster Utilization.

    Utilization is the number of allocated slots divided by the number of
    slots on servers that are currently up. For every partition, a server
    that is down removes all of its slots (cores or GPUs) from the
    denominator.

    NOTE(review): the 'running' count is used directly as the number of
    allocated slots, i.e. this presumes one slot per running entry.
    Confirm against the producer of number_of_jobs_by_partition.

    A partition missing from either input is treated as "no nodes down" /
    "nothing running". A partition with no available slots reports 0.0
    instead of raising ZeroDivisionError, so that callers which format the
    value as a float always receive a finite number.

    @param: number_of_nodes_down - [Dict {'partition': 2, ...}]
    @param: number_of_jobs_by_partition - [Dict {'zbox': {'running': 1}, ...]
    @return: utilization - [Dict {'cpu': 0.3, 'tasna': 0.5, ..., 'gpu': 0.9}]
    """

    utilization = {}
    available = {}
    allocated = {}

    # Per-Partition Utilization
    for partition in _PARTITION_LAYOUT:
        available[partition] = \
            _available_slots(partition, number_of_nodes_down)
        allocated[partition] = \
            _allocated_slots(partition, number_of_jobs_by_partition)
        utilization[partition] = \
            _safe_ratio(allocated[partition], available[partition])

    # GPU Utilization (All GPU Partitions Pooled)
    gpu_available = sum(available[name] for name in _GPU_PARTITIONS)
    gpu_allocated = sum(allocated[name] for name in _GPU_PARTITIONS)
    utilization['gpu'] = _safe_ratio(gpu_allocated, gpu_available)

    return utilization