aboutsummaryrefslogtreecommitdiffstats
path: root/Common/slurm.py
blob: bb9381aeb72fe03830e3606f4d2dbe490fdce459 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
Slurm Polling Functions.
"""

import subprocess as sp


def get_number_of_nodes_down():
    """
    Get Down/Drained Nodes for Partitions.

    Runs one sinfo call per partition and uses the number of non-blank
    output lines as that partition's count.

    Slurm Command:
    sinfo --format=%o --list-reasons --noheader --partition=zbox

    @return: number_of_nodes_down - [Dict {'partition': 2, ...}]
             One entry per partition, plus the aggregate keys 'cpu'
             (zbox + serial + debug), 'gpu' (tasna + vesta) and 'all'
             (every partition).
    """

    cpu_partitions = ['zbox', 'serial', 'debug']
    gpu_partitions = ['tasna', 'vesta']
    partitions = cpu_partitions + gpu_partitions

    number_of_nodes_down = {}
    for partition in partitions:
        cmd = ['sinfo', '--format=%o',
               '--list-reasons', '--noheader',
               "--partition=%s" % partition]
        # universal_newlines=True makes communicate() return text on both
        # Python 2 and Python 3, so the output can be split into lines.
        p = sp.Popen(cmd, stdout=sp.PIPE, universal_newlines=True)
        # communicate() drains stdout and then reaps the child. Calling
        # wait() before it can deadlock once the child fills the pipe buffer.
        data, _ = p.communicate()
        # Only lines with content are counted, so empty or whitespace-only
        # output yields zero.
        number_of_nodes_down[partition] = \
            sum(1 for line in data.splitlines() if line.strip())

    # Aggregate per-partition counts into the CPU, GPU and overall totals.
    # The totals are computed before being stored so that an aggregate key
    # is never summed into another aggregate.
    groups = [('cpu', cpu_partitions),
              ('gpu', gpu_partitions),
              ('all', partitions)]
    for key, members in groups:
        number_of_nodes_down[key] = \
            sum(number_of_nodes_down[name] for name in members)

    return number_of_nodes_down


def get_cpu_allocations():
    """
    Get CPU Allocations per Partition. Also return sum over all CPU partitions.

    Runs one squeue call per partition and adds up the integer printed on
    each non-blank output line.

    Slurm Command:
    squeue --format=%C --partition zbox --noheader

    @return: number_of_allocated_cpus - [Dict {'partition': ncpus, ...}]
             One entry per partition (zbox, serial, debug) plus the key
             'cpu' holding the sum over those partitions.
    @raise ValueError: if a non-blank output line is not an integer.
    """

    # Old, Really Slow Command:
    # sacct --format=partition,alloccpus --allocations \
    #       --state=RUNNING --allusers --noheader --parsable2

    partitions = ['zbox', 'serial', 'debug']
    number_of_allocated_cpus = {}
    for partition in partitions:
        cmd = ['squeue',
               '--format=%C', '--noheader', "--partition=%s" % partition]
        # universal_newlines=True makes communicate() return text on both
        # Python 2 and Python 3, so the output can be split into lines.
        p = sp.Popen(cmd, stdout=sp.PIPE, universal_newlines=True)
        # communicate() drains stdout and then reaps the child. Calling
        # wait() before it can deadlock once the child fills the pipe buffer.
        data, _ = p.communicate()
        # Blank lines are skipped so they cannot break the int() conversion.
        number_of_allocated_cpus[partition] = \
            sum(int(line) for line in data.splitlines() if line.strip())

    # The total is computed before the 'cpu' key exists, so it covers the
    # per-partition entries only.
    number_of_allocated_cpus['cpu'] = sum(number_of_allocated_cpus.values())

    return number_of_allocated_cpus


def get_number_of_jobs_by_partition_and_state():
    """
    Get Number of Jobs by State and Partition.

    Runs one squeue call per (partition, state) pair and uses the number of
    non-blank output lines as the job count for that pair.

    Slurm Command:
    squeue --noheader --format=%T:%P --partition=zbox --state=pending

    @return number_of_jobs_by_partition - [Dict {'zbox': {'running': 1}, ...]
            One entry per partition, plus the aggregate keys 'cpu'
            (zbox + serial + debug), 'gpu' (tasna + vesta) and 'all'
            (every partition). Each value is a dict with the keys
            'pending' and 'running'.
    """

    cpu_partitions = ['zbox', 'serial', 'debug']
    gpu_partitions = ['tasna', 'vesta']
    partitions = cpu_partitions + gpu_partitions
    states = ['pending', 'running']

    # Get Raw Data
    number_of_jobs_by_partition = {}
    for partition in partitions:
        number_of_jobs_by_state = {}
        for state in states:
            cmd = ['squeue', '--noheader', '--format=%T:%P',
                   "--partition=%s" % partition,
                   "--state=%s" % state]
            # universal_newlines=True makes communicate() return text on
            # both Python 2 and Python 3, so it can be split into lines.
            p = sp.Popen(cmd, stdout=sp.PIPE, universal_newlines=True)
            # communicate() drains stdout and then reaps the child. Calling
            # wait() first can deadlock once the child fills the pipe buffer.
            data, _ = p.communicate()
            # Only lines with content are counted, so empty or
            # whitespace-only output yields zero.
            number_of_jobs_by_state[state] = \
                sum(1 for line in data.splitlines() if line.strip())
        number_of_jobs_by_partition[partition] = number_of_jobs_by_state

    # Aggregate per-partition counts into the CPU, GPU and overall totals.
    # Every group is summed over real partition names only, so an aggregate
    # entry is never folded into another aggregate.
    groups = [('cpu', cpu_partitions),
              ('gpu', gpu_partitions),
              ('all', partitions)]
    for key, members in groups:
        totals = {}
        for state in states:
            totals[state] = sum(number_of_jobs_by_partition[name][state]
                                for name in members)
        number_of_jobs_by_partition[key] = totals

    return number_of_jobs_by_partition