=Cluster Status=
SLURM offers a variety of tools to check the general status of nodes/partitions in a cluster.
 
==sinfo==
The sinfo command will show you the status of partitions in the cluster, in alphabetical order. Passing the -N flag will show each node individually, also in alphabetical order. In the STATE column, alloc means every CPU on the node is allocated, mix means only some are, and idle means none are.
<pre>
$ sinfo
PARTITION         AVAIL  TIMELIMIT  NODES  STATE NODELIST
cbcb                 up   infinite      9    mix cbcb[00,03,22,26-28],legacy[00,10,20]
cbcb                 up   infinite      3  alloc cbcb[01-02,29]
cbcb                 up   infinite     49   idle cbcb[04-20,23-25],legacy[01-09,11,13-19,21-28,30-31,34-35]
...
vulcan-scavenger     up   infinite     14    mix brigid[16-17],vulcan[23-25,27-30,32-33,36-37,45]
vulcan-scavenger     up   infinite      1  alloc vulcan35
vulcan-scavenger     up   infinite     34   idle brigid[18-19],vulcan[00-22,26,34,38-44]
</pre>
<pre>
$ sinfo -N
NODELIST     NODES         PARTITION STATE
brigid16         1        vulcan-cpu mix
brigid16         1         scavenger mix
brigid16         1      vulcan-dpart mix
brigid16         1  vulcan-scavenger mix
...
vulcan45         1     vulcan-ramani mix
vulcan45         1  vulcan-scavenger mix
vulcan45         1         scavenger mix
</pre>
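sinfo also accepts filters and custom output formats. A couple of quick examples (the tron partition name is taken from the scontrol example below; substitute any partition on your cluster):
<pre>
# show only idle nodes in the tron partition
$ sinfo -p tron -t idle

# one line per node with CPU count, memory (MB), GRES, and state
$ sinfo -N -p tron -o "%N %c %m %G %t"
</pre>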
==scontrol==
The scontrol command can be used to view the status/configuration of the nodes in the cluster. If passed specific node name(s), only information about those node(s) will be displayed; otherwise, all nodes will be listed. To specify multiple nodes, separate each node name with a comma (no spaces).
<pre>
$ scontrol show nodes tron05,tron13
NodeName=tron05 Arch=x86_64 CoresPerSocket=16
   CPUAlloc=28 CPUTot=32 CPULoad=47.32
   AvailableFeatures=rhel8,x86_64,Zen,EPYC-7302,Ampere
   ActiveFeatures=rhel8,x86_64,Zen,EPYC-7302,Ampere
   Gres=gpu:rtxa6000:8
   NodeAddr=tron05 NodeHostName=tron05 Version=21.08.5
   OS=Linux 4.18.0-348.20.1.el8_5.x86_64 #1 SMP Tue Mar 8 12:56:54 EST 2022
   RealMemory=257538 AllocMem=157696 FreeMem=197620 Sockets=2 Boards=1
   State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=100 Owner=N/A MCS_label=N/A
   Partitions=scavenger,tron
   BootTime=2022-04-21T17:40:51 SlurmdStartTime=2022-04-21T18:00:56
   LastBusyTime=2022-04-22T11:21:16
   CfgTRES=cpu=32,mem=257538M,billing=346,gres/gpu=8,gres/gpu:rtxa6000=8
   AllocTRES=cpu=28,mem=154G,gres/gpu=7,gres/gpu:rtxa6000=7
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

NodeName=tron13 Arch=x86_64 CoresPerSocket=16
   CPUAlloc=1 CPUTot=16 CPULoad=8.41
   AvailableFeatures=rhel8,x86_64,Zen,EPYC-7302P,Ampere
   ActiveFeatures=rhel8,x86_64,Zen,EPYC-7302P,Ampere
   Gres=gpu:rtxa4000:4
   NodeAddr=tron13 NodeHostName=tron13 Version=21.08.5
   OS=Linux 4.18.0-348.20.1.el8_5.x86_64 #1 SMP Tue Mar 8 12:56:54 EST 2022
   RealMemory=128525 AllocMem=65536 FreeMem=33463 Sockets=1 Boards=1
   State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=10 Owner=N/A MCS_label=N/A
   Partitions=scavenger,tron
   BootTime=2022-04-21T17:40:46 SlurmdStartTime=2022-04-21T17:54:51
   LastBusyTime=2022-04-22T13:04:57
   CfgTRES=cpu=16,mem=128525M,billing=173,gres/gpu=4,gres/gpu:rtxa4000=4
   AllocTRES=cpu=1,mem=64G,gres/gpu=4,gres/gpu:rtxa4000=4
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
</pre>
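scontrol show works for other entity types as well. For example, you can view a partition's configuration or the details of a specific job (replace <jobid> with an actual job ID):
<pre>
# show the configuration of the tron partition
$ scontrol show partition tron

# show the details of a specific job
$ scontrol show job <jobid>
</pre>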
==sacctmgr==
The sacctmgr command shows cluster accounting information. One helpful use is listing the available QoSes.
<pre>
$ sacctmgr list qos format=Name,Priority,MaxWall,MaxJobsPU
      Name   Priority     MaxWall MaxJobsPU
---------- ---------- ----------- ---------
    normal          0
     dpart          0  2-00:00:00         8
       gpu          0    08:00:00         2
</pre>
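sacctmgr can also list the associations for your user, which show the accounts, partitions, and QoSes you can submit under. A minimal sketch (the populated columns vary with a cluster's accounting configuration):
<pre>
# list the accounts, partitions, and QoSes associated with your user
$ sacctmgr show associations where user=$USER format=User,Account,Partition,QOS
</pre>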
