Check current and past jobs
Check currently running jobs
The sfapi_client
can easily be used to get your current jobs running on the system, or information about past jobs.
First we'll import the required libraries, AsyncClient
to handle the requests and the Machine
enum to get the correct resource.
# import client library
from sfapi_client import AsyncClient
from sfapi_client.compute import Machine
# this will help display the outputs later
import json
# Print our results nicely to the notebook
def print_json(j):
    """Pretty-print ``j`` to stdout as JSON indented by four spaces.

    Args:
        j: Any object ``json.dumps`` can serialize, e.g. a dict or a
            list of dicts.

    Returns:
        None. The formatted text is only written to stdout.
    """
    # ``print`` always returns None, so there is nothing useful to return.
    print(json.dumps(j, indent=4))
# Set this to your own username before running the cells below
user_name = "elvis"
The next lines of code will create a client which can be used to get a Compute
object. This object can be used to look at the current and past job queues, list files and directories, and run simple commands on the system.
In this example we will get the currently running jobs for the user elvis
.
async with AsyncClient() as client:
perlmutter = await client.compute(Machine.perlmutter)
# This selects just the jobs in the regular cpu partition on perlmutter
jobs = await perlmutter.jobs(user=user_name, partition='regular_milan_ss11')
All the jobs currently in the job queue are stored in the newly created jobs
list. An easy way of viewing this list is to print the objects in the list.
# Convert each job object to a plain dict, then pretty-print the whole list
print_json([queued.dict() for queued in jobs])
[ { "account": "ntrain", "tres_per_node": "N/A", "min_cpus": "1", "min_tmp_disk": "0", "end_time": "N/A", "features": "cpu", "group": "12345", "over_subscribe": "NO", "jobid": "8407414", "name": "large_job.sh", "comment": "(null)", "time_limit": "12:00:00", "min_memory": "0", "req_nodes": "", "command": "/global/homes/e/elvis/job_subs/large_job.sh", "priority": "67684", "qos": "regular_1", "reason": "Priority", "field_": null, "st": "PD", "user": "elvis", "reservation": "(null)", "wckey": "(null)", "exc_nodes": "", "nice": "0", "s_c_t": "*:*:*", "exec_host": "n/a", "cpus": "25", "nodes": "25", "dependency": "(null)", "array_job_id": "8407414", "sockets_per_node": "*", "cores_per_socket": "*", "threads_per_core": "*", "array_task_id": "N/A", "time_left": "12:00:00", "time": "0:00", "nodelist": "", "contiguous": "0", "partition": "regular_milan_ss11", "nodelist_reason_": "(Priority)", "start_time": "N/A", "state": "PENDING", "uid": "12345", "submit_time": "2023-05-02T18:10:51", "licenses": "u1:1", "core_spec": "N/A", "schednodes": "(null)", "work_dir": "/global/homes/e/elvis/job_subs" }, { "account": "ntrain", "tres_per_node": "N/A", "min_cpus": "1", "min_tmp_disk": "0", "end_time": "N/A", "features": "cpu", "group": "12345", "over_subscribe": "NO", "jobid": "8407432", "name": "large_job.sh", "comment": "(null)", "time_limit": "12:00:00", "min_memory": "0", "req_nodes": "", "command": "/global/homes/e/elvis/job_subs/large_job.sh", "priority": "67683", "qos": "regular_1", "reason": "Priority", "field_": null, "st": "PD", "user": "elvis", "reservation": "(null)", "wckey": "(null)", "exc_nodes": "", "nice": "0", "s_c_t": "*:*:*", "exec_host": "n/a", "cpus": "25", "nodes": "25", "dependency": "(null)", "array_job_id": "8407432", "sockets_per_node": "*", "cores_per_socket": "*", "threads_per_core": "*", "array_task_id": "N/A", "time_left": "12:00:00", "time": "0:00", "nodelist": "", "contiguous": "0", "partition": "regular_milan_ss11", "nodelist_reason_": "(Priority)", 
"start_time": "N/A", "state": "PENDING", "uid": "12345", "submit_time": "2023-05-02T18:11:30", "licenses": "u1:1", "core_spec": "N/A", "schednodes": "(null)", "work_dir": "/global/homes/e/elvis/job_subs" } ]
A single job can be pulled from the list for further inspection. This returns a Squeue
object which is the output you would get from running the command squeue
from Slurm. This is useful for getting jobs which are currently pending or running.
# Bare expression: the notebook echoes the repr of the first job in the list
jobs[0]
AsyncJobSqueue(account='ntrain', tres_per_node='N/A', min_cpus='1', min_tmp_disk='0', end_time='N/A', features='cpu', group='12345', over_subscribe='NO', jobid='8407414', name='large_job.sh', comment='(null)', time_limit='12:00:00', min_memory='0', req_nodes='', command='/global/homes/e/elvis/job_subs/large_job.sh', priority='67684', qos='regular_1', reason='Priority', field_=None, st='PD', user='elvis', reservation='(null)', wckey='(null)', exc_nodes='', nice='0', s_c_t='*:*:*', exec_host='n/a', cpus='25', nodes='25', dependency='(null)', array_job_id='8407414', sockets_per_node='*', cores_per_socket='*', threads_per_core='*', array_task_id='N/A', time_left='12:00:00', time='0:00', nodelist='', contiguous='0', partition='regular_milan_ss11', nodelist_reason_='(Priority)', start_time='N/A', state=<JobState.PENDING: 'PENDING'>, uid='12345', submit_time='2023-05-02T18:10:51', licenses='u1:1', core_spec='N/A', schednodes='(null)', work_dir='/global/homes/e/elvis/job_subs', compute=AsyncCompute(name='perlmutter', full_name='Perlmutter', description='System is active', system_type='compute', notes=[], status=<StatusValue.active: 'active'>, updated_at=datetime.datetime(2023, 4, 28, 21, 19, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), client=<sfapi_client._async.client.AsyncClient object at 0x10bb3ffd0>))
More specific information about the job can also be retrieved from this object, like the number of nodes or its jobid
.
# Report two attributes of the first job, one per line
print(
    f"Number of nodes = {jobs[0].nodes}",
    f"jobid = {jobs[0].jobid}",
    sep="\n",
)
Number of nodes = 25 jobid = 8407414
The information for a job can also be retrieved later based on the jobid. This can be useful for seeing if a job completed, failed, or was canceled.
async with AsyncClient() as client:
perlmutter = await client.compute(Machine.perlmutter)
# Get the running job based on it's job ID
job = await perlmutter.job(jobid=jobs[0].jobid)
# Print out the current status of the job
print_json(job.dict())
{ "account": "ntrain", "admincomment": "", "alloccpus": "25", "allocnodes": "0", "alloctres": "", "associd": "206287", "avecpu": "", "avecpufreq": "", "avediskread": "", "avediskwrite": "", "avepages": "", "averss": "", "avevmsize": "", "blockid": "", "cluster": "perlmutter", "comment": "", "constraints": "cpu", "consumedenergy": "0", "consumedenergyraw": "0", "cputime": "00:00:00", "cputimeraw": "0", "dbindex": "64266939", "derivedexitcode": "0:0", "elapsed": "00:00:00", "elapsedraw": "0", "eligible": "2023-05-02T18:10:51", "end": "Unknown", "exitcode": "0:0", "flags": "StartRecieved", "gid": "12345", "group": "elvis", "jobid": "8407414", "jobidraw": "8407414", "jobname": "large_job.sh", "layout": "", "maxdiskread": "", "maxdiskreadnode": "", "maxdiskreadtask": "", "maxdiskwrite": "", "maxdiskwritenode": "", "maxdiskwritetask": "", "maxpages": "", "maxpagesnode": "", "maxpagestask": "", "maxrss": "", "maxrssnode": "", "maxrsstask": "", "maxvmsize": "", "maxvmsizenode": "", "maxvmsizetask": "", "mcslabel": "", "mincpu": "", "mincpunode": "", "mincputask": "", "ncpus": "25", "nnodes": "25", "nodelist": "None assigned", "ntasks": "", "priority": "67679", "partition": "regular_milan_ss11", "qos": "regular_1", "qosraw": "16", "reason": "None", "reqcpufreq": "Unknown", "reqcpufreqmin": "Unknown", "reqcpufreqmax": "Unknown", "reqcpufreqgov": "Unknown", "reqcpus": "25", "reqmem": "12200050M", "reqnodes": "25", "reqtres": "billing=25,cpu=25,mem=12200050M,node=25", "reservation": "", "reservationid": "", "reserved": null, "resvcpu": null, "resvcpuraw": null, "start": "Unknown", "state": "PENDING", "submit": "2023-05-02T18:10:51", "suspended": "00:00:00", "systemcpu": "00:00:00", "systemcomment": "", "timelimit": "12:00:00", "timelimitraw": "720", "totalcpu": "00:00:00", "tresusageinave": "", "tresusageinmax": "", "tresusageinmaxnode": "", "tresusageinmaxtask": "", "tresusageinmin": "", "tresusageinminnode": "", "tresusageinmintask": "", "tresusageintot": "", 
"tresusageoutave": "", "tresusageoutmax": "", "tresusageoutmaxnode": "", "tresusageoutmaxtask": "", "tresusageoutmin": "", "tresusageoutminnode": "", "tresusageoutmintask": "", "tresusageouttot": "", "uid": "12345", "user": "elvis", "usercpu": "00:00:00", "wckey": "", "wckeyid": "0", "workdir": "/global/homes/e/elvis/job_subs" }
Multiple jobs can also be retrieved simultaneously later on by creating a list of jobids. This can reduce the number of calls needed to the Superfacility REST API and get your results back faster.
async with AsyncClient() as client:
perlmutter = await client.compute(Machine.perlmutter)
# Get the running job based on it's job ID
# This time we'll get information from sacct
jobs = await perlmutter.jobs(jobids=[jobs[0].jobid, jobs[1].jobid], command='sacct')
# Print out the current status of the job
print_json([j.dict() for j in jobs])
[ { "account": "ntrain", "admincomment": "", "alloccpus": "25", "allocnodes": "0", "alloctres": "", "associd": "206287", "avecpu": "", "avecpufreq": "", "avediskread": "", "avediskwrite": "", "avepages": "", "averss": "", "avevmsize": "", "blockid": "", "cluster": "perlmutter", "comment": "", "constraints": "cpu", "consumedenergy": "0", "consumedenergyraw": "0", "cputime": "00:00:00", "cputimeraw": "0", "dbindex": "64266939", "derivedexitcode": "0:0", "elapsed": "00:00:00", "elapsedraw": "0", "eligible": "2023-05-02T18:10:51", "end": "Unknown", "exitcode": "0:0", "flags": "StartRecieved", "gid": "12345", "group": "elvis", "jobid": "8407414", "jobidraw": "8407414", "jobname": "large_job.sh", "layout": "", "maxdiskread": "", "maxdiskreadnode": "", "maxdiskreadtask": "", "maxdiskwrite": "", "maxdiskwritenode": "", "maxdiskwritetask": "", "maxpages": "", "maxpagesnode": "", "maxpagestask": "", "maxrss": "", "maxrssnode": "", "maxrsstask": "", "maxvmsize": "", "maxvmsizenode": "", "maxvmsizetask": "", "mcslabel": "", "mincpu": "", "mincpunode": "", "mincputask": "", "ncpus": "25", "nnodes": "25", "nodelist": "None assigned", "ntasks": "", "priority": "67679", "partition": "regular_milan_ss11", "qos": "regular_1", "qosraw": "16", "reason": "None", "reqcpufreq": "Unknown", "reqcpufreqmin": "Unknown", "reqcpufreqmax": "Unknown", "reqcpufreqgov": "Unknown", "reqcpus": "25", "reqmem": "12200050M", "reqnodes": "25", "reqtres": "billing=25,cpu=25,mem=12200050M,node=25", "reservation": "", "reservationid": "", "reserved": null, "resvcpu": null, "resvcpuraw": null, "start": "Unknown", "state": "PENDING", "submit": "2023-05-02T18:10:51", "suspended": "00:00:00", "systemcpu": "00:00:00", "systemcomment": "", "timelimit": "12:00:00", "timelimitraw": "720", "totalcpu": "00:00:00", "tresusageinave": "", "tresusageinmax": "", "tresusageinmaxnode": "", "tresusageinmaxtask": "", "tresusageinmin": "", "tresusageinminnode": "", "tresusageinmintask": "", "tresusageintot": "", 
"tresusageoutave": "", "tresusageoutmax": "", "tresusageoutmaxnode": "", "tresusageoutmaxtask": "", "tresusageoutmin": "", "tresusageoutminnode": "", "tresusageoutmintask": "", "tresusageouttot": "", "uid": "12345", "user": "elvis", "usercpu": "00:00:00", "wckey": "", "wckeyid": "0", "workdir": "/global/homes/e/elvis/job_subs" }, { "account": "ntrain", "admincomment": "", "alloccpus": "25", "allocnodes": "0", "alloctres": "", "associd": "206287", "avecpu": "", "avecpufreq": "", "avediskread": "", "avediskwrite": "", "avepages": "", "averss": "", "avevmsize": "", "blockid": "", "cluster": "perlmutter", "comment": "", "constraints": "cpu", "consumedenergy": "0", "consumedenergyraw": "0", "cputime": "00:00:00", "cputimeraw": "0", "dbindex": "64267039", "derivedexitcode": "0:0", "elapsed": "00:00:00", "elapsedraw": "0", "eligible": "2023-05-02T18:11:30", "end": "Unknown", "exitcode": "0:0", "flags": "StartRecieved", "gid": "12345", "group": "elvis", "jobid": "8407432", "jobidraw": "8407432", "jobname": "large_job.sh", "layout": "", "maxdiskread": "", "maxdiskreadnode": "", "maxdiskreadtask": "", "maxdiskwrite": "", "maxdiskwritenode": "", "maxdiskwritetask": "", "maxpages": "", "maxpagesnode": "", "maxpagestask": "", "maxrss": "", "maxrssnode": "", "maxrsstask": "", "maxvmsize": "", "maxvmsizenode": "", "maxvmsizetask": "", "mcslabel": "", "mincpu": "", "mincpunode": "", "mincputask": "", "ncpus": "25", "nnodes": "25", "nodelist": "None assigned", "ntasks": "", "priority": "67679", "partition": "regular_milan_ss11", "qos": "regular_1", "qosraw": "16", "reason": "None", "reqcpufreq": "Unknown", "reqcpufreqmin": "Unknown", "reqcpufreqmax": "Unknown", "reqcpufreqgov": "Unknown", "reqcpus": "25", "reqmem": "12200050M", "reqnodes": "25", "reqtres": "billing=25,cpu=25,mem=12200050M,node=25", "reservation": "", "reservationid": "", "reserved": null, "resvcpu": null, "resvcpuraw": null, "start": "Unknown", "state": "PENDING", "submit": "2023-05-02T18:11:30", "suspended": 
"00:00:00", "systemcpu": "00:00:00", "systemcomment": "", "timelimit": "12:00:00", "timelimitraw": "720", "totalcpu": "00:00:00", "tresusageinave": "", "tresusageinmax": "", "tresusageinmaxnode": "", "tresusageinmaxtask": "", "tresusageinmin": "", "tresusageinminnode": "", "tresusageinmintask": "", "tresusageintot": "", "tresusageoutave": "", "tresusageoutmax": "", "tresusageoutmaxnode": "", "tresusageoutmaxtask": "", "tresusageoutmin": "", "tresusageoutminnode": "", "tresusageoutmintask": "", "tresusageouttot": "", "uid": "12345", "user": "elvis", "usercpu": "00:00:00", "wckey": "", "wckeyid": "0", "workdir": "/global/homes/e/elvis/job_subs" } ]
Jobs can also be interacted with later based on their jobid. This includes canceling the jobs and seeing their final status.
async with AsyncClient() as client:
perlmutter = await client.compute(Machine.perlmutter)
# Get the running job based on it's job ID
# This time we'll get information from sacct
jobs = await perlmutter.jobs(jobids=[jobs[0].jobid, jobs[1].jobid], command='sacct')
for job in jobs:
await job.cancel(wait=True)
await job.update()
print_json([j.dict() for j in jobs])
[ { "account": "ntrain", "admincomment": "{\"resizing\":0,\"features\":\"cpu\",\"arrayTaskId\":4294967294,\"qos\":\"regular_1\",\"arrayJobId\":0,\"jobAccount\":\"ntrain\",\"submitTime\":1683076251,\"partition\":\"regular_milan_ss11\",\"uid\":12345,\"cluster\":\"perlmutter\",\"argv\":[\"\\/global\\/u1\\/t\\/elvis\\/job_subs\\/large_job.sh\"],\"gresRequest\":\"cpu=25,mem=12200050M,node=25,billing=25\",\"licenses\":\"u1:1\",\"name\":\"large_job.sh\",\"stdinPath\":\"\\/dev\\/null\",\"timeLimit\":720,\"packJobId\":0,\"jobId\":8407414,\"allocNodes\":0,\"allocCpus\":0,\"workingDirectory\":\"\\/global\\/u1\\/t\\/elvis\\/job_subs\",\"restartCnt\":0,\"jobExitCode\":0,\"reboot\":0,\"startTime\":1683077035,\"priority\":67684,\"endTime\":1683077035,\"jobDerivedExitCode\":0,\"packJobOffset\":0}", "alloccpus": "25", "allocnodes": "0", "alloctres": "", "associd": "206287", "avecpu": "", "avecpufreq": "", "avediskread": "", "avediskwrite": "", "avepages": "", "averss": "", "avevmsize": "", "blockid": "", "cluster": "perlmutter", "comment": "", "constraints": "cpu", "consumedenergy": "0", "consumedenergyraw": "0", "cputime": "00:00:00", "cputimeraw": "0", "dbindex": "64266939", "derivedexitcode": "0:0", "elapsed": "00:00:00", "elapsedraw": "0", "eligible": "2023-05-02T18:10:51", "end": "2023-05-02T18:23:55", "exitcode": "0:0", "flags": "StartRecieved", "gid": "12345", "group": "elvis", "jobid": "8407414", "jobidraw": "8407414", "jobname": "large_job.sh", "layout": "", "maxdiskread": "", "maxdiskreadnode": "", "maxdiskreadtask": "", "maxdiskwrite": "", "maxdiskwritenode": "", "maxdiskwritetask": "", "maxpages": "", "maxpagesnode": "", "maxpagestask": "", "maxrss": "", "maxrssnode": "", "maxrsstask": "", "maxvmsize": "", "maxvmsizenode": "", "maxvmsizetask": "", "mcslabel": "", "mincpu": "", "mincpunode": "", "mincputask": "", "ncpus": "25", "nnodes": "25", "nodelist": "None assigned", "ntasks": "", "priority": "67679", "partition": "regular_milan_ss11", "qos": "regular_1", "qosraw": 
"16", "reason": "None", "reqcpufreq": "Unknown", "reqcpufreqmin": "Unknown", "reqcpufreqmax": "Unknown", "reqcpufreqgov": "Unknown", "reqcpus": "25", "reqmem": "12200050M", "reqnodes": "25", "reqtres": "billing=25,cpu=25,mem=12200050M,node=25", "reservation": "", "reservationid": "", "reserved": null, "resvcpu": null, "resvcpuraw": null, "start": "None", "state": "CANCELLED", "submit": "2023-05-02T18:10:51", "suspended": "00:00:00", "systemcpu": "00:00:00", "systemcomment": "", "timelimit": "12:00:00", "timelimitraw": "720", "totalcpu": "00:00:00", "tresusageinave": "", "tresusageinmax": "", "tresusageinmaxnode": "", "tresusageinmaxtask": "", "tresusageinmin": "", "tresusageinminnode": "", "tresusageinmintask": "", "tresusageintot": "", "tresusageoutave": "", "tresusageoutmax": "", "tresusageoutmaxnode": "", "tresusageoutmaxtask": "", "tresusageoutmin": "", "tresusageoutminnode": "", "tresusageoutmintask": "", "tresusageouttot": "", "uid": "12345", "user": "elvis", "usercpu": "00:00:00", "wckey": "", "wckeyid": "0", "workdir": "/global/homes/e/elvis/job_subs" }, { "account": "ntrain", "admincomment": "{\"resizing\":0,\"features\":\"cpu\",\"arrayTaskId\":4294967294,\"qos\":\"regular_1\",\"arrayJobId\":0,\"jobAccount\":\"ntrain\",\"submitTime\":1683076290,\"partition\":\"regular_milan_ss11\",\"uid\":12345,\"cluster\":\"perlmutter\",\"argv\":[\"\\/global\\/u1\\/t\\/elvis\\/job_subs\\/large_job.sh\"],\"gresRequest\":\"cpu=25,mem=12200050M,node=25,billing=25\",\"licenses\":\"u1:1\",\"name\":\"large_job.sh\",\"stdinPath\":\"\\/dev\\/null\",\"timeLimit\":720,\"packJobId\":0,\"jobId\":8407432,\"allocNodes\":0,\"allocCpus\":0,\"workingDirectory\":\"\\/global\\/u1\\/t\\/elvis\\/job_subs\",\"restartCnt\":0,\"jobExitCode\":0,\"reboot\":0,\"startTime\":1683077066,\"priority\":67683,\"endTime\":1683077066,\"jobDerivedExitCode\":0,\"packJobOffset\":0}", "alloccpus": "25", "allocnodes": "0", "alloctres": "", "associd": "206287", "avecpu": "", "avecpufreq": "", "avediskread": "", 
"avediskwrite": "", "avepages": "", "averss": "", "avevmsize": "", "blockid": "", "cluster": "perlmutter", "comment": "", "constraints": "cpu", "consumedenergy": "0", "consumedenergyraw": "0", "cputime": "00:00:00", "cputimeraw": "0", "dbindex": "64267039", "derivedexitcode": "0:0", "elapsed": "00:00:00", "elapsedraw": "0", "eligible": "2023-05-02T18:11:30", "end": "2023-05-02T18:24:26", "exitcode": "0:0", "flags": "StartRecieved", "gid": "12345", "group": "elvis", "jobid": "8407432", "jobidraw": "8407432", "jobname": "large_job.sh", "layout": "", "maxdiskread": "", "maxdiskreadnode": "", "maxdiskreadtask": "", "maxdiskwrite": "", "maxdiskwritenode": "", "maxdiskwritetask": "", "maxpages": "", "maxpagesnode": "", "maxpagestask": "", "maxrss": "", "maxrssnode": "", "maxrsstask": "", "maxvmsize": "", "maxvmsizenode": "", "maxvmsizetask": "", "mcslabel": "", "mincpu": "", "mincpunode": "", "mincputask": "", "ncpus": "25", "nnodes": "25", "nodelist": "None assigned", "ntasks": "", "priority": "67679", "partition": "regular_milan_ss11", "qos": "regular_1", "qosraw": "16", "reason": "None", "reqcpufreq": "Unknown", "reqcpufreqmin": "Unknown", "reqcpufreqmax": "Unknown", "reqcpufreqgov": "Unknown", "reqcpus": "25", "reqmem": "12200050M", "reqnodes": "25", "reqtres": "billing=25,cpu=25,mem=12200050M,node=25", "reservation": "", "reservationid": "", "reserved": null, "resvcpu": null, "resvcpuraw": null, "start": "None", "state": "CANCELLED", "submit": "2023-05-02T18:11:30", "suspended": "00:00:00", "systemcpu": "00:00:00", "systemcomment": "", "timelimit": "12:00:00", "timelimitraw": "720", "totalcpu": "00:00:00", "tresusageinave": "", "tresusageinmax": "", "tresusageinmaxnode": "", "tresusageinmaxtask": "", "tresusageinmin": "", "tresusageinminnode": "", "tresusageinmintask": "", "tresusageintot": "", "tresusageoutave": "", "tresusageoutmax": "", "tresusageoutmaxnode": "", "tresusageoutmaxtask": "", "tresusageoutmin": "", "tresusageoutminnode": "", "tresusageoutmintask": 
"", "tresusageouttot": "", "uid": "12345", "user": "elvis", "usercpu": "00:00:00", "wckey": "", "wckeyid": "0", "workdir": "/global/homes/e/elvis/job_subs" } ]