Slurm: Difference between revisions
Jump to navigation
Jump to search
(Created page with "==install slurm under fedora 21== # Build slurm rpmbuild -ta slurm*.tar.bz2 # Install rpms. yum -y install munge slurm slurm-plugins slurm-munge # configure munge dd if=/dev/...") |
|||
(7 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
==install slurm under fedora | ==install slurm under fedora 26== | ||
Edit slurm.spec to avoid: "plugin_load_from_file: dlopen(/usr/lib64/slurm/select_linear.so): /usr/lib64/slurm/select_linear.so: undefined symbol: slurm_job_preempt_mode." Add the four lines between %... | |||
%build | |||
CFLAGS="$RPM_OPT_FLAGS -Wl,-z,lazy" | |||
CXXFLAGS="$RPM_OPT_FLAGS -Wl,-z,lazy" | |||
export CFLAGS | |||
export CXXFLAGS | |||
%configure \ | |||
Build slurm | |||
rpmbuild -ta slurm*.tar.bz2 | rpmbuild -ta slurm*.tar.bz2 | ||
Install rpms. | |||
yum -y install munge slurm slurm-plugins slurm-munge | yum -y install munge slurm slurm-plugins slurm-munge | ||
dd if=/dev/random bs=1 count=1024 > /etc/munge/munge.key | ==configure munge== | ||
chmod 0600 /etc/munge/munge.key | dd if=/dev/random bs=1 count=1024 > /etc/munge/munge.key | ||
chown munge /etc/munge/munge.key | chmod 0600 /etc/munge/munge.key | ||
systemctl start munge | chown munge /etc/munge/munge.key | ||
systemctl start munge | |||
==test installation== | ==test installation== | ||
Generate a credential on stdout. | Generate a credential on stdout. | ||
munge -n | munge -n | ||
Line 28: | Line 28: | ||
Run a quick benchmark. | Run a quick benchmark. | ||
remunge | remunge | ||
how does it work | |||
scontrol show config | scontrol show config | ||
check priorities of jobs using the command | |||
scontrol show job". | scontrol show job". | ||
==job control== | |||
Submit a job | |||
sbatch /tmp/slurm_test_1 | sbatch /tmp/slurm_test_1 | ||
List jobs: | |||
squeue | squeue | ||
Get job details: | |||
scontrol show job 106 | scontrol show job 106 | ||
Suspend a job (root only): | |||
scontrol suspend 135 | scontrol suspend 135 | ||
Resume a job (root only): | |||
scontrol resume 135 | scontrol resume 135 | ||
Kill a job. Users can kill their own jobs, root can kill any job. | |||
scancel 135 | scancel 135 | ||
Hold a job | |||
scontrol hold 139 | scontrol hold 139 | ||
Release a job: | |||
scontrol release 139 | scontrol release 139 | ||
List partitions: | |||
sinfo | sinfo | ||
example job script. | |||
#!/usr/bin/env bash | #!/usr/bin/env bash | ||
#SBATCH -p defq | #SBATCH -p defq | ||
#SBATCH -J simple | #SBATCH -J simple | ||
sleep 60 | sleep 60 | ||
==troubleshooting== | |||
Restore node. | |||
scontrol update nodename=www state=down Reason=troubleshooting | |||
scontrol update nodename=www state=resume |
Latest revision as of 17:05, 5 November 2017
install slurm under fedora 26
Edit slurm.spec to avoid: "plugin_load_from_file: dlopen(/usr/lib64/slurm/select_linear.so): /usr/lib64/slurm/select_linear.so: undefined symbol: slurm_job_preempt_mode." Add the four lines between %build and %configure:
%build
CFLAGS="$RPM_OPT_FLAGS -Wl,-z,lazy"
CXXFLAGS="$RPM_OPT_FLAGS -Wl,-z,lazy"
export CFLAGS
export CXXFLAGS
%configure \
Build slurm
rpmbuild -ta slurm*.tar.bz2
Install rpms.
yum -y install munge slurm slurm-plugins slurm-munge
configure munge
dd if=/dev/random bs=1 count=1024 > /etc/munge/munge.key
chmod 0600 /etc/munge/munge.key
chown munge /etc/munge/munge.key
systemctl start munge
test installation
Generate a credential on stdout.
munge -n
Check if a credential can be locally decoded.
munge -n | unmunge
Check if a credential can be remotely decoded.
munge -n | ssh somehost unmunge
Run a quick benchmark.
remunge
How does it work
scontrol show config
Check the priorities of jobs using the command:
scontrol show job
job control
Submit a job:
sbatch /tmp/slurm_test_1
List jobs:
squeue
Get job details:
scontrol show job 106
Suspend a job (root only):
scontrol suspend 135
Resume a job (root only):
scontrol resume 135
Kill a job. Users can kill their own jobs, root can kill any job.
scancel 135
Hold a job:
scontrol hold 139
Release a job:
scontrol release 139
List partitions:
sinfo
Example job script:
#!/usr/bin/env bash
#SBATCH -p defq
#SBATCH -J simple
sleep 60
troubleshooting
Mark a node down for troubleshooting, then restore it:
scontrol update nodename=www state=down Reason=troubleshooting
scontrol update nodename=www state=resume