Initial Commit
This commit is contained in:
commit
2fd6216b0b
91
create_cluster.sh
Executable file
91
create_cluster.sh
Executable file
@ -0,0 +1,91 @@
|
||||
#!/bin/bash
#
# create_cluster.sh — stand up a small Slurm cluster inside an LXD project.
#
# Creates LXD project "hpc", launches one management node (slurmctld+slurmd)
# and five compute nodes (slurmd) from ubuntu:jammy, installs Slurm from a
# PPA, distributes munge.key and slurm.conf, generates a shared /etc/hosts
# from the containers' IPv4 addresses, and (re)starts the daemons.
#
# Requirements: lxc (LXD) and jq on the host; ./lxd_profile.yaml,
# ./munge.key and ./slurm.conf in the current directory.

# Original used 'set -ax'; '-a' (auto-export every assignment) was almost
# certainly unintended. Fail fast instead; use 'bash -x' when tracing.
set -euo pipefail

readonly project_name="hpc"
readonly ppa="ppa:nuccitheboss/experimental-hpc"
readonly mgmt_node="slurm-master"
readonly num_nodes=5

# Run an lxc subcommand scoped to our project.
lxc_p() { lxc --project "${project_name}" "$@"; }

# Execute a command inside a container of the project (runs as root).
lxc_exec() { lxc_p exec "$@"; }

#######################################
# Launch a container and install/configure munge + the given Slurm package.
# Arguments: $1 - container name; $2 - apt package (slurm-wlm or slurmd)
#######################################
provision_node() {
  local node=$1 pkg=$2
  lxc_p launch ubuntu:jammy "${node}"
  sleep 5   # give cloud-init/networking a moment (same delay as before)
  lxc_exec "${node}" -- add-apt-repository -y "${ppa}"
  lxc_exec "${node}" -- apt -y install "${pkg}"
  lxc_p file push munge.key "${node}/etc/munge/"
  lxc_exec "${node}" -- systemctl restart munge
  lxc_exec "${node}" -- systemctl enable munge
  lxc_p file push slurm.conf "${node}/etc/slurm/"
}

main() {
  lxc project create "${project_name}"
  lxc_p profile edit default < lxd_profile.yaml

  # Management node runs slurmctld AND slurmd (it is listed in NodeName=
  # in slurm.conf), so it gets the full slurm-wlm package.
  provision_node "${mgmt_node}" slurm-wlm
  # slurm.conf sets StateSaveLocation=/var/spool/slurmctld.
  lxc_exec "${mgmt_node}" -- mkdir /var/spool/slurmctld
  lxc_exec "${mgmt_node}" -- chown slurm:slurm /var/spool/slurmctld
  lxc_exec "${mgmt_node}" -- systemctl start slurmctld
  lxc_exec "${mgmt_node}" -- systemctl enable slurmctld
  lxc_exec "${mgmt_node}" -- systemctl start slurmd
  lxc_exec "${mgmt_node}" -- systemctl enable slurmd

  local i node
  for ((i = 1; i <= num_nodes; i++)); do
    node="slurm-comp0${i}"
    provision_node "${node}" slurmd
    lxc_exec "${node}" -- systemctl start slurmd
    lxc_exec "${node}" -- systemctl enable slurmd
  done

  # Build one hosts file shared by every node so short hostnames resolve.
  local tmp_hosts
  tmp_hosts=$(mktemp)
  trap 'rm -f -- "${tmp_hosts}"' EXIT   # original leaked the temp file

  cat > "${tmp_hosts}" <<'EOF'
127.0.0.1 localhost

# The following lines are desirable for IPv6 capable hosts
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts

EOF

  # jq -c emits one compact JSON array per line: ["<name>","<ipv4>",...].
  # Read them into a proper array instead of word-splitting a scalar.
  local -a addresses
  mapfile -t addresses < <(
    lxc_p list --format json |
      jq -rc '.[] | select(.name | contains("slurm"))
              | [.name, .state.network.eth0.addresses[].address]'
  )

  local line name ipv4
  for line in "${addresses[@]}"; do
    name=$(jq -rc '.[0]' <<<"${line}")
    ipv4=$(jq -rc '.[1]' <<<"${line}")
    echo "${ipv4} ${name}" >> "${tmp_hosts}"
  done

  # Push the completed hosts file everywhere and bounce slurmd.
  # (Dropped redundant 'sudo': lxc exec already runs as root.)
  for line in "${addresses[@]}"; do
    name=$(jq -rc '.[0]' <<<"${line}")
    lxc_p file push "${tmp_hosts}" "${name}/etc/hosts"
    lxc_exec "${name}" -- systemctl restart slurmd
  done

  lxc_exec "${mgmt_node}" -- systemctl restart slurmctld
}

main "$@"
|
9
lxd_profile.yaml
Normal file
9
lxd_profile.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# lxd_profile.yaml — device map applied to the project's default profile
# by create_cluster.sh ("lxc profile edit default < lxd_profile.yaml").
devices:
  # NIC bridged onto LXD's default bridge so all containers share a subnet.
  eth0:
    name: eth0
    network: lxdbr0
    type: nic
  # Container root disk, allocated from the "virtual" storage pool.
  # NOTE(review): assumes a storage pool named "virtual" already exists —
  # confirm with 'lxc storage list'.
  root:
    path: /
    pool: virtual
    type: disk
|
148
slurm.conf
Normal file
148
slurm.conf
Normal file
@ -0,0 +1,148 @@
|
||||
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
# Pushed unchanged to every container by create_cluster.sh; only the
# uncommented directives below deviate from configurator defaults.
ClusterName=cluster
# Must match the management container name launched by create_cluster.sh.
SlurmctldHost=slurm-master
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
# create_cluster.sh creates this directory (slurm:slurm) on the master.
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
# The master is listed here too — create_cluster.sh starts slurmd on it as
# well as on slurm-comp01..05.
NodeName=slurm-master,slurm-comp0[1-5] CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=slurm-comp0[1-5] Default=YES MaxTime=INFINITE State=UP
|
Loading…
Reference in New Issue
Block a user