Initial Commit

Arif Ali 2023-08-08 21:22:09 +01:00
commit 2fd6216b0b
Signed by: arif
GPG Key ID: 369608FBA1353A70
4 changed files with 248 additions and 0 deletions

create_cluster.sh (new executable file)

#!/bin/bash
# -a exports every variable, -x traces each command for easier debugging
set -ax

# Keep the whole cluster in its own LXD project
project_name="hpc"
project="--project ${project_name}"
lxc="lxc ${project}"
lxc_exec="${lxc} exec"

lxc project create ${project_name}
${lxc} profile edit default < lxd_profile.yaml
ppa="ppa:nuccitheboss/experimental-hpc"
mgmt_node="slurm-master"
${lxc} launch ubuntu:jammy ${mgmt_node}
sleep 5
${lxc_exec} ${mgmt_node} -- add-apt-repository -y ${ppa}
${lxc_exec} ${mgmt_node} -- apt -y install slurm-wlm
${lxc} file push munge.key ${mgmt_node}/etc/munge/
${lxc_exec} ${mgmt_node} -- systemctl restart munge
${lxc_exec} ${mgmt_node} -- systemctl enable munge
${lxc} file push slurm.conf ${mgmt_node}/etc/slurm/
${lxc_exec} ${mgmt_node} -- mkdir /var/spool/slurmctld
${lxc_exec} ${mgmt_node} -- chown slurm:slurm /var/spool/slurmctld
${lxc_exec} ${mgmt_node} -- systemctl start slurmctld
${lxc_exec} ${mgmt_node} -- systemctl enable slurmctld
${lxc_exec} ${mgmt_node} -- systemctl start slurmd
${lxc_exec} ${mgmt_node} -- systemctl enable slurmd

# Launch and configure the compute nodes
nodes=5
for i in $(seq 1 ${nodes})
do
    node=slurm-comp0${i}
    ${lxc} launch ubuntu:jammy ${node}
    sleep 5
    ${lxc_exec} ${node} -- add-apt-repository -y ${ppa}
    ${lxc_exec} ${node} -- apt -y install slurmd
    ${lxc} file push munge.key ${node}/etc/munge/
    ${lxc_exec} ${node} -- systemctl restart munge
    ${lxc_exec} ${node} -- systemctl enable munge
    ${lxc} file push slurm.conf ${node}/etc/slurm/
    ${lxc_exec} ${node} -- systemctl start slurmd
    ${lxc_exec} ${node} -- systemctl enable slurmd
done

# Build a shared /etc/hosts with the standard static entries
tmp_hosts=$(mktemp)
cat > ${tmp_hosts} << EOF
127.0.0.1 localhost
# The following lines are desirable for IPv6 capable hosts
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts
EOF

# Collect each slurm container's name and addresses as compact JSON arrays
addresses=$(${lxc} list --format json | jq -rc '.[] | select(.name | contains("slurm")) | [.name,.state.network.eth0.addresses[].address]')

# Append every node's first address (the IPv4 one) to the hosts file
for line in ${addresses}
do
    name=$(echo ${line} | jq -rc '.[0]')
    ipv4=$(echo ${line} | jq -rc '.[1]')
    echo ${ipv4} ${name} >> ${tmp_hosts}
done

# Push the hosts file to every node and restart the daemons so the new names resolve
for line in ${addresses}
do
    name=$(echo ${line} | jq -rc '.[0]')
    ${lxc} file push ${tmp_hosts} ${name}/etc/hosts
    ${lxc_exec} ${name} -- systemctl restart slurmd
done
${lxc_exec} ${mgmt_node} -- systemctl restart slurmctld
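
Once the script finishes, a quick sanity check can be run from the host (a minimal sketch, assuming the hpc project and node names used above):

lxc --project hpc exec slurm-master -- sinfo
lxc --project hpc exec slurm-master -- srun -N 5 hostname

sinfo should show the debug partition with all five compute nodes, and the srun call should print one hostname per node.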

lxd_profile.yaml (new file)

devices:
  eth0:
    name: eth0
    network: lxdbr0
    type: nic
  root:
    path: /
    pool: virtual
    type: disk
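
The profile assumes a bridge named lxdbr0 and a storage pool named virtual already exist in the LXD installation. If they do not, something along these lines would create them first (the dir driver is only an assumption for a simple local setup):

lxc storage create virtual dir
lxc network create lxdbr0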

munge.key (new binary file, not shown)
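
The key is a binary file and is not shown here; any cluster of your own needs its own key. One common way to generate one (an illustrative example, not the key used in this commit) is:

dd if=/dev/urandom of=munge.key bs=1024 count=1
chmod 400 munge.key

The same key must be present on every node, which is why create_cluster.sh pushes it into /etc/munge/ on the controller and on each compute node.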

slurm.conf (new file)

# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=slurm-master
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=slurm-master,slurm-comp0[1-5] CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=slurm-comp0[1-5] Default=YES MaxTime=INFINITE State=UP
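
With ReturnToService=1 and an initial State=UNKNOWN, nodes become available as soon as they register with the controller. If a compute node still shows as down after a restart, it can be returned to service from slurm-master with, for example:

scontrol update NodeName=slurm-comp0[1-5] State=RESUME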