From 2fd6216b0b061045d47cba8f5b848dc00677dbe2 Mon Sep 17 00:00:00 2001
From: Arif Ali
Date: Tue, 8 Aug 2023 21:22:09 +0100
Subject: [PATCH] Initial Commit

---
 create_cluster.sh |  91 ++++++++++++++++++++++++++++
 lxd_profile.yaml  |   9 +++
 munge.key         | Bin 0 -> 128 bytes
 slurm.conf        | 148 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 248 insertions(+)
 create mode 100755 create_cluster.sh
 create mode 100644 lxd_profile.yaml
 create mode 100644 munge.key
 create mode 100644 slurm.conf

diff --git a/create_cluster.sh b/create_cluster.sh
new file mode 100755
index 0000000..e5653d6
--- /dev/null
+++ b/create_cluster.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+set -ax
+
+project_name="hpc"
+project="--project ${project_name}"
+lxc="lxc ${project}"
+
+lxc_exec="${lxc} exec"
+
+lxc project create ${project_name}
+
+${lxc} profile edit default < lxd_profile.yaml
+
+ppa="ppa:nuccitheboss/experimental-hpc"
+
+mgmt_node="slurm-master"
+
+${lxc} launch ubuntu:jammy ${mgmt_node}
+sleep 5
+${lxc_exec} ${mgmt_node} -- add-apt-repository -y ${ppa}
+${lxc_exec} ${mgmt_node} -- apt -y install slurm-wlm
+${lxc} file push munge.key ${mgmt_node}/etc/munge/
+${lxc_exec} ${mgmt_node} -- systemctl restart munge
+${lxc_exec} ${mgmt_node} -- systemctl enable munge
+
+${lxc} file push slurm.conf ${mgmt_node}/etc/slurm/
+${lxc_exec} ${mgmt_node} -- mkdir /var/spool/slurmctld
+${lxc_exec} ${mgmt_node} -- chown slurm:slurm /var/spool/slurmctld
+${lxc_exec} ${mgmt_node} -- systemctl start slurmctld
+${lxc_exec} ${mgmt_node} -- systemctl enable slurmctld
+${lxc_exec} ${mgmt_node} -- systemctl start slurmd
+${lxc_exec} ${mgmt_node} -- systemctl enable slurmd
+
+nodes=5
+
+for i in `seq 1 ${nodes}`
+do
+  node=slurm-comp0${i}
+
+  ${lxc} launch ubuntu:jammy ${node}
+  sleep 5
+  ${lxc_exec} ${node} -- add-apt-repository -y ${ppa}
+  ${lxc_exec} ${node} -- apt -y install slurmd
+  ${lxc} file push munge.key ${node}/etc/munge/
+  ${lxc_exec} ${node} -- systemctl restart munge
+  ${lxc_exec} ${node} -- systemctl enable munge
+
+  ${lxc} file push slurm.conf ${node}/etc/slurm/
+  ${lxc_exec} ${node} -- systemctl start slurmd
+  ${lxc_exec} ${node} -- systemctl enable slurmd
+done
+
+tmp_hosts=$(mktemp)
+
+cat > ${tmp_hosts} << EOF
+127.0.0.1 localhost
+
+# The following lines are desirable for IPv6 capable hosts
+::1 ip6-localhost ip6-loopback
+fe00::0 ip6-localnet
+ff00::0 ip6-mcastprefix
+ff02::1 ip6-allnodes
+ff02::2 ip6-allrouters
+ff02::3 ip6-allhosts
+
+EOF
+
+addresses=$(${lxc} list --format json | jq -rc '.[] | select(.name | contains("slurm")) | [.name,.state.network.eth0.addresses[].address]')
+
+for line in ${addresses}
+do
+
+  name=$(echo $line | jq -rc .[0])
+  ipv4=$(echo $line | jq -rc .[1])
+
+  echo ${ipv4} ${name} >> ${tmp_hosts}
+
+done
+
+for line in ${addresses}
+do
+
+  name=$(echo $line | jq -rc .[0])
+
+  ${lxc} file push ${tmp_hosts} ${name}/etc/hosts
+  ${lxc_exec} ${name} -- sudo systemctl restart slurmd
+
+done
+
+${lxc_exec} ${mgmt_node} -- sudo systemctl restart slurmctld
diff --git a/lxd_profile.yaml b/lxd_profile.yaml
new file mode 100644
index 0000000..45f83eb
--- /dev/null
+++ b/lxd_profile.yaml
@@ -0,0 +1,9 @@
+devices:
+  eth0:
+    name: eth0
+    network: lxdbr0
+    type: nic
+  root:
+    path: /
+    pool: virtual
+    type: disk
diff --git a/munge.key b/munge.key
new file mode 100644
index 0000000000000000000000000000000000000000..4811b775e489862ed061ad3d7e8361f3b63481dc
GIT binary patch
literal 128
zcmV-`0Du1@m7)Y04P1Z1guG1@4cxQE#y`6*K*OL-(Ifzc1B?8izm6HA*=N65z9$2Y
zkV8VBt-tyuR8=wP3H^=0@xW4Y258s&au
i1L$BvDFYpsRoav#Y@gqfdHTXK$-aD`WKVF4#zQk_lRoSK

literal 0
HcmV?d00001

diff --git a/slurm.conf b/slurm.conf
new file mode 100644
index 0000000..458a9ca
--- /dev/null
+++ b/slurm.conf
@@ -0,0 +1,148 @@
+# slurm.conf file generated by configurator.html.
+# Put this file on all nodes of your cluster.
+# See the slurm.conf man page for more information.
+#
+ClusterName=cluster
+SlurmctldHost=slurm-master
+#SlurmctldHost=
+#
+#DisableRootJobs=NO
+#EnforcePartLimits=NO
+#Epilog=
+#EpilogSlurmctld=
+#FirstJobId=1
+#MaxJobId=67043328
+#GresTypes=
+#GroupUpdateForce=0
+#GroupUpdateTime=600
+#JobFileAppend=0
+#JobRequeue=1
+#JobSubmitPlugins=lua
+#KillOnBadExit=0
+#LaunchType=launch/slurm
+#Licenses=foo*4,bar
+#MailProg=/bin/mail
+#MaxJobCount=10000
+#MaxStepCount=40000
+#MaxTasksPerNode=512
+MpiDefault=none
+#MpiParams=ports=#-#
+#PluginDir=
+#PlugStackConfig=
+#PrivateData=jobs
+ProctrackType=proctrack/cgroup
+#Prolog=
+#PrologFlags=
+#PrologSlurmctld=
+#PropagatePrioProcess=0
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#RebootProgram=
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmctldPort=6817
+SlurmdPidFile=/var/run/slurmd.pid
+SlurmdPort=6818
+SlurmdSpoolDir=/var/spool/slurmd
+SlurmUser=slurm
+#SlurmdUser=root
+#SrunEpilog=
+#SrunProlog=
+StateSaveLocation=/var/spool/slurmctld
+SwitchType=switch/none
+#TaskEpilog=
+TaskPlugin=task/affinity,task/cgroup
+#TaskProlog=
+#TopologyPlugin=topology/tree
+#TmpFS=/tmp
+#TrackWCKey=no
+#TreeWidth=
+#UnkillableStepProgram=
+#UsePAM=0
+#
+#
+# TIMERS
+#BatchStartTimeout=10
+#CompleteWait=0
+#EpilogMsgTime=2000
+#GetEnvTimeout=2
+#HealthCheckInterval=0
+#HealthCheckProgram=
+InactiveLimit=0
+KillWait=30
+#MessageTimeout=10
+#ResvOverRun=0
+MinJobAge=300
+#OverTimeLimit=0
+SlurmctldTimeout=120
+SlurmdTimeout=300
+#UnkillableStepTimeout=60
+#VSizeFactor=0
+Waittime=0
+#
+#
+# SCHEDULING
+#DefMemPerCPU=0
+#MaxMemPerCPU=0
+#SchedulerTimeSlice=30
+SchedulerType=sched/backfill
+SelectType=select/cons_tres
+#
+#
+# JOB PRIORITY
+#PriorityFlags=
+#PriorityType=priority/basic
+#PriorityDecayHalfLife=
+#PriorityCalcPeriod=
+#PriorityFavorSmall=
+#PriorityMaxAge=
+#PriorityUsageResetPeriod=
+#PriorityWeightAge=
+#PriorityWeightFairshare=
+#PriorityWeightJobSize=
+#PriorityWeightPartition=
+#PriorityWeightQOS=
+#
+#
+# LOGGING AND ACCOUNTING
+#AccountingStorageEnforce=0
+#AccountingStorageHost=
+#AccountingStoragePass=
+#AccountingStoragePort=
+#AccountingStorageType=accounting_storage/slurmdbd
+#AccountingStorageUser=
+#AccountingStoreFlags=
+#JobCompHost=
+#JobCompLoc=
+#JobCompParams=
+#JobCompPass=
+#JobCompPort=
+JobCompType=jobcomp/none
+#JobCompUser=
+#JobContainerType=job_container/none
+JobAcctGatherFrequency=30
+JobAcctGatherType=jobacct_gather/none
+SlurmctldDebug=info
+SlurmctldLogFile=/var/log/slurmctld.log
+SlurmdDebug=info
+SlurmdLogFile=/var/log/slurmd.log
+#SlurmSchedLogFile=
+#SlurmSchedLogLevel=
+#DebugFlags=
+#
+#
+# POWER SAVE SUPPORT FOR IDLE NODES (optional)
+#SuspendProgram=
+#ResumeProgram=
+#SuspendTimeout=
+#ResumeTimeout=
+#ResumeRate=
+#SuspendExcNodes=
+#SuspendExcParts=
+#SuspendRate=
+#SuspendTime=
+#
+#
+# COMPUTE NODES
+NodeName=slurm-master,slurm-comp0[1-5] CPUs=1 State=UNKNOWN
+PartitionName=debug Nodes=slurm-comp0[1-5] Default=YES MaxTime=INFINITE State=UP
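Note (not part of the patch): a quick smoke test after create_cluster.sh has
finished might look like the sketch below. It assumes the LXD project name
"hpc" and the head node name "slurm-master" set in the script; sinfo and srun
are the standard Slurm client commands.

    # Confirm the controller sees the head node and the five compute nodes
    lxc exec --project hpc slurm-master -- sinfo

    # Run a trivial job across all five compute nodes in the debug partition
    lxc exec --project hpc slurm-master -- srun -N 5 hostname

If munged refuses to start on a container because the pushed munge.key ends up
owned by root with the repository's 0644 mode, one possible (assumed, not part
of the patch) workaround is to tighten the key on each container before
restarting munge, e.g.:

    lxc exec --project hpc slurm-master -- chown munge:munge /etc/munge/munge.key
    lxc exec --project hpc slurm-master -- chmod 0400 /etc/munge/munge.key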