Initial Commit

create_cluster.sh (new executable file, 91 lines)
@@ -0,0 +1,91 @@
#!/bin/bash

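# -a: auto-export every variable that is assigned; -x: print each command before it runs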
set -ax

project_name="hpc"
project="--project ${project_name}"
lxc="lxc ${project}"

lxc_exec="${lxc} exec"

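# Create a dedicated LXD project so the cluster's containers stay isolated from the default project.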
lxc project create ${project_name}

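# Overwrite the project's default profile with the NIC and root disk from lxd_profile.yaml.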
${lxc} profile edit default < lxd_profile.yaml

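# PPA that carries the Slurm packages installed below.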
ppa="ppa:nuccitheboss/experimental-hpc"

mgmt_node="slurm-master"

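# Controller node: install the full slurm-wlm stack and the shared munge key,
# then bring up munge, slurmctld, and a local slurmd.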
${lxc} launch ubuntu:jammy ${mgmt_node}
sleep 5  # give the container a moment to boot and get a DHCP lease
${lxc_exec} ${mgmt_node} -- add-apt-repository -y ${ppa}
${lxc_exec} ${mgmt_node} -- apt -y install slurm-wlm
${lxc} file push munge.key ${mgmt_node}/etc/munge/
${lxc_exec} ${mgmt_node} -- systemctl restart munge
${lxc_exec} ${mgmt_node} -- systemctl enable munge

${lxc} file push slurm.conf ${mgmt_node}/etc/slurm/
${lxc_exec} ${mgmt_node} -- mkdir /var/spool/slurmctld
${lxc_exec} ${mgmt_node} -- chown slurm:slurm /var/spool/slurmctld
${lxc_exec} ${mgmt_node} -- systemctl start slurmctld
${lxc_exec} ${mgmt_node} -- systemctl enable slurmctld
${lxc_exec} ${mgmt_node} -- systemctl start slurmd
${lxc_exec} ${mgmt_node} -- systemctl enable slurmd

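# Compute nodes: same munge key and slurm.conf, but only the slurmd daemon.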
nodes=5

for i in $(seq 1 ${nodes})
do
  node=slurm-comp0${i}

  ${lxc} launch ubuntu:jammy ${node}
  sleep 5
  ${lxc_exec} ${node} -- add-apt-repository -y ${ppa}
  ${lxc_exec} ${node} -- apt -y install slurmd
  ${lxc} file push munge.key ${node}/etc/munge/
  ${lxc_exec} ${node} -- systemctl restart munge
  ${lxc_exec} ${node} -- systemctl enable munge

  ${lxc} file push slurm.conf ${node}/etc/slurm/
  ${lxc_exec} ${node} -- systemctl start slurmd
  ${lxc_exec} ${node} -- systemctl enable slurmd
done

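# Build one hosts file that every node will share, so short hostnames resolve cluster-wide.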
tmp_hosts=$(mktemp)

cat > ${tmp_hosts} << EOF
127.0.0.1 localhost

# The following lines are desirable for IPv6 capable hosts
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts

EOF

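# jq emits one compact JSON array per container, e.g. ["slurm-comp01","10.0.0.11",...];
# index 0 is the name, and index 1 is assumed to be the IPv4 address.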
addresses=$(${lxc} list --format json | jq -rc '.[] | select(.name | contains("slurm")) | [.name,.state.network.eth0.addresses[].address]')

for line in ${addresses}
do

  name=$(echo "${line}" | jq -rc '.[0]')
  ipv4=$(echo "${line}" | jq -rc '.[1]')

  echo ${ipv4} ${name} >> ${tmp_hosts}

done

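# Push the finished hosts file to every node and restart its slurmd.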
for line in ${addresses}
do

  name=$(echo "${line}" | jq -rc '.[0]')

  ${lxc} file push ${tmp_hosts} ${name}/etc/hosts
  ${lxc_exec} ${name} -- systemctl restart slurmd

done

${lxc_exec} ${mgmt_node} -- systemctl restart slurmctld
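
# Sanity check (a suggested follow-up, not part of the original script): confirm
# the compute nodes registered and run a trivial job across the default partition.
#   lxc --project hpc exec slurm-master -- sinfo
#   lxc --project hpc exec slurm-master -- srun -N5 hostname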

lxd_profile.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
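# Assumes the host already provides the lxdbr0 bridge and a storage pool named "virtual".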
devices:
  eth0:
    name: eth0
    network: lxdbr0
    type: nic
  root:
    path: /
    pool: virtual
    type: disk

slurm.conf (new file, 148 lines)
@@ -0,0 +1,148 @@
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=slurm-master
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
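# Each node advertises a single CPU; raise CPUs= (and consider RealMemory=) if the
# containers should offer more scheduling slots.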
NodeName=slurm-master,slurm-comp0[1-5] CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=slurm-comp0[1-5] Default=YES MaxTime=INFINITE State=UP