heat_template_version: 2014-10-16

description: >
  This template will boot an HPC cluster with one or more compute
  nodes (as specified by the number_of_compute_nodes parameter,
  which defaults to 10).
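
# A minimal launch sketch (an assumption, not part of the original template):
# with the 2014-era python-heatclient CLI and compute_node.yaml sitting next
# to this file, the stack could be created roughly as follows; the stack name,
# file name, and key name below are placeholders.
#
#   heat stack-create hpc-cluster -f this_template.yaml \
#       -P "key_name=my-keypair;public_net_id=<uuid-of-external-net>;number_of_compute_nodes=4"
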
parameters:

  #
  # REQUIRED PARAMETERS
  #
  key_name:
    type: string
    description: name of ssh key to be provisioned on our server

  public_net_id:
    type: string
    description: uuid of a network to use for floating ip addresses

  #
  # OPTIONAL PARAMETERS
  #
  image:
    type: string
    default: centos7
    description: glance image used to boot the server

  host_prefix:
    type: string
    default: compute
    description: prefix for the hostname of each compute node

  flavor:
    type: string
    default: m1.small
    description: flavor to use when booting the server

  dns_nameserver:
    type: string
    description: address of a dns nameserver reachable in your environment
    default: 8.8.8.8

  number_of_compute_nodes:
    type: string
    description: how many compute nodes to spawn
    default: 10

resources:

  master_wait_handle:
    type: "AWS::CloudFormation::WaitConditionHandle"

  master_wait_condition:
    type: "AWS::CloudFormation::WaitCondition"
    depends_on:
      - master_node
    properties:
      Handle:
        get_resource: master_wait_handle
      Timeout: "1200"
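
  # The wait handle above resolves to a signed URL; the master node's
  # user_data signals it (see the curl at the end of that script), and the
  # stack remains CREATE_IN_PROGRESS until the signal arrives or the 1200 s
  # timeout expires.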

  ######################################################################
  #
  # network resources. allocate a network and router for our server.
  # it would also be possible to take advantage of existing network
  # resources (and have the deployer provide network and subnet ids,
  # etc, as parameters), but I wanted to minimize the amount of
  # configuration necessary to make this go.

  fixed_net:
    type: "OS::Neutron::Net"

  # This is the subnet on which we will deploy our server.
  fixed_subnet:
    type: "OS::Neutron::Subnet"
    properties:
      cidr: 10.0.9.0/24
      network_id: { get_resource: fixed_net }
      dns_nameservers: [{ get_param: dns_nameserver }]

  # create a router attached to the external network provided as a
  # parameter to this stack.
  extrouter:
    type: "OS::Neutron::Router"
    properties:
      external_gateway_info:
        network:
          get_param: public_net_id

  # attach fixed_subnet to our extrouter router.
  extrouter_inside:
    type: "OS::Neutron::RouterInterface"
    properties:
      router_id:
        get_resource: extrouter
      subnet_id: { get_resource: fixed_subnet }

  ######################################################################
  #
  # security groups. we need to permit network traffic of various
  # sorts.
  #
  secgroup_base:
    type: "OS::Neutron::SecurityGroup"
    properties:
      rules:
        - protocol: icmp
        - protocol: tcp
          port_range_min: 22
          port_range_max: 22

  secgroup_compute:
    type: "OS::Neutron::SecurityGroup"
    properties:
      rules:
        - protocol: tcp
          port_range_min: 22
          port_range_max: 22
        - protocol: tcp
          port_range_min: 4001
          port_range_max: 4001
        - protocol: tcp
          port_range_min: 15000
          port_range_max: 15004

  ######################################################################
  #
  # master node. this boots the TORQUE server and scheduler and
  # registers each compute node with pbs_server.
  #
  master_node:
    type: "OS::Nova::Server"
    depends_on:
      - extrouter_inside
    properties:
      name: master
      image:
        get_param: image
      flavor:
        get_param: flavor
      key_name:
        get_param: key_name
      user_data_format: RAW
      user_data:
        str_replace:
          template: |
            #!/bin/bash
            yum -y upgrade
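            # The baseurl below points at a site-local mirror (a private
            # 192.168.x.x address); adjust it to a TORQUE repository that is
            # reachable from your cloud before using this template.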
            cat > /etc/yum.repos.d/torque.repo << EOF
            [torque]
            name=torque
            baseurl=http://192.168.95.200/install/post/otherpkgs/el7/torque
            enabled=1
            gpgcheck=0
            EOF
            yum -y install torque-server torque-scheduler
            chkconfig trqauthd on
            chkconfig pbs_server on
            chkconfig pbs_sched on
            myip=$(ip addr show eth0 | awk '$1 == "inet" {print $2}' | cut -f1 -d/)
            myip_last_octet=${myip##*.}
            echo $myip `hostname` >> /etc/hosts
            mkdir -p /var/spool/torque/server_priv
            echo $myip > /var/spool/torque/server_name
            rm -rf /var/spool/torque/server_priv/nodes
            mkdir -p /var/spool/torque/checkpoint
            pbs_server -t create -f
            service trqauthd restart
            service pbs_server restart
            service pbs_sched restart
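            # $COMP_NODE_ADDRESSES and $COMP_NODE_NAMES are substituted by
            # str_replace with comma-joined lists collected from the compute
            # node resource group; split them back into arrays and register
            # each node with pbs_server.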
IFS="," read -a array1 <<< "$COMP_NODE_ADDRESSES"
IFS="," read -a array2 <<< "$COMP_NODE_NAMES"
length=${#array1[@]}
for ((i=0;i<$length;i++)); do
comp_ip=${array1[$i]}
comp_name=${array2[$i]}
echo -e "$comp_ip $comp_name" >> /etc/hosts
qmgr -c "c n $comp_name"
pbsnodes -c $comp_name
done
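            # Create a simple execution queue and enable scheduling so the
            # sample job below has somewhere to run.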
qmgr -c "c q testq"
qmgr -c "s q testq queue_type=e"
qmgr -c "s q testq enabled=t"
qmgr -c "s q testq started=t"
qmgr -c "s s scheduling=true"
qmgr -c "s s default_queue=testq"
groupadd -g 4001 testuser
useradd -g 4001 -u 4001 -m testuser
cat > ~testuser/torque.script << EOF
#!/bin/bash
#PBS -S /bin/bash
#PBS -N test.job
#PBS -l nodes=1:ppn=1
echo Starting Job, sleeping ...
sleep 1200
echo Done
EOF
chown testuser:testuser ~testuser/torque.script
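            # Report success back to Heat: $WAIT_HANDLE is substituted with
            # the wait condition handle URL, which accepts a JSON status
            # document uploaded via HTTP PUT (curl -T).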
            cat > /tmp/wait-data << EOF
            {
              "Status" : "SUCCESS",
              "Reason" : "Setup Complete",
              "UniqueId" : "None",
              "Data" : "OK"
            }
            EOF
            curl -T /tmp/wait-data '$WAIT_HANDLE'
          params:
            "$COMP_NODE_ADDRESSES":
              list_join: [",", {get_attr: [compute_nodes, compute_node_ip]}]
            "$COMP_NODE_NAMES":
              list_join: [",", {get_attr: [compute_nodes, compute_node_name]}]
            "$WAIT_HANDLE":
              get_resource: master_wait_handle
      networks:
        - port:
            get_resource: master_node_eth0

  master_node_eth0:
    type: "OS::Neutron::Port"
    properties:
      network_id:
        get_resource: fixed_net
      security_groups:
        - get_resource: secgroup_base
        - get_resource: secgroup_compute
      fixed_ips:
        - subnet_id:
            get_resource: fixed_subnet

  master_node_floating:
    type: "OS::Neutron::FloatingIP"
    depends_on:
      - extrouter_inside
    properties:
      floating_network_id:
        get_param: public_net_id
      port_id:
        get_resource: master_node_eth0

  compute_nodes:
    type: "OS::Heat::ResourceGroup"
    depends_on:
      - extrouter_inside
    properties:
      count: {get_param: number_of_compute_nodes}
      resource_def:
        type: compute_node.yaml
        properties:
          index_name: "%index%"
          host_prefix: {get_param: host_prefix}
          key_name: {get_param: key_name}
          image: {get_param: image}
          flavor: {get_param: flavor}
          private_net_id: {get_resource: fixed_net}
          private_subnet_id: {get_resource: fixed_subnet}
          public_net_id: {get_param: public_net_id}
          master_node_ip: {get_attr: [master_node_eth0, fixed_ips, 0, ip_address]}
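
  # compute_node.yaml is not shown here; it must live alongside this file (or
  # be mapped in an environment file) so Heat can load it as a nested stack.
  # From the properties passed above and the attributes read below, its
  # interface presumably looks roughly like this (a sketch, not the real file):
  #
  #   parameters:
  #     index_name: {type: string}
  #     host_prefix: {type: string}
  #     key_name: {type: string}
  #     image: {type: string}
  #     flavor: {type: string}
  #     private_net_id: {type: string}
  #     private_subnet_id: {type: string}
  #     public_net_id: {type: string}
  #     master_node_ip: {type: string}
  #   outputs:
  #     compute_node_ip: ...
  #     compute_node_name: ...
  #     compute_node_external_ip: ...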

outputs:
  master_node_external:
    value: {get_attr: [master_node_floating, floating_ip_address]}
  compute_node_ips:
    value: {get_attr: [compute_nodes, compute_node_ip]}
  compute_node_names:
    value: {get_attr: [compute_nodes, compute_node_name]}
  compute_node_external:
    value: {get_attr: [compute_nodes, compute_node_external_ip]}
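
# Once the stack is CREATE_COMPLETE, a quick smoke test might look like the
# following (a sketch; the stack name, image login user, and key path are
# assumptions, not part of this template):
#
#   heat stack-show hpc-cluster        # read master_node_external from outputs
#   ssh -i ~/.ssh/my-keypair centos@<floating-ip>
#   sudo su - testuser
#   qsub torque.script                 # submit the sample job created above
#   qstat                              # it should show up queued or running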