heat_template_version: 2014-10-16

description: >
  This template will boot an HPC cluster with one or more compute
  nodes (as specified by the number_of_compute_nodes parameter, which
  defaults to "10").

parameters:
  #
  # REQUIRED PARAMETERS
  #
  key_name:
    type: string
    description: name of ssh key to be provisioned on our server

  public_net_id:
    type: string
    description: uuid of a network to use for floating ip addresses

  #
  # OPTIONAL PARAMETERS
  #
  image:
    type: string
    default: centos7
    description: glance image used to boot the server

  host_prefix:
    type: string
    default: compute
    description: prefix for the hostnames of the compute nodes

  flavor:
    type: string
    default: m1.small
    description: flavor to use when booting the server

  dns_nameserver:
    type: string
    description: address of a dns nameserver reachable in your environment
    default: 8.8.8.8

  number_of_compute_nodes:
    type: string
    description: how many compute nodes to spawn
    default: 10

resources:

  # the master node signals this handle at the end of its user_data
  # script, so the stack will not report CREATE_COMPLETE until Torque
  # setup has finished (or the 1200 second timeout expires).
  master_wait_handle:
    type: "AWS::CloudFormation::WaitConditionHandle"

  master_wait_condition:
    type: "AWS::CloudFormation::WaitCondition"
    depends_on:
      - master_node
    properties:
      Handle:
        get_resource: master_wait_handle
      Timeout: "1200"

  ######################################################################
  #
  # network resources.  allocate a network and router for our server.
  # it would also be possible to take advantage of existing network
  # resources (and have the deployer provide network and subnet ids,
  # etc, as parameters), but I wanted to minimize the amount of
  # configuration necessary to make this go.
  #

  fixed_net:
    type: "OS::Neutron::Net"

  # This is the subnet on which we will deploy our server.
  fixed_subnet:
    type: "OS::Neutron::Subnet"
    properties:
      cidr: 10.0.9.0/24
      network_id: { get_resource: fixed_net }
      dns_nameservers: [{ get_param: dns_nameserver }]

  # create a router attached to the external network provided as a
  # parameter to this stack.
  extrouter:
    type: "OS::Neutron::Router"
    properties:
      external_gateway_info:
        network:
          get_param: public_net_id

  # attach fixed_subnet to our extrouter router.
  extrouter_inside:
    type: "OS::Neutron::RouterInterface"
    properties:
      router_id:
        get_resource: extrouter
      subnet_id: { get_resource: fixed_subnet }

  ######################################################################
  #
  # security groups.  we need to permit network traffic of various
  # sorts.
  #

  secgroup_base:
    type: "OS::Neutron::SecurityGroup"
    properties:
      rules:
        - protocol: icmp
        - protocol: tcp
          port_range_min: 22
          port_range_max: 22

  secgroup_compute:
    type: "OS::Neutron::SecurityGroup"
    properties:
      rules:
        - protocol: tcp
          port_range_min: 22
          port_range_max: 22
        - protocol: tcp
          port_range_min: 4001
          port_range_max: 4001
        - protocol: tcp
          port_range_min: 15000
          port_range_max: 15004
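  # NOTE (an assumption, not stated in the original template): ports
  # 15001-15004 are the default Torque daemon ports (pbs_server,
  # pbs_mom, the MOM resource manager, and pbs_sched), which is
  # presumably why the 15000-15004 range is opened here.  The template
  # does not say what port 4001 is used for.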
  ######################################################################
  #
  # master node.  this boots the cluster head node and sets up the
  # Torque server and scheduler.
  #

  master_node:
    type: "OS::Nova::Server"
    depends_on:
      - extrouter_inside
    properties:
      name: master
      image:
        get_param: image
      flavor:
        get_param: flavor
      key_name:
        get_param: key_name
      user_data_format: RAW
      user_data:
        str_replace:
          template: |
            #!/bin/bash

            yum -y upgrade

            # configure a yum repository that provides the torque packages
            cat > /etc/yum.repos.d/torque.repo << EOF
            [torque]
            name=torque
            baseurl=http://192.168.95.200/install/post/otherpkgs/el7/torque
            enabled=1
            gpgcheck=0
            EOF

            yum -y install torque-server torque-scheduler

            chkconfig trqauthd on
            chkconfig pbs_server on
            chkconfig pbs_sched on

            # determine our own address and register it in /etc/hosts
            # and as the torque server name
            myip=$(ip addr show eth0 | awk '$1 == "inet" {print $2}' | cut -f1 -d/)
            myip_last_octet=${myip##*.}
            echo $myip `hostname` >> /etc/hosts

            mkdir -p /var/spool/torque/server_priv
            echo $myip > /var/spool/torque/server_name
            rm -rf /var/spool/torque/server_priv/nodes
            mkdir -p /var/spool/torque/checkpoint

            # initialize the server database and start the daemons
            pbs_server -t create -f
            service trqauthd restart
            service pbs_server restart
            service pbs_sched restart

            # register each compute node with the torque server
            IFS="," read -a array1 <<< "$COMP_NODE_ADDRESSES"
            IFS="," read -a array2 <<< "$COMP_NODE_NAMES"
            length=${#array1[@]}
            for ((i=0;i<$length;i++)); do
              comp_ip=${array1[$i]}
              comp_name=${array2[$i]}
              echo -e "$comp_ip $comp_name" >> /etc/hosts
              qmgr -c "c n $comp_name"
              pbsnodes -c $comp_name
            done

            # create a default queue and enable scheduling
            qmgr -c "c q testq"
            qmgr -c "s q testq queue_type=e"
            qmgr -c "s q testq enabled=t"
            qmgr -c "s q testq started=t"
            qmgr -c "s s scheduling=true"
            qmgr -c "s s default_queue=testq"

            # create a test user with a sample job script
            groupadd -g 4001 testuser
            useradd -g 4001 -u 4001 -m testuser

            cat > ~testuser/torque.script << EOF
            #!/bin/bash
            #PBS -S /bin/bash
            #PBS -N test.job
            #PBS -l nodes=1:ppn=1
            echo Starting Job, sleeping ...
            sleep 1200
            echo Done
            EOF
            chown testuser:testuser ~testuser/torque.script

            # signal the wait condition so heat knows setup is complete
            cat > /tmp/wait-data << EOF
            {
              "Status" : "SUCCESS",
              "Reason" : "Setup Complete",
              "UniqueId" : "None",
              "Data" : "OK"
            }
            EOF
            curl -T /tmp/wait-data '$WAIT_HANDLE'
          params:
            "$COMP_NODE_ADDRESSES":
              list_join: [",", {get_attr: [compute_nodes, compute_node_ip]}]
            "$COMP_NODE_NAMES":
              list_join: [",", {get_attr: [compute_nodes, compute_node_name]}]
            "$WAIT_HANDLE":
              get_resource: master_wait_handle
      networks:
        - port:
            get_resource: master_node_eth0

  master_node_eth0:
    type: "OS::Neutron::Port"
    properties:
      network_id:
        get_resource: fixed_net
      security_groups:
        - get_resource: secgroup_base
        - get_resource: secgroup_compute
      fixed_ips:
        - subnet_id:
            get_resource: fixed_subnet

  master_node_floating:
    type: "OS::Neutron::FloatingIP"
    depends_on:
      - extrouter_inside
    properties:
      floating_network_id:
        get_param: public_net_id
      port_id:
        get_resource: master_node_eth0

  ######################################################################
  #
  # compute nodes.  deployed from the nested template
  # compute_node.yaml, one per number_of_compute_nodes.
  #

  compute_nodes:
    type: "OS::Heat::ResourceGroup"
    depends_on:
      - extrouter_inside
    properties:
      count: {get_param: number_of_compute_nodes}
      resource_def:
        type: compute_node.yaml
        properties:
          index_name: "%index%"
          host_prefix: {get_param: host_prefix}
          key_name: {get_param: key_name}
          image: {get_param: image}
          flavor: {get_param: flavor}
          private_net_id: {get_resource: fixed_net}
          private_subnet_id: {get_resource: fixed_subnet}
          public_net_id: {get_param: public_net_id}
          master_node_ip: {get_attr: [master_node_eth0, fixed_ips, 0, ip_address]}

outputs:

  master_node_external:
    value: {get_attr: [master_node_floating, floating_ip_address]}

  compute_node_ips:
    value: {get_attr: [compute_nodes, compute_node_ip]}

  compute_node_names:
    value: {get_attr: [compute_nodes, compute_node_name]}

  compute_node_external:
    value: {get_attr: [compute_nodes, compute_node_external_ip]}
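######################################################################
#
# NOTE: the compute_nodes resource group above delegates per-node
# configuration to a nested template, compute_node.yaml, which is not
# included in this file.  The sketch below is an assumption based
# only on how the nested template is used here: it must accept the
# nine properties passed in via resource_def and expose the three
# attributes read by the outputs (compute_node_ip, compute_node_name,
# compute_node_external_ip).  It is kept commented out so this file
# remains a single valid template; the resource bodies are
# illustrative, not the original file.
#
#   heat_template_version: 2014-10-16
#
#   parameters:
#     index_name: {type: string}
#     host_prefix: {type: string}
#     key_name: {type: string}
#     image: {type: string}
#     flavor: {type: string}
#     private_net_id: {type: string}
#     private_subnet_id: {type: string}
#     public_net_id: {type: string}
#     master_node_ip: {type: string}
#
#   resources:
#     node_eth0:
#       type: "OS::Neutron::Port"
#       properties:
#         network_id: {get_param: private_net_id}
#         fixed_ips:
#           - subnet_id: {get_param: private_subnet_id}
#
#     node_floating:
#       type: "OS::Neutron::FloatingIP"
#       properties:
#         floating_network_id: {get_param: public_net_id}
#         port_id: {get_resource: node_eth0}
#
#     node:
#       type: "OS::Nova::Server"
#       properties:
#         name:
#           str_replace:
#             template: "$prefix$index"
#             params:
#               "$prefix": {get_param: host_prefix}
#               "$index": {get_param: index_name}
#         image: {get_param: image}
#         flavor: {get_param: flavor}
#         key_name: {get_param: key_name}
#         networks:
#           - port: {get_resource: node_eth0}
#
#   outputs:
#     compute_node_ip:
#       value: {get_attr: [node_eth0, fixed_ips, 0, ip_address]}
#     compute_node_name:
#       value:
#         str_replace:
#           template: "$prefix$index"
#           params:
#             "$prefix": {get_param: host_prefix}
#             "$index": {get_param: index_name}
#     compute_node_external_ip:
#       value: {get_attr: [node_floating, floating_ip_address]}
#
# The nested template would also use master_node_ip (e.g. to point
# each node's pbs_mom at the torque server); that user_data script is
# omitted here.  To launch the stack, assuming this template is saved
# as hpc.yaml with compute_node.yaml alongside it (both file names
# are placeholders), something like:
#
#   heat stack-create my-hpc-cluster -f hpc.yaml \
#     -P "key_name=mykey;public_net_id=<external-net-uuid>"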