From 05f4119eae9858ae0fb51eb161c6499b7928ab9b Mon Sep 17 00:00:00 2001
From: cxhong
Date: Wed, 17 Jun 2020 15:45:30 -0400
Subject: [PATCH] Add Cuda11 setup scripts

---
 .../xcat/samples/cuda11/cuda11_power9_setup   | 180 ++++++++++++++++++
 .../xcat/samples/cuda11/cuda11_setup.readme   |  66 +++++++
 .../xcat/samples/cuda11/cuda_power9_setup     |  96 ++++++++++
 3 files changed, 342 insertions(+)
 create mode 100755 xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
 create mode 100644 xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
 create mode 100755 xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup

diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup b/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
new file mode 100755
index 000000000..5a2a27f0c
--- /dev/null
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
@@ -0,0 +1,180 @@
+#!/bin/bash
+#
+# Copyright (C) 2018 International Business Machines
+# Eclipse Public License, Version 1.0 (EPL-1.0)
+#
+#
+# 2018-03-21 GONG Jie
+# 2018-04-24 Matt Ezell
+#
+# This script does the extra setup steps for the NVIDIA POWER9 CUDA driver and
+# installs the CUDA packages with dnf. Please refer to the document below for
+# details.
+#
+# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
+#
+# IMG_ROOTIMGDIR, when set, is prefixed to the configuration files edited below
+# and used as the chroot for "systemctl enable". uname and dnf always run on
+# the root file system this script is started from.
+#
+
+umask 0022
+
+# Disable a udev rule installed by default in some Linux distributions that cause hot-pluggable
+# memory to be automatically onlined when it is physically probed.
+#
+# The overrides for /lib/udev rules should be done in /etc/udev
+#
+UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
+UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"
+
+# If the file does not exist in /etc/udev, copy it from /lib/udev
+if [ ! -e "${UDEV_REDHAT_TARGET}" ]; then
+    cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
+fi
+
+# Disable udev memory auto-onlining Rule for cuda10.x
+#
+# For RHELS 7.5 ALT
+#
+sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"
+#
+# For RHELS 7.6 ALT
+#
+if grep -q 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null &&
+   grep -q 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
+    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
+    # Comment out the memory hotadd request (for reference)
+    if ! grep -q "## Memory hotadd request" "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
+        # but only run one time, not if it's already commented out. (to handle multiple genimage calls)
+        #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
+        # RH76 CUDA doc recommends the following:
+        sed -i 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' "${UDEV_REDHAT_TARGET}"
+        # sed -i edits every file operand, so only the /etc/udev override is named here.
+        sed -i 's/^ACTION!="add"/ACTION=="*"/' "${UDEV_REDHAT_TARGET}"
+    fi
+fi
+
+echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
+diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
+
+# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d
+
+# Append LINE to FILE unless FILE already contains a match for PATTERN.
+#   $1 - PATTERN (grep basic regular expression)
+#   $2 - LINE to append
+#   $3 - FILE
+function append_if_missing()
+{
+    grep -q -- "$1" "$3" || echo "$2" >> "$3"
+}
+
+echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
+for conf_dir in "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d" "${IMG_ROOTIMGDIR}/etc/modprobe.d"; do
+    echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >"${conf_dir}/gpusupport.conf"
+    append_if_missing nouveau 'blacklist nouveau' "${conf_dir}/nvidia.conf"
+    # This is for nvprof (per George Chochia)
+    append_if_missing NVreg_RestrictProfilingToAdminUsers "options nvidia NVreg_RestrictProfilingToAdminUsers=0" "${conf_dir}/nvidia.conf"
+done
+
+#for cuda nvidia installation
+unset ARCH
+installed_kernel=$(uname -r)
+
+# Print the highest version-sorted entry of /lib/modules (nothing if it is empty).
+function newest_kernel()
+{
+    ls /lib/modules | sort -V | tail -n 1
+}
+
+# Replace $1/bin/uname by a wrapper script; the real binary is kept as
+# $1/bin/uname.xcat. The wrapper answers
+#   -m  with the architecture of the kernel package of the version it reports
+#   -r  with the newest version in /lib/modules, looked up now (the value is
+#       expanded while the wrapper is written, not when it runs)
+#   -s  or no argument with "Linux"
+# and prints nothing for any other option. Returns non-zero without touching
+# anything when a backup already exists or the real binary cannot be moved.
+function hack_uname()
+{
+    local newest
+
+    if [ -e "$1/bin/uname.xcat" ]; then
+        echo "$1/bin/uname.xcat already exists, not replacing $1/bin/uname again" >&2
+        return 1
+    fi
+    mv "$1/bin/uname" "$1/bin/uname.xcat" || return 1
+    newest=$(newest_kernel)
+    cat <<EOF >"$1/bin/uname"
+#!/bin/sh
+case "\$1" in
+"-m")
+    ARCH="\$(rpm -q kernel-\$("\$0" -r) --qf '%{arch}' 2>/dev/null)"
+    echo "\$ARCH"
+    ;;
+"-r")
+    kernel_version="${newest}"
+    echo "\$kernel_version"
+    ;;
+"-s"|"")
+    echo "Linux"
+    ;;
+esac
+exit 0
+EOF
+
+    chmod 0755 "$1/bin/uname"
+}
+
+# Put the real uname back if hack_uname moved it away.
+function restore_uname()
+{
+    if [ -e /bin/uname.xcat ]; then
+        mv /bin/uname.xcat /bin/uname
+    fi
+}
+
+if [ -z "${IMG_ROOTIMGDIR}" ]; then
+    kernel_version=$(newest_kernel)
+    mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
+fi
+
+# The wrapper always replaces /bin/uname of the root this script runs on,
+# because "uname -r" and the dnf commands below are executed there.
+# The EXIT trap puts the real binary back even if the script stops early.
+trap restore_uname EXIT
+hack_uname ""
+
+updated_kernel=$(uname -r)
+
+dnf -y module install nvidia-driver:latest-dkms
+dnf -y install cuda
+
+#depmod nvidia_rsync_mem used installed kernel not updated kernel
+# Let /lib/modules/<installed kernel> point at the updated kernel while the
+# packages below are installed. The link is only made when nothing exists under
+# that name, so the cleanup can never delete a real modules directory.
+kernel_link=""
+if [ -n "${installed_kernel}" ] && [ -n "${updated_kernel}" ] &&
+   [ "${updated_kernel}" != "${installed_kernel}" ] &&
+   [ ! -e "/lib/modules/${installed_kernel}" ]; then
+    ln -s "/lib/modules/${updated_kernel}" "/lib/modules/${installed_kernel}" &&
+        kernel_link="/lib/modules/${installed_kernel}"
+fi
+dnf -y install nvidia_peer_memory
+dnf -y install nv_rsync_mem
+dnf -y install nvidia_gdrcopy_kernel
+if [ -n "${kernel_link}" ] && [ -L "${kernel_link}" ]; then
+    rm -f "${kernel_link}"
+fi
+
+# Enable the NVIDIA services, inside the image when IMG_ROOTIMGDIR is set.
+chroot_cmd=()
+if [ -n "${IMG_ROOTIMGDIR}" ]; then
+    chroot_cmd=(chroot "${IMG_ROOTIMGDIR}")
+fi
+"${chroot_cmd[@]}" /bin/bash -c "systemctl enable nvidia-persistenced"
+"${chroot_cmd[@]}" /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service"
+
+restore_uname
+
diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
new file mode 100644
index 000000000..5e888811b
--- /dev/null
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
@@ -0,0 +1,66 @@
+cuda setup scripts
+==================
+
+This section documents the installation of NVIDIA CUDA Toolkit v11 on a POWER9 rhels8.1 system.
+
+Diskful images
+--------------
+
+The following ``cudafull`` osimage definition will be created from the base ``rhels8.1-ppc64le-install-compute`` osimage. ::
+
+    # lsdef -t osimage rhels8.1.0-ppc64le-install-cudafull
+    Object name: rhels8.1.0-ppc64le-install-cudafull
+    imagetype=linux
+    osarch=ppc64le
+    osdistroname=rhels8.1.0-ppc64le
+    osname=Linux
+    osvers=rhels8.1.0
+    otherpkgdir=/install/post/otherpkgs/rhels8.1.0/ppc64le
+    pkgdir=/install/rhels8.1.0/ppc64le,/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le,/install/cuda11/ppc64le/cuda-deps
+    pkglist=/opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist
+    postscripts=custom.ps/nvidia/postinstall/cuda11_power9_setup
+    profile=compute
+    provmethod=install
+    template=/opt/xcat/share/xcat/install/rh/compute.rhels8.tmpl
+
+Postscripts
+^^^^^^^^^^^
+
+xCAT provides the ``cuda_power9_setup`` postscript to do the additional configuration needed to install the NVIDIA POWER9 CUDA driver. With ``CUDA11``, installing the nvidia-driver modules from kickstart does not work. To work around this problem, xCAT provides another postscript, ``cuda11_power9_setup``, which installs the CUDA packages itself instead of relying on the package list. This only applies to diskful installations.
+
+CUDA dependencies
+^^^^^^^^^^^^^^^^^
+
+``dkms`` and ``opencl-filesystem`` are required for CUDA 11. The ``dkms`` package can be found in the rhel8 EPEL repository. The ``opencl-filesystem`` package can be found in the rhel8 CodeReady repository. ::
+
+    # ls -ltr /install/cuda11/ppc64le/cuda-deps
+    -rw-r--r-- 1 root root 82332 Jun 16 10:25 dkms-2.8.1-4.20200214git5ca628c.el8.noarch.rpm
+    -rw-r--r-- 1 root root 90148 Jun 16 10:29 opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm
+    -rw-r--r-- 1 root root 8668 Jun 16 10:29 opencl-filesystem-1.0-6.el8.noarch.rpm
+    drwxr-xr-x 2 root root 4096 Jun 16 15:10 repodata
+
+CUDA Packages
+^^^^^^^^^^^^^
+
+``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for the above osimage and is distributed in the ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le`` directory.
+Besides the rhels8 base package list, the following packages also need to be added. ::
+
+    # diff /opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist /opt/xcat/share/xcat/install/rh/compute.rhels8.pkglist
+    12,27d11
+    < libtirpc
+    < yum
+    < rpm
+    < dnf
+    < dracut
+    < dracut-network
+    < e2fsprogs
+    < #For Cuda
+    < kernel-devel
+    < kernel-headers
+    < make
+    < gcc
+    < pciutils
+    < dkms
+    < opencl-filesystem
+
+NOTE: The two scripts in this directory were verified with the HPC service stack software.
diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup b/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup
new file mode 100755
index 000000000..ac7626290
--- /dev/null
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Copyright (C) 2018 International Business Machines
+# Eclipse Public License, Version 1.0 (EPL-1.0)
+#
+#
+# 2018-03-21 GONG Jie
+# 2018-04-24 Matt Ezell
+#
+# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
+# on RHEL 7. Please refer to the document below for details.
+#
+# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
+#
+# IMG_ROOTIMGDIR, when set, is prefixed to the configuration files edited below
+# and used as the chroot for "systemctl enable".
+#
+
+umask 0022
+
+# Enable the NVIDIA services, inside the image when IMG_ROOTIMGDIR is set.
+chroot_cmd=()
+if [ -n "${IMG_ROOTIMGDIR}" ]; then
+    chroot_cmd=(chroot "${IMG_ROOTIMGDIR}")
+fi
+"${chroot_cmd[@]}" /bin/bash -c "systemctl enable nvidia-persistenced"
+"${chroot_cmd[@]}" /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service"
+
+# Disable a udev rule installed by default in some Linux distributions that cause hot-pluggable
+# memory to be automatically onlined when it is physically probed.
+#
+# The overrides for /lib/udev rules should be done in /etc/udev
+#
+UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
+UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"
+
+# If the file does not exist in /etc/udev, copy it from /lib/udev
+if [ ! -e "${UDEV_REDHAT_TARGET}" ]; then
+    cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
+fi
+
+# Disable udev memory auto-onlining Rule for cuda10.x
+#
+# For RHELS 7.5 ALT
+#
+sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"
+#
+# For RHELS 7.6 ALT
+# (recognised by the "Memory hotadd request" comment and the memory_hotplug_end label)
+#
+if grep -q 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null &&
+   grep -q 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
+    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
+    # Comment out the memory hotadd request (for reference)
+    if ! grep -q "## Memory hotadd request" "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
+        # but only run one time, not if it's already commented out. (to handle multiple genimage calls)
+        #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
+        # RH76 CUDA doc recommends the following:
+        sed -i 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' "${UDEV_REDHAT_TARGET}"
+        # sed -i edits every file operand, so only the /etc/udev override is named here.
+        sed -i 's/^ACTION!="add"/ACTION=="*"/' "${UDEV_REDHAT_TARGET}"
+    fi
+fi
+
+echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
+diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
+
+# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d
+
+# Append LINE to FILE unless FILE already contains a match for PATTERN.
+#   $1 - PATTERN (grep basic regular expression)
+#   $2 - LINE to append
+#   $3 - FILE
+# Returns 0 when a match was found, otherwise the status of the append.
+function append_if_missing()
+{
+    grep -q -- "$1" "$3" || echo "$2" >> "$3"
+}
+
+echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >"${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/gpusupport.conf"
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >"${IMG_ROOTIMGDIR}/etc/modprobe.d/gpusupport.conf"
+
+append_if_missing nouveau 'blacklist nouveau' "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf"
+append_if_missing nouveau 'blacklist nouveau' "${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf"
+
+# This is for nvprof (per George Chochia)
+append_if_missing NVreg_RestrictProfilingToAdminUsers "options nvidia NVreg_RestrictProfilingToAdminUsers=0" "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf"
+append_if_missing NVreg_RestrictProfilingToAdminUsers "options nvidia NVreg_RestrictProfilingToAdminUsers=0" "${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf"
+
+# When IMG_ROOTIMGDIR is empty, rebuild the initramfs of the newest kernel
+# version found in /lib/modules.
+if [ -z "${IMG_ROOTIMGDIR}" ]; then
+    kernel_version=$(ls /lib/modules | sort -V | tail -n 1)
+    mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
+fi