diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup b/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
new file mode 100755
index 000000000..5a2a27f0c
--- /dev/null
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
@@ -0,0 +1,170 @@
+#!/bin/bash
+#
+# Copyright (C) 2018 International Business Machines
+# Eclipse Public License, Version 1.0 (EPL-1.0)
+#
+#
+# 2018-03-21 GONG Jie
+# 2018-04-24 Matt Ezell
+#
+# This script performs the extra setup steps for the NVIDIA POWER9 CUDA driver
+# and then installs the CUDA packages with dnf. Please refer to the document
+# below for details.
+#
+# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
+#
+# Environment:
+#   IMG_ROOTIMGDIR  Optional. When set, it is prefixed to the udev/modprobe
+#                   paths edited below and used as the chroot for systemctl.
+#
+
+umask 0022
+
+# Disable a udev rule installed by default in some Linux distributions that cause hot-pluggable
+# memory to be automatically onlined when it is physically probed.
+#
+# The overrides for /lib/udev rules should be done in /etc/udev
+#
+UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
+UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"
+
+# If the file does not exist in /etc/udev, copy it from /lib/udev
+if [ ! -e "${UDEV_REDHAT_TARGET}" ]; then
+    cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
+fi
+
+# Disable udev memory auto-onlining Rule for cuda10.x
+#
+# For RHELS 7.5 ALT
+#
+sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"
+#
+# For RHELS 7.6 ALT
+#
+if grep -q 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null &&
+    grep -q 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
+    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
+    # Comment out the memory hotadd request (for reference)
+    if ! grep -q "## Memory hotadd request" "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
+        # but only run one time, not if it's already commented out. (to handle multiple genimage calls)
+        #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
+        # RH76 CUDA doc recommends the following:
+        sed -i 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' "${UDEV_REDHAT_TARGET}"
+        sed -i 's/^ACTION!="add"/ACTION=="*"/' "${UDEV_REDHAT_TARGET}"
+    fi
+fi
+
+echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
+diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
+
+# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d
+
+echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
+for modprobe_dir in "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d" "${IMG_ROOTIMGDIR}/etc/modprobe.d"
+do
+    echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >"${modprobe_dir}/gpusupport.conf"
+
+    # Append each setting only once so that re-running the script does not
+    # duplicate lines in nvidia.conf.
+    if ! grep -q nouveau "${modprobe_dir}/nvidia.conf"
+    then
+        echo 'blacklist nouveau' >>"${modprobe_dir}/nvidia.conf"
+    fi
+
+    # This is for nvprof (per George Chochia)
+    if ! grep -q NVreg_RestrictProfilingToAdminUsers "${modprobe_dir}/nvidia.conf"
+    then
+        echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >>"${modprobe_dir}/nvidia.conf"
+    fi
+done
+
+#for cuda nvidia installation
+unset ARCH
+installed_kernel=$(uname -r)
+
+# Replace <root>/bin/uname with a stub and keep the real binary as uname.xcat.
+#   $1 - root directory prefix; empty or omitted means the running system.
+# The stub answers "-m" from the rpm database, "-r" with the highest version
+# directory found in /lib/modules and "-s" (or no argument) with "Linux".
+# NOTE(review): the /lib/modules lookup for "-r" is not escaped in the
+# here-document, so it is evaluated when the stub is written, not when the
+# stub runs - confirm that this is intended.
+function hack_uname()
+{
+    local root="${1:-}"
+
+    # A previous run may have left the stub in place; moving it again would
+    # overwrite the saved real uname with the stub.
+    [ -f "${root}/bin/uname.xcat" ] && return 0
+
+    mv "${root}/bin/uname" "${root}/bin/uname.xcat" || return 1
+    cat <<-EOF >"${root}/bin/uname"
+#!/bin/sh
+    case "\$1" in
+    "-m")
+        ARCH="\$(rpm -q kernel-\$("\$0" -r) --qf '%{arch}' 2>/dev/null)"
+        echo "\$ARCH"
+        ;;
+    "-r")
+        kernel_version="$(for d in $(ls /lib/modules | sort -V) ; do : ; done && echo $d)"
+        echo "\$kernel_version"
+        ;;
+    "-s"|"")
+        echo "Linux"
+        ;;
+    esac
+    exit 0
+EOF
+
+    chmod 0755 "${root}/bin/uname"
+}
+
+
+if [ -z "${IMG_ROOTIMGDIR}" ]
+then
+    kernel_version="$(for d in $(ls /lib/modules | sort -V) ; do : ; done && echo $d)"
+    mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
+    hack_uname "${IMG_ROOTIMGDIR}"
+else
+    # NOTE(review): no root is passed here, so /bin/uname of the running
+    # system is replaced in this branch as well - confirm that this is intended.
+    hack_uname
+fi
+
+updated_kernel=$(uname -r)
+
+dnf -y module install nvidia-driver:latest-dkms
+dnf -y install cuda
+
+#depmod nvidia_rsync_mem used installed kernel not updated kernel
+# Temporarily expose the updated kernel's modules under the installed kernel's
+# name. Never touch a real directory at that path: the link is only created
+# (and later removed) when the path is absent or already a symlink.
+compat_modules_link="/lib/modules/${installed_kernel}"
+compat_link_created=0
+if [ "${updated_kernel}" != "${installed_kernel}" ] &&
+    { [ -L "${compat_modules_link}" ] || [ ! -e "${compat_modules_link}" ]; }
+then
+    ln -sfn "/lib/modules/${updated_kernel}" "${compat_modules_link}" && compat_link_created=1
+fi
+dnf -y install nvidia_peer_memory
+dnf -y install nv_rsync_mem
+dnf -y install nvidia_gdrcopy_kernel
+if [ "${compat_link_created}" -eq 1 ]
+then
+    rm -f "${compat_modules_link}"
+fi
+
+[ -n "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
+# CHROOTCMD is left unquoted so that it splits into "chroot" and the directory.
+$CHROOTCMD /bin/bash -c "systemctl enable nvidia-persistenced"
+$CHROOTCMD /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service"
+
+# Put the real uname back.
+if [ -z "${IMG_ROOTIMGDIR}" ]
+then
+    mv "${IMG_ROOTIMGDIR}/bin/uname.xcat" "${IMG_ROOTIMGDIR}/bin/uname"
+else
+    mv "/bin/uname.xcat" "/bin/uname"
+fi
+
diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
new file mode 100644
index 000000000..d6bb77371
--- /dev/null
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
@@ -0,0 +1,73 @@
+cuda setup scripts
+==================
+
+This sample documents installation of the NVIDIA CUDA Toolkit v11 on IBM POWER9 servers as part of xCAT diskful provisioning of Red Hat Enterprise Linux 8.1.
+For ``CUDA11``, there is a known issue that prevents successful installation of the nvidia-drivers module as part of the operating system kickstart install process used by diskful provisioning.
+Diskless provisioning can still be performed using the traditional osimage method; these instructions apply to diskful provisioning only.
+
+
+Diskful images
+--------------
+
+For diskful provisioning, create a new ``cudafull`` osimage definition using the default ``rhels8.1-ppc64le-install-compute`` osimage as a starting point.
:: + + # lsdef -t osimage rhels8.1.0-ppc64le-install-cudafull + Object name: rhels8.1.0-ppc64le-install-cudafull + imagetype=linux + osarch=ppc64le + osdistroname=rhels8.1.0-ppc64le + osname=Linux + osvers=rhels8.1.0 + otherpkgdir=/install/post/otherpkgs/rhels8.1.0/ppc64le + pkgdir=/install/rhels8.1.0/ppc64le,/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le,/install/cuda11/ppc64le/cuda-deps + pkglist=/opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist + postscripts=custom.ps/nvidia/postinstall/cuda11_power9_setup + profile=compute + provmethod=install + template=/opt/xcat/share/xcat/install/rh/compute.rhels8.tmpl + + +Postscripts +^^^^^^^^^^^ + +For ``CUDA11``, there is a known issue that prevents successful installation of the nvidia-drivers module as part of the Red Hat kickstart install process used by diskful provisioning. As an example method to work around this problem, refer to the postscript named ``cuda11_power9_setup``. This postscript will install the NVIDIA CUDA packages directly instead of relying on the osimage package list mechanism. ``cuda11_power9_setup`` is only needed for diskful provisioning. + + +CUDA dependencies +^^^^^^^^^^^^^^^^^ + +``dkms`` and ``opencl-filesystem`` are required for CUDA11. ``dkms`` packages can be found in the rhel8 EPEL repository. ``opencl-filesystem`` packages can be found in the rhel8 CodeReady repository. 
:: + + # ls -ltr /install/cuda11/ppc64le/cuda-deps + -rw-r--r-- 1 root root 82332 Jun 16 10:25 dkms-2.8.1-4.20200214git5ca628c.el8.noarch.rpm + -rw-r--r-- 1 root root 90148 Jun 16 10:29 opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm + -rw-r--r-- 1 root root 8668 Jun 16 10:29 opencl-filesystem-1.0-6.el8.noarch.rpm + drwxr-xr-x 2 root root 4096 Jun 16 15:10 repodata + + +CUDA Packages +^^^^^^^^^^^^^ + +``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for the example ``cudafull`` osimage and the contents are copied into a directory named ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le``. +In addition to the rhels8 base pkglist, the following packages also need to be added. :: + + # diff /opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist /opt/xcat/share/xcat/install/rh/compute.rhels8.pkglist + 12,27d11 + < libtirpc + < yum + < rpm + < dnf + < dracut + < dracut-network + < e2fsprogs + < #For Cuda + < kernel-devel + < kernel-headers + < make + < gcc + < pciutils + < dkms + < opencl-filesystem + + +NOTE: The samples in this directory were verified as part of the IBM HPC POWER9 Clusters service pack testing.