Add Cuda11 setup scripts

2025-08-09 21:10:15 +00:00 · 2020-06-17 15:45:30 -04:00
parent a93c7c9dcb
commit 05f4119eae
3 changed files with 316 additions and 0 deletions
--- a/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
@@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# Copyright (C) 2018 International Business Machines
+# Eclipse Public License, Version 1.0 (EPL-1.0)
+#     <http://www.eclipse.org/legal/epl-v10.html>
+#
+# 2018-03-21 GONG Jie <gongjie@linux.vnet.ibm.com>
+# 2018-04-24 Matt Ezell <ezellma@ornl.gov>
+#
+# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
+# on RHEL 7. Please refer document below for details.
+#
+# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
+#
+
+umask 0022
+
+# Disable a udev rule installed by default in some Linux distributions that cause hot-pluggable
+# memory to be automatically onlined when it is physically probed.
+#
+# The overrides for /lib/udev rules should be done in /etc/udev
+#
+UDEV_REDHAT_SOURCE=${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules
+UDEV_REDHAT_TARGET=${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules
+
+# If the file does not exist in /etc/udev, copy it from /lib/udev
+if [ ! -e ${UDEV_REDHAT_TARGET} ]; then
+  cp -n ${UDEV_REDHAT_SOURCE} ${UDEV_REDHAT_TARGET}
+fi
+
+# Disable udev memory auto-onlining Rule for cuda10.x
+#
+# For RHELS 7.5 ALT
+#
+sed -i "s/^\(SUBSYSTEM==\"memory\".*\)/#\1/" ${UDEV_REDHAT_TARGET}
+#
+# For RHELS 7.6 ALT
+#
+if [[ `grep 'Memory hotadd request' ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null && grep 'LABEL="memory_hotplug_end' ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null; echo $?` == 0 ]]; then
+    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
+    # Comment out the memory hotadd request (for reference)
+    if [[ `grep "## Memory hotadd request" ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null; echo $?` != 0 ]]; then
+        # but only run one time, not if it's already commented out. (to handle multiple genimage calls)
+        #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
+        # RH76 CUDA doc recommends the following:
+        sed -i s/^\SUBSYSTEM!=\"memory\"/SUBSYSTEM==\"\*\"/ ${UDEV_REDHAT_TARGET}
+        sed -i s/^\ACTION!=\"add\"/ACTION==\"\*\"/ /tmp/40-redhat.rules ${UDEV_REDHAT_TARGET}
+    fi 
+fi
+
+echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
+diff ${UDEV_REDHAT_SOURCE} ${UDEV_REDHAT_TARGET}
+
+# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d
+
+echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/gpusupport.conf
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >${IMG_ROOTIMGDIR}/etc/modprobe.d/gpusupport.conf
+
+grep nouveau ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+if (( $? ))
+then
+echo 'blacklist nouveau' >> ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+fi
+
+grep nouveau ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+if (( $? ))
+then
+echo 'blacklist nouveau' >> ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+fi
+
+# This is for nvprof (per George Chochia)
+grep NVreg_RestrictProfilingToAdminUsers ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+if (( $? ))
+then
+  echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >> ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+fi
+
+grep NVreg_RestrictProfilingToAdminUsers ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+if (( $? ))
+then
+  echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >> ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+fi
+
+#for cuda nvidia installation
+# NOTE(review): ARCH is presumably cleared so an inherited value cannot leak
+# into the driver module build -- confirm.
+unset ARCH
+# Kernel release reported by the real uname, captured before uname is wrapped.
+installed_kernel=$(uname -r)
+
+#######################################
+# Replace <root>/bin/uname with a small wrapper script.
+#
+# The real binary is kept as <root>/bin/uname.xcat. The wrapper answers:
+#   -r        the newest entry of /lib/modules (version sort), resolved once
+#             when the wrapper is generated and stored in it as a literal
+#   -m        the %{arch} of the kernel-<that release> rpm
+#   -s / ""   "Linux"
+# Any other argument prints nothing; the wrapper always exits 0.
+#
+# Arguments: $1 - root directory prefix (empty string for the running system)
+#######################################
+function hack_uname()
+{
+    local root_dir="$1"
+    local newest_kernel
+
+    # Keep an already saved copy: if a previous run was interrupted before
+    # uname was restored, moving again would overwrite the real binary with
+    # the wrapper and lose it.
+    if [ ! -e "${root_dir}/bin/uname.xcat" ]; then
+        mv "${root_dir}/bin/uname" "${root_dir}/bin/uname.xcat"
+    fi
+
+    # Highest version directory name under /lib/modules (empty if none).
+    newest_kernel="$(ls /lib/modules | sort -V | tail -n 1)"
+
+    # Unquoted heredoc: \$ is written literally for the wrapper to evaluate,
+    # while ${newest_kernel} is substituted now.
+    cat <<EOF >"${root_dir}/bin/uname"
+#!/bin/sh
+  case "\$1" in
+    "-m")
+        ARCH="\$(rpm -q kernel-\$("\$0" -r) --qf '%{arch}' 2>/dev/null)"
+        echo "\$ARCH"
+        ;;
+    "-r")
+        kernel_version="${newest_kernel}"
+        echo "\$kernel_version"
+        ;;
+    "-s"|"")
+        echo "Linux"
+        ;;
+  esac
+  exit 0
+EOF
+
+    chmod 0755 "${root_dir}/bin/uname"
+}
+
+
+if [ -z "${IMG_ROOTIMGDIR}" ] 
+then
+	kernel_version="$(for d in $(ls /lib/modules | sort -V) ; do : ; done && echo $d)"
+	mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
+        hack_uname ${IMG_ROOTIMGDIR}
+else
+        hack_uname
+fi
+
+updated_kernel=`uname -r`
+
+dnf -y module install nvidia-driver:latest-dkms
+dnf -y install cuda
+
+#depmod nvidia_rsync_mem used installed kernel not updated kernel
+if [ ${updated_kernel} != ${installed_kernel} ]
+then
+    ln -s /lib/modules/${updated_kernel} /lib/modules/${installed_kernel}
+fi
+dnf -y install nvidia_peer_memory
+dnf -y install nv_rsync_mem
+dnf -y install nvidia_gdrcopy_kernel
+if [ ${updated_kernel} != ${installed_kernel} ]
+then
+    rm -rf /lib/modules/${installed_kernel}
+fi
+
+[ ! -z "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
+$CHROOTCMD /bin/bash -c "systemctl enable nvidia-persistenced"
+
+
+[ ! -z "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
+$CHROOTCMD /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service"
+
+if [ -z "${IMG_ROOTIMGDIR}" ] 
+then
+    mv "${IMG_ROOTIMGDIR}/bin/uname.xcat" "${IMG_ROOTIMGDIR}/bin/uname"
+else
+    mv "/bin/uname.xcat" "/bin/uname"
+fi
+
--- a/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
@@ -0,0 +1,66 @@
+cuda setup scripts
+==================
+
+This section documents the installation of NVIDIA CUDA Toolkit v11 on a POWER9 rhels8.1 system.
+
+Diskful images
+--------------
+
+The following ``cudafull`` osimage definition is created from the base ``rhels8.1-ppc64le-install-compute`` osimage. ::
+
+ # lsdef -t osimage rhels8.1.0-ppc64le-install-cudafull
+ Object name: rhels8.1.0-ppc64le-install-cudafull
+    imagetype=linux
+    osarch=ppc64le
+    osdistroname=rhels8.1.0-ppc64le
+    osname=Linux
+    osvers=rhels8.1.0
+    otherpkgdir=/install/post/otherpkgs/rhels8.1.0/ppc64le
+    pkgdir=/install/rhels8.1.0/ppc64le,/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le,/install/cuda11/ppc64le/cuda-deps
+    pkglist=/opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist
+    postscripts=custom.ps/nvidia/postinstall/cuda11_power9_setup
+    profile=compute
+    provmethod=install
+    template=/opt/xcat/share/xcat/install/rh/compute.rhels8.tmpl
+
+Postscripts
+^^^^^^^^^^^
+
+xCAT provides the ``cuda_power9_setup`` postscript to apply the additional configuration needed to install the NVIDIA POWER9 CUDA driver. With ``CUDA11``, installing the nvidia-driver modules through kickstart has issues. To work around this problem, xCAT provides another postscript, ``cuda11_power9_setup``: the CUDA packages are installed by this postscript instead of from the package list. This applies only to diskful installations.
+
+CUDA dependencies
+^^^^^^^^^^^^^^^^^
+
+``dkms`` and ``opencl-filesystem`` are required for CUDA11. The ``dkms`` package can be found in the rhel8 EPEL repository. The ``opencl-filesystem`` package can be found in the rhel8 CodeReady repository. ::
+ 
+  # ls -ltr /install/cuda11/ppc64le/cuda-deps
+  -rw-r--r-- 1 root root 82332 Jun 16 10:25 dkms-2.8.1-4.20200214git5ca628c.el8.noarch.rpm
+  -rw-r--r-- 1 root root 90148 Jun 16 10:29 opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm
+  -rw-r--r-- 1 root root  8668 Jun 16 10:29 opencl-filesystem-1.0-6.el8.noarch.rpm
+  drwxr-xr-x 2 root root  4096 Jun 16 15:10 repodata
+
+CUDA Packages
+^^^^^^^^^^^^^
+
+``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for the above osimage and is distributed in the ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le`` directory.
+Besides the rhels8 base package list, the following packages also need to be added. ::
+
+  # diff /opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist /opt/xcat/share/xcat/install/rh/compute.rhels8.pkglist
+  12,27d11
+  < libtirpc
+  < yum
+  < rpm
+  < dnf
+  < dracut
+  < dracut-network
+  < e2fsprogs
+  < #For Cuda
+  < kernel-devel
+  < kernel-headers
+  < make
+  < gcc
+  < pciutils
+  < dkms
+  < opencl-filesystem
+
+NOTE: The two scripts in this directory were verified with the HPC service stack software.
--- a/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup
+++ b/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Copyright (C) 2018 International Business Machines
+# Eclipse Public License, Version 1.0 (EPL-1.0)
+#     <http://www.eclipse.org/legal/epl-v10.html>
+#
+# 2018-03-21 GONG Jie <gongjie@linux.vnet.ibm.com>
+# 2018-04-24 Matt Ezell <ezellma@ornl.gov>
+#
+# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
+# on RHEL 7. Please refer document below for details.
+#
+# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
+#
+
+umask 0022
+
+[ ! -z "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
+$CHROOTCMD /bin/bash -c "systemctl enable nvidia-persistenced"
+
+
+[ ! -z "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
+$CHROOTCMD /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service"
+
+# Disable a udev rule installed by default in some Linux distributions that cause hot-pluggable
+# memory to be automatically onlined when it is physically probed.
+#
+# The overrides for /lib/udev rules should be done in /etc/udev
+#
+UDEV_REDHAT_SOURCE=${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules
+UDEV_REDHAT_TARGET=${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules
+
+# If the file does not exist in /etc/udev, copy it from /lib/udev
+if [ ! -e ${UDEV_REDHAT_TARGET} ]; then
+  cp -n ${UDEV_REDHAT_SOURCE} ${UDEV_REDHAT_TARGET}
+fi
+
+# Disable udev memory auto-onlining Rule for cuda10.x
+#
+# For RHELS 7.5 ALT
+#
+sed -i "s/^\(SUBSYSTEM==\"memory\".*\)/#\1/" ${UDEV_REDHAT_TARGET}
+#
+# For RHELS 7.6 ALT
+#
+if [[ `grep 'Memory hotadd request' ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null && grep 'LABEL="memory_hotplug_end' ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null; echo $?` == 0 ]]; then
+    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
+    # Comment out the memory hotadd request (for reference)
+    if [[ `grep "## Memory hotadd request" ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null; echo $?` != 0 ]]; then
+        # but only run one time, not if it's already commented out. (to handle multiple genimage calls)
+        #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
+        # RH76 CUDA doc recommends the following:
+        sed -i s/^\SUBSYSTEM!=\"memory\"/SUBSYSTEM==\"\*\"/ ${UDEV_REDHAT_TARGET}
+        sed -i s/^\ACTION!=\"add\"/ACTION==\"\*\"/ /tmp/40-redhat.rules ${UDEV_REDHAT_TARGET}
+    fi 
+fi
+
+echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
+diff ${UDEV_REDHAT_SOURCE} ${UDEV_REDHAT_TARGET}
+
+# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d
+
+echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/gpusupport.conf
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >${IMG_ROOTIMGDIR}/etc/modprobe.d/gpusupport.conf
+
+grep nouveau ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+if (( $? ))
+then
+echo 'blacklist nouveau' >> ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+fi
+
+grep nouveau ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+if (( $? ))
+then
+echo 'blacklist nouveau' >> ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+fi
+
+# This is for nvprof (per George Chochia)
+grep NVreg_RestrictProfilingToAdminUsers ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+if (( $? ))
+then
+  echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >> ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf
+fi
+
+grep NVreg_RestrictProfilingToAdminUsers ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+if (( $? ))
+then
+  echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >> ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf
+fi
+
+if [ -z "${IMG_ROOTIMGDIR}" ] 
+then
+	kernel_version="$(for d in $(ls /lib/modules | sort -V) ; do : ; done && echo $d)"
+	mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
+fi