2
0
mirror of https://github.com/xcat2/xcat-core.git synced 2025-05-31 10:06:39 +00:00

Add Cuda11 setup scripts

This commit is contained in:
cxhong 2020-06-17 15:45:30 -04:00
parent a93c7c9dcb
commit 05f4119eae
3 changed files with 316 additions and 0 deletions

View File

@ -0,0 +1,154 @@
#!/bin/bash
#
# Copyright (C) 2018 International Business Machines
# Eclipse Public License, Version 1.0 (EPL-1.0)
# <http://www.eclipse.org/legal/epl-v10.html>
#
# 2018-03-21 GONG Jie <gongjie@linux.vnet.ibm.com>
# 2018-04-24 Matt Ezell <ezellma@ornl.gov>
#
# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
# on RHEL 7. Please refer document below for details.
#
# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
#
umask 0022

# Disable the udev rule, installed by default in some Linux distributions, that
# causes hot-pluggable memory to be onlined automatically when it is probed.
#
# Overrides for /lib/udev rules belong in /etc/udev, so every edit below is
# applied to a copy of the stock rules file placed there. IMG_ROOTIMGDIR is
# prepended to both paths; when it is empty the running system is modified.
UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"

# Seed the override from /lib/udev only when /etc/udev has no copy yet.
if [ ! -e "${UDEV_REDHAT_TARGET}" ]; then
  cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
fi

if [ -f "${UDEV_REDHAT_TARGET}" ]; then
  # RHELS 7.5 ALT: comment out the SUBSYSTEM=="memory" auto-onlining rule.
  # Already-commented lines no longer match, so re-running is harmless.
  sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"

  # RHELS 7.6 ALT: the rules file is recognised by the presence of both the
  # "Memory hotadd request" section and the memory_hotplug_end label.
  if grep -qs 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" &&
    grep -qs 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}"; then
    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
    # Only run when the section has not already been commented out
    # (to handle multiple genimage calls).
    if ! grep -qs "## Memory hotadd request" "${UDEV_REDHAT_TARGET}"; then
      # Kept for reference: comment out the whole memory hotadd request.
      #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
      # RH76 CUDA doc recommends the following. Only the override file is
      # edited; no other file may be passed to sed here.
      sed -i 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' "${UDEV_REDHAT_TARGET}"
      sed -i 's/^ACTION!="add"/ACTION=="*"/' "${UDEV_REDHAT_TARGET}"
    fi
  fi

  # Show what was changed relative to the stock rules file.
  echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
  diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
else
  echo "WARNING: ${UDEV_REDHAT_TARGET} is missing; udev memory hotplug rules were not modified" >&2
fi
# Set the NVIDIA module parameters in both /usr/lib/modprobe.d and
# /etc/modprobe.d (below IMG_ROOTIMGDIR when that is set).
#   gpusupport.conf - rewritten on every run with the nvidia module options.
#   nvidia.conf     - appended to, and only when the setting is not there yet,
#                     so repeated runs do not duplicate lines.
echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
for modprobe_dir in "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d" "${IMG_ROOTIMGDIR}/etc/modprobe.d"; do
  echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >"${modprobe_dir}/gpusupport.conf"

  # -q/-s: only the exit status matters; a missing nvidia.conf simply means
  # the setting has to be added.
  if ! grep -qs nouveau "${modprobe_dir}/nvidia.conf"; then
    echo 'blacklist nouveau' >>"${modprobe_dir}/nvidia.conf"
  fi

  # This is for nvprof (per George Chochia)
  if ! grep -qs NVreg_RestrictProfilingToAdminUsers "${modprobe_dir}/nvidia.conf"; then
    echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >>"${modprobe_dir}/nvidia.conf"
  fi
done
# Preparation for the CUDA / NVIDIA package installation.
# Record the release reported by uname at this point, then drop any ARCH
# value from the shell.
# NOTE(review): presumably ARCH is cleared so an inherited value cannot
# influence the driver module build - confirm.
installed_kernel="$(uname -r)"
unset ARCH
#######################################
# Replace <root>/bin/uname with a small stub script and keep the real
# binary as <root>/bin/uname.xcat so it can be moved back afterwards.
#
# The stub answers:
#   -m        architecture of the kernel RPM matching the stub's own -r value
#   -r        the newest entry (version sort) of /lib/modules, determined
#             once, at the time the stub is generated
#   -s / none "Linux"
# Any other option prints nothing. The stub always exits 0.
#
# Arguments: $1 - root directory prefix; empty for the running system
#######################################
function hack_uname()
{
  local newest_kernel

  # Preserve the real binary only once: a second call must not overwrite
  # the saved copy with the stub written by the first call.
  if [ ! -e "$1/bin/uname.xcat" ]; then
    mv "$1/bin/uname" "$1/bin/uname.xcat"
  fi

  # Resolved now and embedded into the stub as a fixed value, so the stub
  # keeps reporting the same release while /lib/modules changes later on.
  newest_kernel="$(ls /lib/modules | sort -V | tail -n 1)"

  # Truncate rather than append so the stub is always exactly one script.
  # Escaped "\$" is evaluated when the stub runs, not when it is written.
  cat <<EOF >"$1/bin/uname"
#!/bin/sh
case "\$1" in
"-m")
ARCH="\$(rpm -q kernel-\$("\$0" -r) --qf '%{arch}' 2>/dev/null)"
echo "\$ARCH"
;;
"-r")
kernel_version="${newest_kernel}"
echo "\$kernel_version"
;;
"-s"|"")
echo "Linux"
;;
esac
exit 0
EOF
  chmod 0755 "$1/bin/uname"
}
# Diskful node (no image root): rebuild the initramfs for the newest kernel
# found in /lib/modules before the driver packages are installed.
if [ -z "${IMG_ROOTIMGDIR}" ]; then
  kernel_version="$(ls /lib/modules | sort -V | tail -n 1)"
  mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
fi

# The dnf commands below run against the running system, so the uname stub
# is always installed at /bin/uname (an empty root prefix), whether or not
# IMG_ROOTIMGDIR is set.
hack_uname ""
# With the stub in place this is the newest kernel in /lib/modules.
updated_kernel="$(uname -r)"

dnf -y module install nvidia-driver:latest-dkms
dnf -y install cuda

# depmod for nvidia_rsync_mem used the installed kernel, not the updated
# kernel, so make the updated kernel's modules reachable under the installed
# kernel's name while the remaining packages are installed.
# The link is created only when that path does not exist yet: linking into an
# existing directory would nest the link inside it, and the cleanup below
# would then have to choose between leaving it behind and deleting a real
# module tree.
compat_link=""
if [ "${updated_kernel}" != "${installed_kernel}" ] &&
  [ ! -e "/lib/modules/${installed_kernel}" ] &&
  [ ! -L "/lib/modules/${installed_kernel}" ]; then
  if ln -s "/lib/modules/${updated_kernel}" "/lib/modules/${installed_kernel}"; then
    compat_link="/lib/modules/${installed_kernel}"
  fi
fi

dnf -y install nvidia_peer_memory
dnf -y install nv_rsync_mem
dnf -y install nvidia_gdrcopy_kernel

# Remove only the temporary link created above, never a real directory.
if [ -n "${compat_link}" ] && [ -L "${compat_link}" ]; then
  rm -f "${compat_link}"
fi

# Enable the NVIDIA services, inside the image root when one is given.
if [ -n "${IMG_ROOTIMGDIR}" ]; then
  CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
fi
# CHROOTCMD is deliberately unquoted: it must split into "chroot" plus its
# argument, or expand to nothing when it is not set.
$CHROOTCMD /bin/bash -c "systemctl enable nvidia-persistenced"
$CHROOTCMD /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service"

# Put the real uname back in place of the stub installed above.
if [ -e "/bin/uname.xcat" ]; then
  mv "/bin/uname.xcat" "/bin/uname"
fi

View File

@ -0,0 +1,66 @@
cuda setup scripts
==================
This section documents the installation of NVIDIA CUDA Toolkit v11 on a POWER9 system running rhels8.1.
Diskful images
--------------
The following ``cudafull`` osimage definition is created from the base ``rhels8.1-ppc64le-install-compute`` osimage. ::
# lsdef -t osimage rhels8.1.0-ppc64le-install-cudafull
Object name: rhels8.1.0-ppc64le-install-cudafull
imagetype=linux
osarch=ppc64le
osdistroname=rhels8.1.0-ppc64le
osname=Linux
osvers=rhels8.1.0
otherpkgdir=/install/post/otherpkgs/rhels8.1.0/ppc64le
pkgdir=/install/rhels8.1.0/ppc64le,/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le,/install/cuda11/ppc64le/cuda-deps
pkglist=/opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist
postscripts=custom.ps/nvidia/postinstall/cuda11_power9_setup
profile=compute
provmethod=install
template=/opt/xcat/share/xcat/install/rh/compute.rhels8.tmpl
Postscripts
^^^^^^^^^^^
xCAT provides the ``cuda_power9_setup`` postscript to perform the additional configuration needed to install the NVIDIA POWER9 CUDA driver. With ``CUDA11``, installing the nvidia-driver modules from kickstart does not work. To work around this problem, xCAT provides another postscript, ``cuda11_power9_setup``: the CUDA packages are installed by this postscript instead of from the package list. This applies only to diskful installations.
CUDA dependencies
^^^^^^^^^^^^^^^^^
``dkms`` and ``opencl-filesystem`` are required for CUDA11. The ``dkms`` package can be found in the rhel8 EPEL repository. The ``opencl-filesystem`` package can be found in the rhel8 CodeReady repository. ::
# ls -ltr /install/cuda11/ppc64le/cuda-deps
-rw-r--r-- 1 root root 82332 Jun 16 10:25 dkms-2.8.1-4.20200214git5ca628c.el8.noarch.rpm
-rw-r--r-- 1 root root 90148 Jun 16 10:29 opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm
-rw-r--r-- 1 root root 8668 Jun 16 10:29 opencl-filesystem-1.0-6.el8.noarch.rpm
drwxr-xr-x 2 root root 4096 Jun 16 15:10 repodata
CUDA Packages
^^^^^^^^^^^^^
``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for the above osimage and is distributed in the ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le`` directory.
Besides the rhels8 base package list, the following packages also need to be added. ::
# diff /opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist /opt/xcat/share/xcat/install/rh/compute.rhels8.pkglist
12,27d11
< libtirpc
< yum
< rpm
< dnf
< dracut
< dracut-network
< e2fsprogs
< #For Cuda
< kernel-devel
< kernel-headers
< make
< gcc
< pciutils
< dkms
< opencl-filesystem
NOTE: The two scripts in this directory were verified with the HPC service stack software.

View File

@ -0,0 +1,96 @@
#!/bin/bash
#
# Copyright (C) 2018 International Business Machines
# Eclipse Public License, Version 1.0 (EPL-1.0)
# <http://www.eclipse.org/legal/epl-v10.html>
#
# 2018-03-21 GONG Jie <gongjie@linux.vnet.ibm.com>
# 2018-04-24 Matt Ezell <ezellma@ornl.gov>
#
# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
# on RHEL 7. Please refer document below for details.
#
# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
#
umask 0022

# Enable the NVIDIA services. When IMG_ROOTIMGDIR is set the command is run
# through chroot inside that image root; otherwise it runs on this system.
for nvidia_unit in nvidia-persistenced nvidia_gdrcopy_kernel.service; do
  if [ -n "${IMG_ROOTIMGDIR}" ]; then
    CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
  fi
  # CHROOTCMD stays unquoted on purpose: it has to split into "chroot" and
  # its argument, or vanish entirely when it is not set.
  $CHROOTCMD /bin/bash -c "systemctl enable ${nvidia_unit}"
done
# Disable the udev rule, installed by default in some Linux distributions, that
# causes hot-pluggable memory to be onlined automatically when it is probed.
#
# Overrides for /lib/udev rules belong in /etc/udev, so every edit below is
# applied to a copy of the stock rules file placed there. IMG_ROOTIMGDIR is
# prepended to both paths; when it is empty the running system is modified.
UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"

# Seed the override from /lib/udev only when /etc/udev has no copy yet.
if [ ! -e "${UDEV_REDHAT_TARGET}" ]; then
  cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
fi

if [ -f "${UDEV_REDHAT_TARGET}" ]; then
  # RHELS 7.5 ALT: comment out the SUBSYSTEM=="memory" auto-onlining rule.
  # Already-commented lines no longer match, so re-running is harmless.
  sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"

  # RHELS 7.6 ALT: the rules file is recognised by the presence of both the
  # "Memory hotadd request" section and the memory_hotplug_end label.
  if grep -qs 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" &&
    grep -qs 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}"; then
    echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
    # Only run when the section has not already been commented out
    # (to handle multiple genimage calls).
    if ! grep -qs "## Memory hotadd request" "${UDEV_REDHAT_TARGET}"; then
      # Kept for reference: comment out the whole memory hotadd request.
      #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
      # RH76 CUDA doc recommends the following. Only the override file is
      # edited; no other file may be passed to sed here.
      sed -i 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' "${UDEV_REDHAT_TARGET}"
      sed -i 's/^ACTION!="add"/ACTION=="*"/' "${UDEV_REDHAT_TARGET}"
    fi
  fi

  # Show what was changed relative to the stock rules file.
  echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
  diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
else
  echo "WARNING: ${UDEV_REDHAT_TARGET} is missing; udev memory hotplug rules were not modified" >&2
fi
# Set the NVIDIA module parameters in both /usr/lib/modprobe.d and
# /etc/modprobe.d (below IMG_ROOTIMGDIR when that is set).
#   gpusupport.conf - rewritten on every run with the nvidia module options.
#   nvidia.conf     - appended to, and only when the setting is not there yet,
#                     so repeated runs do not duplicate lines.
echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
for modprobe_dir in "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d" "${IMG_ROOTIMGDIR}/etc/modprobe.d"; do
  echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >"${modprobe_dir}/gpusupport.conf"

  # -q/-s: only the exit status matters; a missing nvidia.conf simply means
  # the setting has to be added.
  if ! grep -qs nouveau "${modprobe_dir}/nvidia.conf"; then
    echo 'blacklist nouveau' >>"${modprobe_dir}/nvidia.conf"
  fi

  # This is for nvprof (per George Chochia)
  if ! grep -qs NVreg_RestrictProfilingToAdminUsers "${modprobe_dir}/nvidia.conf"; then
    echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >>"${modprobe_dir}/nvidia.conf"
  fi
done
# Without an image root (IMG_ROOTIMGDIR empty) rebuild the initramfs of the
# newest kernel listed in /lib/modules (version sort, last entry). Nothing is
# done when an image root is set.
[ -n "${IMG_ROOTIMGDIR}" ] || {
  kernel_version="$(ls /lib/modules | sort -V | tail -n 1)"
  mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
}