mirror of
https://github.com/xcat2/xcat-core.git
synced 2025-05-31 10:06:39 +00:00
Add Cuda11 setup scripts
This commit is contained in:
parent
a93c7c9dcb
commit
05f4119eae
154
xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
Executable file
154
xCAT-server/share/xcat/samples/cuda11/cuda11_power9_setup
Executable file
@ -0,0 +1,154 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (C) 2018 International Business Machines
|
||||
# Eclipse Public License, Version 1.0 (EPL-1.0)
|
||||
# <http://www.eclipse.org/legal/epl-v10.html>
|
||||
#
|
||||
# 2018-03-21 GONG Jie <gongjie@linux.vnet.ibm.com>
|
||||
# 2018-04-24 Matt Ezell <ezellma@ornl.gov>
|
||||
#
|
||||
# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
|
||||
# on RHEL 8. Please refer to the document below for details.
|
||||
#
|
||||
# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
|
||||
#
|
||||
|
||||
umask 0022
|
||||
|
||||
# Disable a udev rule installed by default in some Linux distributions that
# causes hot-pluggable memory to be automatically onlined when it is
# physically probed.
#
# Overrides for /lib/udev rules belong in /etc/udev, so the distribution
# file is copied there once and only the copy is edited.
UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"

# If the file does not exist in /etc/udev, copy it from /lib/udev
if [[ ! -e "${UDEV_REDHAT_TARGET}" ]]; then
  cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
fi

# Disable udev memory auto-onlining rule for cuda10.x
#
# For RHELS 7.5 ALT: comment out every rule line that starts with
# SUBSYSTEM=="memory".
sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"

# For RHELS 7.6 ALT: the rules file is recognised by containing both the
# "Memory hotadd request" text and the memory_hotplug_end label. A grep
# error (for example a missing file) counts as "not detected".
if grep -q 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null \
  && grep -q 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
  echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
  # Only edit when the block has not already been commented out
  # (to handle multiple genimage calls).
  if ! grep -q '## Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
    # Earlier approach, kept for reference:
    #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
    # RH76 CUDA doc recommends the following instead. Both substitutions
    # are applied to the override file only; the original second sed also
    # named /tmp/40-redhat.rules, a leftover scratch path that must not be
    # edited.
    sed -i \
      -e 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' \
      -e 's/^ACTION!="add"/ACTION=="*"/' \
      "${UDEV_REDHAT_TARGET}"
  fi
fi

# Show what was changed relative to the distribution file.
echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
|
||||
|
||||
# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d

echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
for modprobe_dir in \
  "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d" \
  "${IMG_ROOTIMGDIR}/etc/modprobe.d"; do
  # gpusupport.conf is rewritten from scratch on every run.
  echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' \
    >"${modprobe_dir}/gpusupport.conf"

  # nvidia.conf is only appended to, and only when the setting is not
  # mentioned yet, so repeated runs do not duplicate lines. A missing
  # file is treated as "not mentioned" and gets created by the append.
  if ! grep -q nouveau "${modprobe_dir}/nvidia.conf" 2>/dev/null; then
    echo 'blacklist nouveau' >>"${modprobe_dir}/nvidia.conf"
  fi

  # This is for nvprof (per George Chochia)
  if ! grep -q NVreg_RestrictProfilingToAdminUsers "${modprobe_dir}/nvidia.conf" 2>/dev/null; then
    echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >>"${modprobe_dir}/nvidia.conf"
  fi
done
|
||||
|
||||
# For the CUDA / NVIDIA installation: drop any inherited ARCH value and
# record the kernel release reported by the real uname. This has to happen
# before /bin/uname is replaced by the wrapper.
unset ARCH
installed_kernel="$(uname -r)"
|
||||
|
||||
#######################################
# Replace <root>/bin/uname with a small wrapper script and keep the real
# binary as <root>/bin/uname.xcat.
#
# The wrapper answers:
#   -m        architecture of the kernel RPM matching its own "-r" answer
#   -r        newest entry of /lib/modules (version sort), looked up each
#             time the wrapper runs
#   -s / none "Linux"
# and always exits 0.
#
# Arguments: $1 - root directory prefix; empty or omitted means "/"
# Returns:   0 on success or when a backup already exists,
#            1 if the real uname could not be moved or the wrapper
#            could not be written
#######################################
function hack_uname()
{
  local root="${1:-}"
  local target="${root}/bin/uname"
  local backup="${root}/bin/uname.xcat"

  # A backup from an earlier call is the only copy of the real binary;
  # moving the wrapper over it would lose uname for good.
  if [[ -e "${backup}" ]]; then
    echo "hack_uname: ${backup} already exists, leaving ${target} untouched" >&2
    return 0
  fi

  mv -- "${target}" "${backup}" || return 1

  # Quoted delimiter: nothing is expanded while writing, so every
  # expression below is evaluated when the wrapper itself is executed.
  if ! cat >"${target}" <<'EOF'
#!/bin/sh
case "$1" in
"-m")
    ARCH="$(rpm -q kernel-$("$0" -r) --qf '%{arch}' 2>/dev/null)"
    echo "$ARCH"
    ;;
"-r")
    kernel_version="$(ls /lib/modules 2>/dev/null | sort -V | tail -n 1)"
    echo "$kernel_version"
    ;;
"-s"|"")
    echo "Linux"
    ;;
esac
exit 0
EOF
  then
    # Put the real binary back rather than leaving no uname at all.
    mv -- "${backup}" "${target}"
    return 1
  fi

  chmod 0755 "${target}"
}
|
||||
|
||||
|
||||
# Rebuild the initramfs only when IMG_ROOTIMGDIR is empty, i.e. when the
# script works on the running system rather than on an image root. The
# newest entry of /lib/modules (version sort) is the kernel it is built for.
if [[ -z "${IMG_ROOTIMGDIR}" ]]; then
  kernel_version="$(ls /lib/modules | sort -V | tail -n 1)"
  if [[ -n "${kernel_version}" ]]; then
    mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
  else
    echo "No kernel found under /lib/modules, skipping mkinitrd" >&2
  fi
fi

# Wrap /bin/uname for the duration of the package installation. The
# original if/else passed an empty argument in one branch and no argument
# in the other, so the wrapper was always installed at /bin/uname; the dnf
# commands below are likewise run without chroot.
hack_uname ""

# With the wrapper in place this is the newest kernel under /lib/modules.
updated_kernel="$(uname -r)"

dnf -y module install nvidia-driver:latest-dkms
dnf -y install cuda

# depmod for nvidia_rsync_mem uses the installed kernel, not the updated
# kernel, so give it a temporary /lib/modules/<installed> alias. The alias
# is only created when that path does not exist yet, and only an alias
# created here is removed afterwards, so a real module directory is never
# deleted.
kernel_compat_link=""
if [[ -n "${installed_kernel}" && -n "${updated_kernel}" \
  && "${updated_kernel}" != "${installed_kernel}" \
  && ! -e "/lib/modules/${installed_kernel}" ]]; then
  if ln -s "/lib/modules/${updated_kernel}" "/lib/modules/${installed_kernel}"; then
    kernel_compat_link="/lib/modules/${installed_kernel}"
  fi
fi

dnf -y install nvidia_peer_memory
dnf -y install nv_rsync_mem
dnf -y install nvidia_gdrcopy_kernel

if [[ -n "${kernel_compat_link}" && -L "${kernel_compat_link}" ]]; then
  rm -f -- "${kernel_compat_link}"
fi

# Enable the NVIDIA services, inside the image root when one is given.
if [[ -n "${IMG_ROOTIMGDIR}" ]]; then
  CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
fi
for nvidia_service in nvidia-persistenced nvidia_gdrcopy_kernel.service; do
  # CHROOTCMD is left unquoted on purpose: it is either empty or the two
  # words "chroot <dir>".
  $CHROOTCMD /bin/bash -c "systemctl enable ${nvidia_service}"
done

# Put the real uname back. Both branches of the original if/else resolved
# to /bin/uname.xcat -> /bin/uname; restore only when the backup exists so
# a failed hack_uname does not produce a second error here.
if [[ -e /bin/uname.xcat ]]; then
  mv -- /bin/uname.xcat /bin/uname
fi
|
||||
|
66
xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
Normal file
66
xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme
Normal file
@ -0,0 +1,66 @@
|
||||
cuda setup scripts
|
||||
==================
|
||||
|
||||
This section documents the installation of NVIDIA CUDA Toolkit v11 on a POWER9 rhels8.1 system.
|
||||
|
||||
Diskful images
|
||||
--------------
|
||||
|
||||
The following ``cudafull`` osimage definition is created from the base ``rhels8.1.0-ppc64le-install-compute`` osimage. ::
|
||||
|
||||
# lsdef -t osimage rhels8.1.0-ppc64le-install-cudafull
|
||||
Object name: rhels8.1.0-ppc64le-install-cudafull
|
||||
imagetype=linux
|
||||
osarch=ppc64le
|
||||
osdistroname=rhels8.1.0-ppc64le
|
||||
osname=Linux
|
||||
osvers=rhels8.1.0
|
||||
otherpkgdir=/install/post/otherpkgs/rhels8.1.0/ppc64le
|
||||
pkgdir=/install/rhels8.1.0/ppc64le,/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le,/install/cuda11/ppc64le/cuda-deps
|
||||
pkglist=/opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist
|
||||
postscripts=custom.ps/nvidia/postinstall/cuda11_power9_setup
|
||||
profile=compute
|
||||
provmethod=install
|
||||
template=/opt/xcat/share/xcat/install/rh/compute.rhels8.tmpl
|
||||
|
||||
Postscripts
|
||||
^^^^^^^^^^^
|
||||
|
||||
xCAT provides the ``cuda_power9_setup`` postscript to apply the additional configuration needed to install the NVIDIA POWER9 CUDA driver. With ``CUDA11``, there is an issue installing the nvidia-driver modules from kickstart. To work around this problem, xCAT provides another postscript, ``cuda11_power9_setup``: the CUDA packages are installed by this postscript instead of from the package list. This applies only to diskful installation.
|
||||
|
||||
CUDA dependencies
^^^^^^^^^^^^^^^^^
|
||||
|
||||
``dkms`` and ``opencl-filesystem`` are required for CUDA 11. The ``dkms`` package can be found in the rhel8 EPEL repository. The ``opencl-filesystem`` package can be found in the rhel8 CodeReady repository. ::
|
||||
|
||||
# ls -ltr /install/cuda11/ppc64le/cuda-deps
|
||||
-rw-r--r-- 1 root root 82332 Jun 16 10:25 dkms-2.8.1-4.20200214git5ca628c.el8.noarch.rpm
|
||||
-rw-r--r-- 1 root root 90148 Jun 16 10:29 opencl-headers-2.2-1.20180306gite986688.el8.noarch.rpm
|
||||
-rw-r--r-- 1 root root 8668 Jun 16 10:29 opencl-filesystem-1.0-6.el8.noarch.rpm
|
||||
drwxr-xr-x 2 root root 4096 Jun 16 15:10 repodata
|
||||
|
||||
CUDA Packages
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for the above osimage and is distributed in the ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le`` directory.
|
||||
In addition to the base rhels8 package list, the following packages need to be added. ::
|
||||
|
||||
# diff /opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist /opt/xcat/share/xcat/install/rh/compute.rhels8.pkglist
|
||||
12,27d11
|
||||
< libtirpc
|
||||
< yum
|
||||
< rpm
|
||||
< dnf
|
||||
< dracut
|
||||
< dracut-network
|
||||
< e2fsprogs
|
||||
< #For Cuda
|
||||
< kernel-devel
|
||||
< kernel-headers
|
||||
< make
|
||||
< gcc
|
||||
< pciutils
|
||||
< dkms
|
||||
< opencl-filesystem
|
||||
|
||||
NOTE: The two scripts in this directory were verified with the HPC service stack software.
|
96
xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup
Executable file
96
xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup
Executable file
@ -0,0 +1,96 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (C) 2018 International Business Machines
|
||||
# Eclipse Public License, Version 1.0 (EPL-1.0)
|
||||
# <http://www.eclipse.org/legal/epl-v10.html>
|
||||
#
|
||||
# 2018-03-21 GONG Jie <gongjie@linux.vnet.ibm.com>
|
||||
# 2018-04-24 Matt Ezell <ezellma@ornl.gov>
|
||||
#
|
||||
# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver
|
||||
# on RHEL 7. Please refer document below for details.
|
||||
#
|
||||
# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup
|
||||
#
|
||||
|
||||
umask 0022
|
||||
|
||||
# Enable the NVIDIA services. When IMG_ROOTIMGDIR is set, systemctl is run
# inside that image root through chroot; otherwise it runs directly.
if [[ -n "${IMG_ROOTIMGDIR}" ]]; then
  CHROOTCMD="chroot ${IMG_ROOTIMGDIR}"
fi
for nvidia_service in nvidia-persistenced nvidia_gdrcopy_kernel.service; do
  # CHROOTCMD is left unquoted on purpose: it is either empty or the two
  # words "chroot <dir>".
  $CHROOTCMD /bin/bash -c "systemctl enable ${nvidia_service}"
done
|
||||
|
||||
# Disable a udev rule installed by default in some Linux distributions that
# causes hot-pluggable memory to be automatically onlined when it is
# physically probed.
#
# Overrides for /lib/udev rules belong in /etc/udev, so the distribution
# file is copied there once and only the copy is edited.
UDEV_REDHAT_SOURCE="${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules"
UDEV_REDHAT_TARGET="${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules"

# If the file does not exist in /etc/udev, copy it from /lib/udev
if [[ ! -e "${UDEV_REDHAT_TARGET}" ]]; then
  cp -n "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
fi

# Disable udev memory auto-onlining rule for cuda10.x
#
# For RHELS 7.5 ALT: comment out every rule line that starts with
# SUBSYSTEM=="memory".
sed -i 's/^\(SUBSYSTEM=="memory".*\)/#\1/' "${UDEV_REDHAT_TARGET}"

# For RHELS 7.6 ALT: the rules file is recognised by containing both the
# "Memory hotadd request" text and the memory_hotplug_end label. A grep
# error (for example a missing file) counts as "not detected".
if grep -q 'Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null \
  && grep -q 'LABEL="memory_hotplug_end' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
  echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..."
  # Only edit when the block has not already been commented out
  # (to handle multiple genimage calls).
  if ! grep -q '## Memory hotadd request' "${UDEV_REDHAT_TARGET}" 2>/dev/null; then
    # Earlier approach, kept for reference:
    #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET}
    # RH76 CUDA doc recommends the following instead. Both substitutions
    # are applied to the override file only; the original second sed also
    # named /tmp/40-redhat.rules, a leftover scratch path that must not be
    # edited.
    sed -i \
      -e 's/^SUBSYSTEM!="memory"/SUBSYSTEM=="*"/' \
      -e 's/^ACTION!="add"/ACTION=="*"/' \
      "${UDEV_REDHAT_TARGET}"
  fi
fi

# Show what was changed relative to the distribution file.
echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}"
diff "${UDEV_REDHAT_SOURCE}" "${UDEV_REDHAT_TARGET}"
|
||||
|
||||
# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d

echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d"
for modprobe_dir in \
  "${IMG_ROOTIMGDIR}/usr/lib/modprobe.d" \
  "${IMG_ROOTIMGDIR}/etc/modprobe.d"; do
  # gpusupport.conf is rewritten from scratch on every run.
  echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' \
    >"${modprobe_dir}/gpusupport.conf"

  # nvidia.conf is only appended to, and only when the setting is not
  # mentioned yet, so repeated runs do not duplicate lines. A missing
  # file is treated as "not mentioned" and gets created by the append.
  if ! grep -q nouveau "${modprobe_dir}/nvidia.conf" 2>/dev/null; then
    echo 'blacklist nouveau' >>"${modprobe_dir}/nvidia.conf"
  fi

  # This is for nvprof (per George Chochia)
  if ! grep -q NVreg_RestrictProfilingToAdminUsers "${modprobe_dir}/nvidia.conf" 2>/dev/null; then
    echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >>"${modprobe_dir}/nvidia.conf"
  fi
done
|
||||
|
||||
# Rebuild the initramfs only when IMG_ROOTIMGDIR is empty, i.e. when the
# script works on the running system rather than on an image root. The
# newest entry of /lib/modules (version sort) is the kernel it is built for.
if [[ -z "${IMG_ROOTIMGDIR}" ]]; then
  kernel_version="$(ls /lib/modules | sort -V | tail -n 1)"
  if [[ -n "${kernel_version}" ]]; then
    mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"
  else
    echo "No kernel found under /lib/modules, skipping mkinitrd" >&2
  fi
fi
|
Loading…
x
Reference in New Issue
Block a user