From dbe555769769e8f11efaf912cbe60ec747f2abe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=E1=B4=8F=C9=B4=C9=A2=20Jie?= Date: Thu, 12 Apr 2018 17:19:28 +0800 Subject: [PATCH] Change CUDA 9.2 installation document and example setup script for POWER 9 (#5074) * Change cuda installation document for POWER 9 * Update the example script for CUDA setup on POWER 9 * Change CUDA installation document, ask user to use postscripts instead of postbootscripts * Tweak the nvidia_patch.conf * Redo initrd image for both diskless and diskful compute nodes * Fix typo --- .../advanced/gpu/nvidia/osimage/rhels.rst | 24 ++++++------ xCAT/postscripts/cuda_power9_setup | 37 +++++++++++++++++++ 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/docs/source/advanced/gpu/nvidia/osimage/rhels.rst b/docs/source/advanced/gpu/nvidia/osimage/rhels.rst index 77dee5f3f..a810fddee 100644 --- a/docs/source/advanced/gpu/nvidia/osimage/rhels.rst +++ b/docs/source/advanced/gpu/nvidia/osimage/rhels.rst @@ -189,24 +189,24 @@ xCAT includes a script, ``cuda_power9_setup`` as example, to help user handle th Diskful osimage ^^^^^^^^^^^^^^^ -For diskful deployment, there is no need to change the osimage definition. Instead, add this postscript to your compute node postbootscrtips list. +For diskful deployment, there is no need to change the osimage definition. Instead, add this postscript to your compute node postscripts list. :: - chdef p9compute -p postbootscripts=cuda_power9_setup + chdef p9compute -p postscripts=cuda_power9_setup -Disless osimage -^^^^^^^^^^^^^^^ +Diskless osimage +^^^^^^^^^^^^^^^^ -For diskless deployment, the script need to add to the postinstall script of the osimage. And it should be run in the chroot environment. Please refer the following commands as an example. +For diskless deployment, the script needs to be added to the postinstall script of the osimage. It should be run in the chroot environment. Please refer to the following commands as an example. 
:: - mkdir -p /install/custom/netboot - cp /opt/xcat/share/xcat/netboot/rh/compute.rhels7.ppc64le.postinstall /opt/xcat/share/xcat/netboot/rh/compute.rhels7.ppc64le.postinstall + mkdir -p /install/custom/netboot/rh + cp /opt/xcat/share/xcat/netboot/rh/compute.rhels7.ppc64le.postinstall /install/custom/netboot/rh/cudafull.rhels7.ppc64le.postinstall - cat >>/install/custom/netboot/rh/cudafull.rhels7.ppc64le.postinstall <-EOF + cat >>/install/custom/netboot/rh/cudafull.rhels7.ppc64le.postinstall <<-EOF - cp /install/postscripts/cuda_power9_setup /install/netboot/rhels7.5/ppc64le/compute/rootimg/tmp/cuda_power9_setup" - chroot /install/netboot/rhels7.5/ppc64le/compute/rootimg" /tmp/cuda_power9_setup + cp /install/postscripts/cuda_power9_setup \$installroot/tmp/cuda_power9_setup + chroot \$installroot /tmp/cuda_power9_setup - rm -f /install/netboot/rhels7.5/ppc64le/compute/rootimg/tmp/cuda_power9_setup + rm -f \$installroot/tmp/cuda_power9_setup EOF - chdef -t osimage rhels7.5-ppc64le-netboot-cudafull postinstall=/opt/xcat/share/xcat/netboot/rh/compute.rhels7.ppc64le.postinstall + chdef -t osimage rhels7.5-ppc64le-netboot-cudafull postinstall=/install/custom/netboot/rh/cudafull.rhels7.ppc64le.postinstall diff --git a/xCAT/postscripts/cuda_power9_setup b/xCAT/postscripts/cuda_power9_setup index d2120f465..caf0cddb5 100755 --- a/xCAT/postscripts/cuda_power9_setup +++ b/xCAT/postscripts/cuda_power9_setup @@ -40,3 +40,40 @@ chmod 0644 /usr/lib/systemd/system/nvidia-persistenced.service systemctl enable nvidia-persistenced rm -f /lib/udev/rules.d/40-redhat.rules + +# +# Extra steps for passing parameters to kernel module nvidia +# +mkdir -p /usr/lib/dracut/modules.d/95nvidia +cat >/usr/lib/dracut/modules.d/95nvidia/module-setup.sh <\$initdir/etc/modprobe.d/nvidia.conf +} +EOF + +chmod 0755 /usr/lib/dracut/modules.d/95nvidia/module-setup.sh +echo 'add_dracutmodules+=" nvidia "' >/etc/dracut.conf.d/nvidia_patch.conf + +# Patch for the diskfull environment, or in case kernel 
module nvidia needs to be reloaded +echo 'options nvidia NVreg_RegistryDwords="RMNumaOnlining=0x1;RMNvLinkSpeedControl=0x9;PeerMappingOverride=1"' >/etc/modprobe.d/nvidia.conf + +# Redo the initrd image +kernel_version="$(for d in $(ls /lib/modules | sort -V) ; do : ; done && echo $d)" +mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}"