#!/bin/bash
#
# nvidia-kdump-config
#
# Enable or disable kdump on DGX systems.
#
# Usage: nvidia-kdump-config enable-dmesg-dump|enable-vmcore-dump|disable
#
#   enable-dmesg-dump   enable kdump, skip the vmcore (KDUMP_SKIP_VMCORE=1)
#   enable-vmcore-dump  enable kdump, dump the vmcore (KDUMP_SKIP_VMCORE=0)
#   disable             disable kdump and unload the crash kernel
#
# Files written: the grub drop-in ${CONF_FILE}, ${KDUMP_TOOLS_DEFAULTS} and
# ${SYSCTL_CONF}. Exit status is non-zero when the argument is not recognized.

CONF_FILE="/etc/default/grub.d/kdump-tools.cfg"
KDUMP_TOOLS_DEFAULTS="/etc/default/kdump-tools"
SYSCTL_CONF="/etc/sysctl.d/90-dgx-crashdump.conf"
PLAT_FUNCS="/usr/local/sbin/nv_scripts/plat_funcs.bash"
GENERAL_FUNCS="/usr/local/sbin/nv_scripts/general_funcs.bash"

#######################################
# Locate the block device underneath a device-mapper root filesystem.
# The source of "/" must be a symlink (as /dev/mapper/* entries are); its
# target's basename is looked up in /sys/class/block/*/holders/.
# Arguments: none
# Outputs:   "/dev/<name>" of the (last) block device holding the root
#            device-mapper node, on stdout
# Returns:   0 when a device was found, 1 otherwise
#######################################
find_crypt_root() {
  local root_src dm_link dm_name sys_dev
  local phys_root=""

  root_src=$(findmnt -fn -o SOURCE /)

  # readlink fails when the root source is not a symlink (or is empty).
  dm_link=$(readlink -- "${root_src}") || return 1
  [[ -n "${dm_link}" ]] || return 1
  dm_name=$(basename -- "${dm_link}")

  # Glob instead of parsing ls; an unmatched glob simply fails the -e test.
  for sys_dev in /sys/class/block/*; do
    if [[ -e "${sys_dev}/holders/${dm_name}" ]]; then
      phys_root="/dev/${sys_dev##*/}"
    fi
  done

  [[ -n "${phys_root}" ]] || return 1
  printf '%s\n' "${phys_root}"
  return 0
}

#######################################
# Add 864 to the size of a crashkernel= value when that size is below 2048.
# The size is the text between the first ':' (if any) and the last 'M'.
# The value is rebuilt positionally, so only the size field is changed even
# when the same digits also appear in the range part (e.g. "512M-2G:512M").
# Values whose size field is not a plain decimal number (e.g. "1G-:2G") are
# returned unchanged.
# Arguments: $1 - crashkernel value, e.g. "1G-:1024M" or "1024M,high"
# Outputs:   the (possibly adjusted) value on stdout
# Returns:   0
#######################################
bump_crashkernel_for_crypt() {
  local memresv=$1
  local head tail size

  head=${memresv%M*}            # everything before the last 'M'
  tail=${memresv:${#head}}      # the last 'M' and whatever follows it
  size=${head#*:}               # drop a leading "<range>:" if present

  if [[ "${size}" =~ ^[0-9]+$ ]] && (( 10#${size} < 2048 )); then
    # 10# forces base 10 so a zero-padded size is not read as octal.
    printf '%s%s%s\n' "${head%"${size}"}" "$(( 10#${size} + 864 ))" "${tail}"
  else
    printf '%s\n' "${memresv}"
  fi
  return 0
}

#######################################
# Enable kdump in the requested mode and configure grub and sysctl for it.
# Arguments: $1 - "enable-dmesg-dump" or "enable-vmcore-dump"
# Outputs:   progress messages on stdout
#######################################
enable_kdump() {
  local mode=$1
  local memresv arch

  # Platform helpers; they provide skip_platform_detection and
  # plat_get_crashdump_mem. If they cannot be loaded, the calls below fail
  # and the architecture default reservation is used.
  . "${GENERAL_FUNCS}"
  . "${PLAT_FUNCS}"

  if skip_platform_detection; then
    memresv=""
  else
    memresv=$(plat_get_crashdump_mem)
  fi

  arch=$(uname -m)

  # Fall back to the ARM or X86 default when platform detection was skipped
  # or plat_get_crashdump_mem returned an empty string.
  if [[ -z "${memresv}" ]]; then
    if [[ "${arch}" == "aarch64" ]]; then
      memresv="2048M,high"
    else
      memresv="1G-:2048M"
    fi
  fi

  # On X86, increase the reservation (+864M) when a crypt root is in use and
  # the reserved crashkernel size is less than 2048M.
  if [[ "${arch}" != "aarch64" ]] && find_crypt_root > /dev/null; then
    memresv=$(bump_crashkernel_for_crypt "${memresv}")
  fi

  case "${mode}" in
    enable-dmesg-dump)
      echo "Enable DMESG dump. VMCORE will not be dumped."
      sed -i 's/USE_KDUMP=0/USE_KDUMP=1/g' "${KDUMP_TOOLS_DEFAULTS}"
      sed -i 's/KDUMP_SKIP_VMCORE=0/KDUMP_SKIP_VMCORE=1/g' "${KDUMP_TOOLS_DEFAULTS}"
      ;;
    enable-vmcore-dump)
      echo "Enable VMCORE dump."
      sed -i 's/USE_KDUMP=0/USE_KDUMP=1/g' "${KDUMP_TOOLS_DEFAULTS}"
      sed -i 's/KDUMP_SKIP_VMCORE=1/KDUMP_SKIP_VMCORE=0/g' "${KDUMP_TOOLS_DEFAULTS}"
      ;;
  esac

  # $GRUB_CMDLINE_LINUX must stay literal in the grub drop-in.
  printf 'GRUB_CMDLINE_LINUX="$GRUB_CMDLINE_LINUX crashkernel=%s"\n' \
    "${memresv}" > "${CONF_FILE}"

  printf '%s\n' \
    "kernel.panic_on_unrecovered_nmi=1" \
    "kernel.unknown_nmi_panic=1" \
    "kernel.hardlockup_panic=1" \
    "kernel.panic_on_io_nmi=1" \
    "kernel.softlockup_panic=1" \
    "kernel.panic_on_oops=1" \
    "kernel.hung_task_panic=1" \
    "kernel.panic_on_rcu_stall=1" \
    "kernel.panic=30" > "${SYSCTL_CONF}"

  update-grub
  echo "System must be rebooted for changes to take effect!"
}

#######################################
# Disable kdump: turn it off in kdump-tools, zero the crashkernel
# reservation, remove the sysctl drop-in and unload the crash kernel.
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
disable_kdump() {
  echo "Disable kdump"
  sed -i 's/USE_KDUMP=1/USE_KDUMP=0/g' "${KDUMP_TOOLS_DEFAULTS}"
  echo 'GRUB_CMDLINE_LINUX="$GRUB_CMDLINE_LINUX crashkernel=1G-:0M"' > "${CONF_FILE}"
  rm -f -- "${SYSCTL_CONF}"
  kdump-config unload
  update-grub
  echo "System must be rebooted for changes to take effect!"
}

#######################################
# Entry point: dispatch on the first command-line argument.
# Arguments: $1 - enable-dmesg-dump | enable-vmcore-dump | disable
# Returns:   1 (with a usage message on stderr) for any other argument
#######################################
main() {
  local arg=$1

  case "${arg}" in
    enable-dmesg-dump | enable-vmcore-dump)
      enable_kdump "${arg}"
      ;;
    disable)
      disable_kdump
      ;;
    *)
      echo "expected nvidia-kdump-config enable-dmesg-dump|enable-vmcore-dump|disable" >&2
      return 1
      ;;
  esac
}

main "$@"