#!/bin/bash
#
# Writes a systemd drop-in containing DeviceAllow= entries so that the
# docker/containerd service is only allowed the NVIDIA GPUs that lspci reports
# as "3D controller" class devices. Must be run as root.
#

#
# Unused functions
#

# Runs inotifywait on /dev/nvidia-caps for create, move and delete events.
# Returns: the exit status of inotifywait.
notif_on_change() {
  local nvidia_caps_path="/dev/nvidia-caps"
  inotifywait --event create,move,delete "${nvidia_caps_path}"
}

# Regenerates the override file and restarts docker. A failure of either step
# is reported on stdout and does not prevent the other step from running.
exec_on_change() {
  echo "Updating containerd override and restart docker daemon"
  create_containerd_override || echo "Failed to create containerd override"
  systemctl restart docker || echo "Failed to restart docker"
}
#
# End unused functions
#

#######################################
# Builds the DeviceAllow= list in the systemd drop-in file.
#   * RHEL 7 / CentOS 7: edits docker.service.d/docker-override.conf in place
#     (existing DeviceAllow lines are dropped, other lines are kept).
#   * Everything else: recreates containerd.service.d/containerd-override.conf.
# Every /dev character device whose path contains "nvidia" (except those under
# nvidia-caps) is allowed, plus the char-nvidia-caps device group; entries for
# GPUs that are not of class "3D controller" are then removed again.
# Globals:   variables defined by /etc/os-release (ID, VERSION, ...) are set
#            as a side effect of sourcing that file.
# Arguments: none
# Outputs:   progress/error messages on stdout, diagnostics on stderr
# Returns:   0 on success; 1 if nvidia-smi is missing, no 3D controller minor
#            number could be determined, or the drop-in could not be created.
#######################################
create_containerd_override() {
  local bdf minor number dev
  local compute_minors=""
  local conf_dir conf_file
  local -a compute_bdfs=() non_compute_minors=() nvidia_devs=()

  # If nvidia-smi is not found raise err and exit
  if [[ ! -f /usr/bin/nvidia-smi ]]; then
    echo "nvidia-smi not found. Returning error."
    return 1
  fi

  # Retrieve all 3D controller class NVIDIA GPUs (PCI bus:device.function ids)
  mapfile -t compute_bdfs < <(lspci | grep "3D controller" | grep -i NVIDIA \
    | cut -d' ' -f1)

  # Collect their minor numbers in memory, one per line. Fixed, predictable
  # file names under /tmp are avoided on purpose: this script runs as root.
  for bdf in "${compute_bdfs[@]}"; do
    [[ -n "${bdf}" ]] || continue
    minor=$(nvidia-smi -q -i "${bdf}" | grep "Minor Number" | awk '{print $4}')
    if [[ -n "${minor}" ]]; then
      compute_minors+="${minor}"$'\n'
    fi
  done

  # Nothing to keep: bail out before touching any configuration file.
  if [[ -z "${compute_minors}" ]]; then
    echo "No minor number found for any 3D controller class NVIDIA GPU" >&2
    return 1
  fi

  # Get Minor Numbers of the Nvidia GPUs which are NOT of Class 3D controller:
  # lines present only in the "all GPUs" list (comm -13 needs sorted input).
  mapfile -t non_compute_minors < <(comm -13 \
    <(printf '%s' "${compute_minors}" | sort) \
    <(nvidia-smi -q | grep Minor | awk '{print $4}' | sort))

  # Determine OS type
  . /etc/os-release
  if [[ (${ID} == rhel || ${ID} == centos) && ${VERSION} == 7* ]]; then
    # For RHEL 7 and CentOS 7 modify docker-override.conf
    conf_dir="/etc/systemd/system/docker.service.d"
    conf_file="${conf_dir}/docker-override.conf"

    # Make sure the file exists
    if [[ ! -f "${conf_file}" ]]; then
      mkdir -p "${conf_dir}" > /dev/null 2>&1
      echo "[Service]" > "${conf_file}"
    fi

    # Remove any existing DeviceAllow lines
    sed -i "/DeviceAllow/ d" "${conf_file}"
  else
    # For everything else remove old containerd-override.conf and create a
    # new one
    conf_dir="/etc/systemd/system/containerd.service.d"
    conf_file="${conf_dir}/containerd-override.conf"

    # Remove any existing containerd-override.conf
    rm -f "${conf_file}" > /dev/null 2>&1
    mkdir -p "${conf_dir}" > /dev/null 2>&1
    echo "[Service]" > "${conf_file}"
  fi

  # Do not report success below if the drop-in could not be created.
  if [[ ! -f "${conf_file}" ]]; then
    echo "Failed to create ${conf_file}" >&2
    return 1
  fi

  # Add all /dev/nvidia* as DeviceAllow
  mapfile -t nvidia_devs < <(find /dev/ -type c | grep nvidia \
    | grep -v nvidia-caps | sort)
  for dev in "${nvidia_devs[@]}"; do
    [[ -n "${dev}" ]] || continue
    sed -i "\$ aDeviceAllow=${dev}" "${conf_file}"
  done

  # Add all nvidia-caps devices
  sed -i "\$ aDeviceAllow=char-nvidia-caps" "${conf_file}"

  # Remove GPUs which are NOT of Class '3D Controller'
  for number in "${non_compute_minors[@]}"; do
    # Only plain integers may be interpolated into the sed address: an empty
    # value would turn the pattern into \bnvidia\b, and a value such as "N/A"
    # would break the sed expression.
    [[ "${number}" =~ ^[0-9]+$ ]] || continue
    sed -i "/\bnvidia${number}\b/ d" "${conf_file}"
  done

  echo "Successfully configured nvidia docker to only use Compute GPUs"
  systemctl daemon-reload

  return 0
}

### MAIN ###
if [[ ${EUID} -ne 0 ]]; then
  echo "Requires sudo to run"
  exit 1
fi

plat_funcs="/usr/local/sbin/nv_scripts/plat_funcs.bash"
. "${plat_funcs}"

# NOTE(review): prodname is not read anywhere in this script; presumably the
# helpers from plat_funcs.bash consume it - confirm before removing.
prodname=$(get_system_product_name)
if plat_needs_containerd_override; then
  # Create the override file initially
  create_containerd_override

  # [bug 200744004]: No need to check on /dev/nvidia-caps anymore. We are
  # already allowing all such devices by using "DeviceAllow=char-nvidia-caps"
  # Main loop to look for changes in GPU topology
  #while notif_on_change; do
  #  exec_on_change
  #done
fi

exit 0