K8S는 기본적으로 GPU를 스케줄링할 수 없으며, GPU 공급업체에서 제공하는 디바이스 플러그인을 설치해야 스케줄링이 가능하다.
nvidia-container-toolkit Install
# Register NVIDIA's libnvidia-container RPM repository by writing its .repo
# file into /etc/yum.repos.d (tee runs under sudo so the write is privileged).
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
  | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
# Download nvidia-container-toolkit together with its dependencies (--resolve)
# into a local directory so the RPMs are kept on disk.
mkdir -p /root/files/packages/nvidia
dnf download -y --resolve --destdir=/root/files/packages/nvidia nvidia-container-toolkit
# Install only from the downloaded RPM files; --disablerepo=\* turns off every
# configured repository for this transaction.
dnf localinstall -y --disablerepo=\* /root/files/packages/nvidia/*.rpm
# Print the installed tool versions to confirm the installation.
nvidia-ctk --version
nvidia-container-toolkit -version
containerd default_runtime_name 변경
# Show the current containerd service status without invoking a pager.
systemctl status containerd --no-pager
$> nvidia-ctk runtime configure --runtime=containerd
INFO[0000] Using config version 2
INFO[0000] Using CRI runtime plugin name "io.containerd.grpc.v1.cri"
INFO[0000] Wrote updated config to /etc/containerd/config.toml
INFO[0000] It is recommended that containerd daemon be restarted.
# Switch containerd's default runtime from "runc" to "nvidia" by editing the
# config file in place.
# NOTE(review): `nvidia-ctk runtime configure` appears to offer a
# --set-as-default flag that may make this sed unnecessary - confirm against
# the installed toolkit version.
sudo sed -i 's/default_runtime_name = "runc"/default_runtime_name = "nvidia"/g' /etc/containerd/config.toml
# Show the setting to confirm the substitution took effect.
grep default_runtime_name /etc/containerd/config.toml
# Restart containerd
sudo systemctl restart containerd
# After applying the change, check containerd status - expect "Active: active (running)"
systemctl status containerd --no-pager
k8s-device-plugin 설치
# Work from a dedicated directory; the chart archive pulled below lands here.
mkdir -p /root/files/nvidia
cd /root/files/nvidia
# Register the k8s-device-plugin chart repository under the local name "nvdp".
helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
# Refresh the repository index and, if that succeeds, list configured repos.
helm repo update && helm repo list
# Search the configured repos for "nvdp", including development versions (--devel).
helm search repo nvdp --devel
# Download chart version 0.17.0 into the current directory.
helm pull nvdp/nvidia-device-plugin --version 0.17.0
# Optional: unpack the archive to inspect the chart contents.
# tar -xvf nvidia-device-plugin-0.17.0.tgz
# Install (or upgrade, if it already exists) the release "nvdp" from the local
# chart archive into the nvidia-device-plugin namespace, creating the namespace
# when missing. gfd.enabled=true sets the chart value of that name
# (presumably GPU Feature Discovery - confirm against the chart's values).
helm upgrade --install nvdp nvidia-device-plugin-0.17.0.tgz \
--namespace=nvidia-device-plugin \
--create-namespace \
--set gfd.enabled=true