在ubuntu和centos的docker环境下配置nvidia-container-runtime,可以查看之前的笔记:
centos7:https://sulao.cn/post/942.html
ubuntu:https://sulao.cn/post/919.html
containerd下配置nvidia-container-runtime可以查看这个笔记:https://sulao.cn/post/947.html
由于containerd下配置nvidia-container-runtime比较繁琐,所以写了脚本去自动修改
使用python修改config.toml文件首先需要安装toml模块
pip3 install toml
然后就可以使用toml模块的load和dump方法进行toml格式的数据处理了:load方法将toml格式的数据读取为字典,dump方法则反过来将字典转换为toml格式的数据。
以下是自动修改containerd的配置为nvidia-container-runtime的脚本。
#!/usr/bin/python3
# coding: utf-8
"""Point containerd at nvidia-container-runtime by rewriting its config.toml.

The script loads ``/etc/containerd/config.toml``, sets the legacy
``io.containerd.runtime.v1.linux`` runtime to ``nvidia-container-runtime``,
registers an ``nvidia`` runtime under the CRI plugin, makes it the default
runtime, backs up the original file next to it (timestamp suffix) and writes
the modified configuration back.
"""
import json
import logging
import os
import shutil
import subprocess
import sys
import time

try:
    import toml
except ImportError:
    # Defer the failure to main() so the user gets a readable hint instead of
    # a traceback, and so the pure-dict helpers stay importable without toml.
    toml = None

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(filename)s[line:%(lineno)d] %(message)s',
                    datefmt='%Y-%m-%d')

config_path = r"/etc/containerd"
config_name = "config.toml"
nvidia_toolkit = [
    "/usr/bin/nvidia-container-runtime",
    "/usr/bin/nvidia-container-runtime-hook",
    "/usr/bin/nvidia-container-cli",
]


def formmat_out_put(cmd, json_format=False):
    """Run *cmd* through the shell and return its combined output.

    Args:
        cmd: Shell command line handed to ``subprocess.getoutput``.
        json_format: When false (the default) the output is split into a list
            of lines. When true the raw output string is returned unchanged;
            it is NOT parsed as JSON here, the caller has to do that.

    Returns:
        A list of lines, or the raw output string if ``json_format`` is true.
    """
    output = subprocess.getoutput(cmd)
    if not json_format:
        output = output.split('\n')
    return output


def check_nvidia_runtime(nvidia_toolkit):
    """Exit the process if any of the given toolkit binaries is missing.

    Args:
        nvidia_toolkit: Iterable of filesystem paths that must exist.

    Raises:
        SystemExit: On the first path that does not exist.
    """
    for toolkit in nvidia_toolkit:
        if not os.path.exists(toolkit):
            sys.exit("不能找到 {} 命令, 请检查nvidia-container-runtime是否安装!".format(toolkit))


def modify_config(config):
    """Apply the nvidia-container-runtime settings to a parsed containerd config.

    The dictionary is modified in place and also returned. Missing
    intermediate tables (``plugins``, ``io.containerd.grpc.v1.cri``,
    ``containerd``, ``runtimes``) are created instead of raising ``KeyError``,
    so a minimal config.toml can be processed as well.

    Args:
        config: Dictionary as produced by ``toml.load`` for config.toml.

    Returns:
        The same ``config`` object, updated.
    """
    # Overriding sandbox_image (e.g. with a registry mirror such as
    # registry.cn-hangzhou.aliyuncs.com/google_containers/pause:3.8) is
    # intentionally left disabled; set
    # config["plugins"]["io.containerd.grpc.v1.cri"]["sandbox_image"] here if needed.
    plugins = config.setdefault("plugins", {})

    linux_key = "io.containerd.runtime.v1.linux"
    if linux_key in plugins:
        logging.info("找到io.containerd.runtime.v1.linux配置, 修改runtime为nvidia-container-runtime.")
        plugins[linux_key]["runtime"] = "nvidia-container-runtime"
    else:
        logging.warning("未找到io.containerd.runtime.v1.linux配置, 直接添加该配置...")
        plugins[linux_key] = {
            "no_shim": False,
            "runtime": "nvidia-container-runtime",
            "runtime_root": "",
            "shim": "containerd-shim",
            "shim_debug": False,
        }
    # The original message had no "{}" placeholder, so the dump was never shown.
    logging.debug("添加/修改的io.containerd.runtime.v1.linux配置为: {}".format(
        json.dumps(plugins[linux_key], indent=2)))

    logging.info("添加nvidia相关配置...")
    containerd = plugins.setdefault("io.containerd.grpc.v1.cri", {}).setdefault("containerd", {})
    runtimes = containerd.setdefault("runtimes", {})
    runtimes["nvidia"] = {
        "privileged_without_host_devices": False,
        "runtime_engine": "",
        "runtime_root": "",
        "runtime_type": "io.containerd.runc.v2",
    }

    logging.info("修改containerd默认运行时为nvidia.")
    containerd["default_runtime_name"] = "nvidia"
    runtimes["nvidia"]["options"] = {
        "BinaryName": "/usr/bin/nvidia-container-runtime",
        "SystemdCgroup": True,
    }
    logging.debug("添加的nvidia配置为: {}".format(json.dumps(runtimes["nvidia"], indent=2)))
    return config


def main():
    """Load, modify, back up and rewrite the containerd configuration file.

    Raises:
        SystemExit: If the toml package or the config file is missing, or if
            loading, modifying or writing back the configuration fails.
    """
    if toml is None:
        sys.exit("未安装toml模块, 请先执行 pip3 install toml")

    # The toolkit presence check is disabled by default; uncomment to enforce it.
    # check_nvidia_runtime(nvidia_toolkit)

    config_file = os.path.join(config_path, config_name)
    if not os.path.exists(config_file):
        sys.exit("不能找到container配置文件 {}".format(config_file))

    try:
        # TOML documents are UTF-8; do not depend on the locale's default encoding.
        with open(config_file, "r", encoding="utf-8") as f:
            config = toml.load(f)
        logging.debug("打印当前config配置: {}".format(config))
        new_config = modify_config(config)
        logging.debug("打印修改后的config配置: {}".format(new_config))
    except Exception as e:
        sys.exit("修改container配置文件失败, {}".format(e))

    logging.info("containerd配置修改完成, 开始备份{}文件...".format(config_name))
    # A failing backup raises here, i.e. before the original file is touched.
    backup_file = config_file + "_" + time.strftime("%Y%m%d%H%M%S", time.localtime())
    shutil.copyfile(config_file, backup_file)

    logging.info("将修改后的container配置写回 {} 文件".format(config_name))
    try:
        # Serialize first: opening with "w" truncates the file, so a
        # serialization error must surface before the file is opened.
        rendered = toml.dumps(new_config)
        with open(config_file, "w", encoding="utf-8") as f:
            f.write(rendered)
    except Exception as e:
        sys.exit("写回containerd配置失败, {}".format(e))
    logging.info("修改containerd配置操作完成!!!")


if __name__ == "__main__":
    main()