Ubuntu 22.04 算力环境基础配置一键验证脚本

之前发布了一个算力环境基础配置的一键脚本,这里把自己写的环境验证脚本也一并发布出来,方便以后使用。

#!/bin/bash
# Verification script: the check functions defined below each report one
# finding about the host through the INFO / WARNING / ERROR helpers.
# NOTE(review): errexit is left commented out; presumably so that one failing
# probe does not abort the remaining checks — confirm before enabling it.
#set -e

# One-element array holding the string '10.7.0.12 blog'.
# NOTE(review): HOST_LIST is not referenced anywhere in the visible part of
# this script — confirm whether it is still needed.
HOST_LIST=('10.7.0.12 blog')

# Print an informational message on stdout, prefixed with a coloured "[I]" tag.
# Arguments: the message words; they are joined with "$*" and passed through
#            `echo -e`, so backslash escapes inside the message are expanded.
function INFO(){
    local -r tag='\e[104m\e[97m[I]\e[49m\e[39m'
    /bin/echo -e "${tag} ${*}"
}
# Print a warning on stderr, prefixed with a coloured "[W]" tag.
# Arguments: the message words; they are joined with "$*" and passed through
#            `echo -e`, so backslash escapes inside the message are expanded.
function WARNING(){
    local -r tag='\e[101m\e[97m[W]\e[49m\e[39m'
    /bin/echo -e "${tag} ${*}" >&2
}
# Print an error on stderr, prefixed with a coloured "[E]" tag.
# Only reports: it neither exits nor returns a failure status of its own.
# Arguments: the message words; they are joined with "$*" and passed through
#            `echo -e`, so backslash escapes inside the message are expanded.
function ERROR(){
    local -r tag='\e[101m\e[97m[E]\e[49m\e[39m'
    /bin/echo -e "${tag} ${*}" >&2
}
# Report the installed OpenSSH packages as space-separated "name:version" pairs.
# Changes from the previous version:
#   - the message said "OpenSSL" although the packages listed are OpenSSH;
#   - only the package-name column of `dpkg -l` is matched, so packages that
#     merely mention openssh in their description are no longer listed;
#   - the result is kept in a local variable instead of the global R.
# Outputs: one INFO line.
function CHECK_OPENSSH(){
	local packages
	packages=$(dpkg -l | awk '$2 ~ /openssh/ { printf "%s%s:%s", sep, $2, $3; sep = " " }')
	INFO "当前OpenSSH版本:${packages}"
}
# Check that sshd password authentication is turned off.
# Arguments: $1 - optional path of the sshd configuration file
#                 (default: /etc/ssh/sshd_config).
# Outputs:   INFO when the first PasswordAuthentication directive is "no",
#            ERROR otherwise (including when the directive is absent).
# Changes from the previous version:
#   - the "directive missing" branch tested ${R+x}, which is never empty once
#     R has been assigned, so it could not be reached; the value itself is
#     tested now;
#   - only the first matching directive is used, so repeated directives no
#     longer produce a multi-line value that never equals "no";
#   - the keyword is matched case-insensitively and with leading whitespace.
# NOTE(review): files pulled in through an Include directive (see the
# commented-out probe) are not inspected — confirm whether they should be.
function CHECK_SSHD(){
	local config="${1:-/etc/ssh/sshd_config}"
	local value
	#R=`cat /etc/ssh/sshd_config | grep -iE '^Include\s+/etc/ssh/sshd_config.d' | wc -l`
	value=$(awk 'tolower($1) == "passwordauthentication" { print $2; exit }' "${config}")
	if [[ -z "${value}" ]]; then
		ERROR "禁止密码登录未开启!"
	elif [[ "${value}" == "no" ]]; then
		INFO "禁止密码登录已开启!"
	else
		ERROR "禁止密码登录未开启,请使用命令关闭:sudo sed -i '/PasswordAuthentication/s/yes/no/g' /etc/ssh/sshd_config && sudo systemctl restart sshd"
	fi
}
# Check that the hosts file binds this machine's hostname to an address.
# Arguments: $1 - optional path of the hosts file (default: /etc/hosts).
# Outputs:   INFO with the bound address(es), space-separated, or ERROR when
#            no binding exists.
# Changes from the previous version:
#   - the hostname was used as an unquoted grep regex over whole lines, so
#     "node1" also matched "node10" and commented-out entries; it is now
#     compared for equality against the name fields (columns 2..N), and text
#     after '#' is ignored;
#   - hostname is queried once and results are kept in local variables.
function CHECH_HOSTS(){
	local hosts_file="${1:-/etc/hosts}"
	local host addrs
	host=$(hostname)
	addrs=$(awk -v h="${host}" '
		{ sub(/#.*/, "") }
		{
			for (i = 2; i <= NF; i++) {
				if ($i == h) {
					printf "%s%s", sep, $1
					sep = " "
					break
				}
			}
		}
	' "${hosts_file}")
	if [[ -z "${addrs}" ]]; then
		ERROR "当前主机名:${host},hosts未添加当前主机名和地址绑定!"
	else
		INFO "当前主机名:${host},hosts已添加绑定:${addrs}"
	fi
}
# List the network interfaces reported by ifconfig.
# For every ifconfig output line containing "mtu", the first and the last
# whitespace-separated fields are kept; the pairs are joined with commas and
# reported through INFO. When ifconfig is not available an ERROR with the
# install hint is printed instead.
# Globals: R (written)
function CHECK_NETS(){
	if ! command -v "ifconfig" >/dev/null 2>&1; then
		ERROR "net-tools 工具没有安装,请使用 apt install net-tools -y 进行安装"
		return
	fi
	R=$(ifconfig | awk '/mtu/ { printf "%s%s %s", sep, $1, $NF; sep = "," }')
	INFO "网络接口信息:$R"
}
# Check that the CPU frequency governor is "performance".
# Arguments: $1 - optional path of the governor file to read
#                 (default: cpu0's scaling_governor under /sys).
# Outputs:   INFO when the governor is "performance", ERROR with the current
#            value otherwise, WARNING when the file cannot be read.
# Changes from the previous version:
#   - the governor value was expanded unquoted inside `[ ]`, which raised
#     "unary operator expected" for an empty value; `[[ ]]` with quoting is
#     used instead;
#   - the read error text is discarded instead of being captured as the value;
#   - results are kept in local variables instead of the global R.
function CHECK_CPU_MODE(){
	local governor_file="${1:-/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor}"
	local governor
	if ! governor=$(cat -- "${governor_file}" 2>/dev/null); then
		WARNING "当前环境无法查询 CPU 模式!"
		return
	fi
	if [[ "${governor}" == "performance" ]]; then
		INFO "CPU已设置性能模式:${governor}"
	else
		ERROR "当前CPU模式:${governor}"
	fi
}
# Check that the open-files limit has been raised above 1024.
# Outputs: INFO with the open-files and max-user-processes limits, or ERROR
#          when the open-files limit is 1024 or lower.
# Changes from the previous version:
#   - the message printed $P, which was never assigned in this function, so
#     the "max user processes" figure was blank; it is now read with
#     `ulimit -u`;
#   - "unlimited" is accepted instead of breaking the numeric comparison;
#   - results are kept in local variables instead of the global R.
function CHECK_ULIMIT(){
	local open_files max_procs
	open_files=$(ulimit -n)
	max_procs=$(ulimit -u)
	if [[ "${open_files}" == "unlimited" ]] \
		|| { [[ "${open_files}" =~ ^[0-9]+$ ]] && (( open_files > 1024 )); }; then
		INFO "文件描述符已优化,open files:${open_files},max user processes:${max_procs}"
	else
		ERROR "文件描述符未优化!"
	fi
}
# Report whether any lsmod output line mentions "nouveau".
# Zero matching lines is reported through INFO, anything else through ERROR.
# Globals: R (written) - number of matching lsmod lines
function CHECK_NOUVEAU(){
	R=$(lsmod | grep -c nouveau)
	case "${R}" in
		0) INFO "nouveau 驱动已禁用!" ;;
		*) ERROR "nouveau 驱动未禁用!" ;;
	esac
}
# Helper: read `nvidia-smi -q` style "key : value" lines on stdin and print the
# value of the first line whose key contains $1 (case-insensitive).
function _NVIDIA_SMI_FIELD(){
	awk -F ': ' -v key="$1" 'index(tolower($1), tolower(key)) { print $2; exit }'
}

# Report the NVIDIA driver, the CUDA version it advertises, the product name of
# GPU 0 and whether persistence mode is enabled on it.
# Outputs: ERROR when nvidia-smi is missing or fails; otherwise one INFO line
#          with the versions plus an INFO/ERROR line for persistence mode.
# Changes from the previous version:
#   - values were picked by their position in the grep output (1st, 2nd, 3rd,
#     last), which silently shifts if nvidia-smi prints another matching line;
#     each value is now looked up by its own key;
#   - the persistence value was expanded unquoted inside `[ ]`, which raised
#     "unary operator expected" when empty;
#   - a failing nvidia-smi is reported instead of printing an "installed" line
#     with empty fields;
#   - results are kept in local variables instead of the globals R/V/N/C/P.
function CHECK_NVIDIA_DRIVER(){
	if ! command -v "nvidia-smi" >/dev/null 2>&1; then
		ERROR "英伟达驱动未安装!"
		return
	fi
	local report product driver cuda persistence
	if ! report=$(nvidia-smi -q -i 0 2>/dev/null); then
		ERROR "nvidia-smi 执行失败,无法查询英伟达驱动信息!"
		return
	fi
	product=$(_NVIDIA_SMI_FIELD 'Product Name' <<<"${report}")
	driver=$(_NVIDIA_SMI_FIELD 'Driver Version' <<<"${report}")
	cuda=$(_NVIDIA_SMI_FIELD 'CUDA Version' <<<"${report}")
	persistence=$(_NVIDIA_SMI_FIELD 'Persistence' <<<"${report}")
	INFO "英伟达 ${product} 驱动已安装,版本:${driver},cuda toolkit 最高支持:${cuda}"
	if [[ "${persistence}" == "Enabled" ]]; then
		INFO "GPU 持续模式已开启!"
	else
		ERROR "GPU持续模式未开启!"
	fi
}
# Report whether NVIDIA Fabric Manager is installed and its service is running.
# Outputs: WARNING when nv-fabricmanager is not on PATH; otherwise INFO or
#          ERROR (service not "active (running)") together with the version.
# Changes from the previous version:
#   - presence was checked through PATH but the version was read from the
#     hard-coded /usr/bin/nv-fabricmanager, so an install elsewhere on PATH
#     produced an empty version; the PATH-resolved command is used for both;
#   - results are kept in local variables instead of the globals R/V.
function CHECK_NVIDIA_FABRIC_MANAGER(){
	if ! command -v "nv-fabricmanager" >/dev/null 2>&1; then
		WARNING "英伟达Fabric Manager 未安装!"
		return
	fi
	local version
	version=$(nv-fabricmanager -v | awk '{print $NF}')
	if systemctl status nvidia-fabricmanager | grep -q "active (running)"; then
		INFO "英伟达Fabric Manager 已安装,运行状态正常,版本:${version}"
	else
		ERROR "英伟达Fabric Manager 已安装,运行状态异常,版本:${version}"
	fi
}
# Report the CUDA toolkit version printed by `nvcc -V`.
# Outputs: ERROR when nvcc is not on PATH, otherwise one INFO line.
# Changes from the previous version:
#   - the version was taken from the second word of the LAST output line; when
#     that line is the "Cuda compilation tools, release X.Y, VX.Y.Z" line the
#     reported "version" was the word "compilation". The release line is now
#     located explicitly and its trailing VX.Y.Z token is reported (without
#     the leading V); the old last-line rule is kept only as a fallback;
#   - the result is kept in a local variable instead of the global V.
function CHECK_CUDA_TOOLKIT(){
	if ! command -v "nvcc" >/dev/null 2>&1; then
		ERROR "CUDA TOOLKIT 未安装!"
		return
	fi
	local version
	version=$(nvcc -V | awk '
		/release [0-9]/ { v = $NF; sub(/^V/, "", v); print v; found = 1; exit }
		{ last = $2 }
		END { if (!found) print last }
	')
	INFO "CUDA TOOLKIT 已安装,版本:${version}"
}
# Report the Mellanox OFED version and print one summary row per mlx port.
# Outputs: ERROR when ofed_info is not on PATH; otherwise an INFO line with the
#          `ofed_info -s` text (cut at its last ':') followed by one
#          tab-separated row per "device:port" with state, rate, firmware and
#          RoCE mode, written with plain echo to stdout.
# Globals: R is written here; LINE/STAT/RATE/V/M are only assigned inside the
#          pipeline's while loop.
# NOTE(review): the row parsing relies on the unquoted `echo ${...}` collapsing
# multi-line tool output into a single line of words — keep the expansions
# unquoted unless the parsing is rewritten as a whole.
function CHECK_MELLANOX_DRIVER(){
	if ! command -v "ofed_info" >/dev/null 2>&1; then
		ERROR "Mellanox 驱动未检测到!"
	else
		R=`ofed_info -s`
		# ${R%:*} drops everything from the last ':' onwards.
		INFO "Mellanox 驱动已安装,版本:${R%:*}"
		# Build "col1:col2" keys from show_gids, keep those starting with "mlx"
		# (case-insensitive) and collapse adjacent duplicates.
		# presumably col1/col2 are device name and port number — TODO confirm
		show_gids | awk '{print $1":"$2}' | grep -iE '^mlx' | uniq | while read LINE; do
			R=`ibstatus ${LINE}`
			# Last word before the final occurrence of "phys" in the output;
			# presumably the link state name — confirm against ibstatus output.
			STAT=`echo ${R%phys*} | awk '{print $NF}'`
			# First two words after the first "rate:" label.
			RATE=`echo ${R#*rate:} | awk '{print $1,$2}'`
			# ${LINE%:*} is the key without its trailing ":col2" part.
			V=`ibstat ${LINE%:*} | grep 'Firmware version' | awk -F ': ' '{print $NF}'`
			# NOTE(review): runs under sudo, so this may prompt for a password.
			M=`sudo cma_roce_mode -d ${LINE%:*}`
			echo -e "\t${LINE} \t| state: ${STAT} \t| rate: ${RATE} \t| firmware: ${V} \t| mode: ${M}"
		done
	fi
}
# Report whether any lsmod output line mentions "nvidia_peermem".
# At least one matching line is reported through INFO, none through ERROR.
# Globals: R (written) - number of matching lsmod lines
function CHECK_KERNEL_MODULE(){
	R=$(lsmod | grep -c nvidia_peermem)
	if (( R > 0 )); then
		INFO "nvidia_peermem 已加载到内核!"
	else
		ERROR "nvidia_peermem 未加载到内核!"
	fi
}
# Check the system time zone reported by timedatectl.
# Arguments: $1 - optional expected zone name (default: Asia/Shanghai).
# Outputs:   INFO with the current zone when it equals the expected one,
#            ERROR with the current zone otherwise.
# Changes from the previous version:
#   - the expected zone was hard-coded; it is now an optional argument with
#     the same default;
#   - the comparison was a substring match, so any zone name merely containing
#     the expected text passed; it is now an exact comparison;
#   - results are kept in local variables instead of the globals ZONE/R.
function CHECK_TIME_ZONE(){
	local expected="${1:-Asia/Shanghai}"
	local current
	current=$(timedatectl | awk '/Time zone/ { print $3; exit }')
	if [[ "${current}" == "${expected}" ]]; then
		INFO "时区:${current}"
	else
		ERROR "时区:${current}"
	fi
}
# Report the packages listed by `apt-mark showhold`.
# The output lines are joined with single spaces; a non-empty result is
# reported through INFO, an empty one through ERROR.
# Globals: R (written) - the joined package list
function CHECK_MARK_SHOWHOLD(){
	R=$(apt-mark showhold | awk '{ printf "%s%s", sep, $0; sep = " " }')
	if [[ -n "${R}" ]]; then
		INFO "当前禁止更新软件有: $R"
	else
		ERROR "当前没有禁止更新的软件!"
	fi
}
# Check that IPv6 is disabled for all interfaces.
# Outputs: INFO when net.ipv6.conf.all.disable_ipv6 is 1 (or the IPv6 sysctl
#          tree does not exist at all), ERROR otherwise.
# Changes from the previous version:
#   - the single key is read with `sysctl -n` instead of dumping every key
#     with `sudo sysctl -a` and grepping, so no sudo is needed and the
#     regex-dot grep pattern is gone;
#   - an empty value no longer reaches `[ -eq ]` unquoted, which raised
#     "unary operator expected";
#   - when the key cannot be read and /proc/sys/net/ipv6 is absent, IPv6 is
#     reported as disabled. NOTE(review): this assumes the tree is missing
#     only when the IPv6 stack is not active (e.g. booted with
#     ipv6.disable=1) — confirm on a target host.
#   - results are kept in a local variable instead of the globals R/R2.
function CHECK_DISABLE_IPV6(){
	local flag
	flag=$(sysctl -n net.ipv6.conf.all.disable_ipv6 2>/dev/null)
	if [[ "${flag}" == "1" ]] || [[ -z "${flag}" && ! -d /proc/sys/net/ipv6 ]]; then
		INFO "IPV6已禁用!"
	else
		ERROR "未禁用IPV6!"
	fi
}
# Check that system sleep has been disabled by masking sleep.target.
# Outputs: INFO when `systemctl is-enabled sleep.target` reports "masked" or
#          "masked-runtime", ERROR otherwise.
# Changes from the previous version:
#   - the old test looked for "inactive (dead)" in `systemctl status`, but
#     sleep.target is inactive whenever the machine is awake, masked or not,
#     so the check could not fail on a running system; the unit's masked
#     state is tested instead;
#   - the value is no longer expanded unquoted inside `[ ]`;
#   - the result is kept in a local variable instead of the global R.
# NOTE(review): only sleep.target is inspected, as before; suspend/hibernate
# targets and logind settings are not — confirm whether they should be.
function CHECH_SYSTEM_SLEEP(){
	local state
	# is-enabled exits non-zero for masked units; only its printed state matters.
	state=$(systemctl is-enabled sleep.target 2>/dev/null)
	if [[ "${state}" == "masked" || "${state}" == "masked-runtime" ]]; then
		INFO "系统休眠已关闭!"
	else
		ERROR "系统休眠未关闭!"
	fi
}

# Entry point: load the shell environment files, print kernel and glibc
# versions, then run every check in a fixed order.
source /etc/profile
source ~/.bashrc

INFO "内核版本:$(uname -r)"
INFO "GLIBC版本:$(ldd --version |grep GLIBC)"

# Checks are invoked by name, in this order, each without arguments.
CHECKS=(
	CHECK_OPENSSH
	CHECK_SSHD
	CHECH_HOSTS
	CHECK_NETS
	CHECK_CPU_MODE
	CHECK_ULIMIT
	CHECK_NOUVEAU
	CHECK_NVIDIA_DRIVER
	CHECK_NVIDIA_FABRIC_MANAGER
	CHECK_CUDA_TOOLKIT
	CHECK_MELLANOX_DRIVER
	CHECK_KERNEL_MODULE
	CHECK_TIME_ZONE
	CHECK_MARK_SHOWHOLD
	CHECK_DISABLE_IPV6
	CHECH_SYSTEM_SLEEP
)
for CHECK in "${CHECKS[@]}"; do
	"${CHECK}"
done

202509291525041872613158.png

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://sulao.cn/post/1139

评论列表

0%