之前发了一个算力环境基础配置的一键脚本，这里把自己写的环境验证脚本也一并发出来，方便以后自己使用。
#!/bin/bash
# Environment verification script: defines INFO/WARNING/ERROR reporters and a
# set of CHECK_* functions, then runs the checks one after another.
# Exit-on-error is left commented out; presumably so that one failing probe
# does not abort the remaining checks — confirm before enabling it.
#set -e
# Single-element array holding an "<address> <name>" string. No check visible
# in this section reads it — TODO confirm whether it is still needed.
HOST_LIST=('10.7.0.12 blog')
#######################################
# Print an informational message on stdout with a highlighted "[I]" tag.
# Arguments: all arguments are joined into a single message; backslash
#            escapes in the message are interpreted (echo -e).
# Outputs:   one line on stdout.
#######################################
INFO() {
  local tag="\e[104m\e[97m[I]\e[49m\e[39m"
  /bin/echo -e "${tag} ${*}"
}
#######################################
# Print a warning message on stderr with a highlighted "[W]" tag.
# Arguments: all arguments are joined into a single message; backslash
#            escapes in the message are interpreted (echo -e).
# Outputs:   one line on stderr, nothing on stdout.
#######################################
WARNING() {
  local tag="\e[101m\e[97m[W]\e[49m\e[39m"
  /bin/echo -e "${tag} ${*}" >&2
}
#######################################
# Print an error message on stderr with a highlighted "[E]" tag.
# Arguments: all arguments are joined into a single message; backslash
#            escapes in the message are interpreted (echo -e).
# Outputs:   one line on stderr, nothing on stdout.
#######################################
ERROR() {
  local tag="\e[101m\e[97m[E]\e[49m\e[39m"
  /bin/echo -e "${tag} ${*}" >&2
}
#######################################
# Report the installed OpenSSH packages as "name:version" pairs.
# Only rows whose package-name column contains "openssh" and whose dpkg
# status column marks the package as installed are listed, so description
# text and removed-but-not-purged ("rc") packages are not reported.
# Outputs: one INFO line; the list is empty when nothing matches.
#######################################
CHECK_OPENSSH() {
  local packages
  packages=$(
    dpkg -l \
      | awk '$1 ~ /^.i/ && $2 ~ /openssh/ {print $2 ":" $3}' \
      | paste -sd' ' -
  )
  # The original label said "OpenSSL" although the packages listed are OpenSSH.
  INFO "当前OpenSSH版本:${packages}"
}
#######################################
# Verify that sshd password authentication is turned off.
# Arguments: $1 - optional path to the sshd config file
#                 (default: /etc/ssh/sshd_config).
# Outputs:   INFO when the first active PasswordAuthentication directive is
#            "no"; ERROR when the directive is absent or has another value.
# Note: only the given file is read; drop-in files pulled in through an
#       Include directive are not inspected.
#######################################
CHECK_SSHD() {
  local config=${1:-/etc/ssh/sshd_config}
  local value

  # The keyword is compared case-insensitively and only the first active
  # occurrence is used; commented-out lines never match because their first
  # field starts with '#'.
  value=$(awk 'tolower($1) == "passwordauthentication" {print $2; exit}' "$config")

  # The original tested "${R+x}", which is non-empty for any assigned
  # variable, so the "directive missing" branch could never run.
  if [[ -z "$value" ]]; then
    ERROR "禁止密码登录未开启!"
  elif [[ "$value" == "no" ]]; then
    INFO "禁止密码登录已开启!"
  else
    ERROR "禁止密码登录未开启,请使用命令关闭:sudo sed -i '/PasswordAuthentication/s/yes/no/g' /etc/ssh/sshd_config && sudo systemctl restart sshd"
  fi
}
#######################################
# Check that the current hostname is bound to an address in the hosts file.
# Arguments: $1 - optional path to the hosts file (default: /etc/hosts).
# Outputs:   INFO with the bound address(es), space separated, or ERROR when
#            no entry names this host.
# The hostname must equal a whole name field of an entry (case-insensitive);
# text after '#' is ignored. The original substring grep also matched longer
# names containing the hostname as well as commented-out lines.
#######################################
CHECH_HOSTS() {
  local hosts_file=${1:-/etc/hosts}
  local host addrs
  host=$(hostname)

  addrs=$(awk -v host="$host" '
    BEGIN { host = tolower(host) }
    {
      sub(/#.*/, "")                 # strip comments; fields are re-split
      for (i = 2; i <= NF; i++) {
        if (tolower($i) == host) {
          out = (out == "" ? $1 : out " " $1)
          break
        }
      }
    }
    END { print out }
  ' "$hosts_file")

  if [[ -z "$addrs" ]]; then
    ERROR "当前主机名:${host},hosts未添加当前主机名和地址绑定!"
  else
    INFO "当前主机名:${host},hosts已添加绑定:${addrs}"
  fi
}
#######################################
# List network interfaces with the last field of their "mtu" line.
# Globals: R (written) - comma-separated "<first field> <last field>" pairs
#          taken from every ifconfig output line containing "mtu".
# Outputs: INFO with the list, or ERROR when ifconfig is not available.
#######################################
CHECK_NETS() {
  if command -v "ifconfig" >/dev/null 2>&1; then
    R=$(ifconfig | awk '/mtu/ {print $1, $NF}' | paste -sd, -)
    INFO "网络接口信息:$R"
  else
    ERROR "net-tools 工具没有安装,请使用 apt install net-tools -y 进行安装"
  fi
}
#######################################
# Check that the CPU frequency governor is "performance".
# Arguments: $1 - optional path to a scaling_governor file
#                 (default: the cpu0 governor under /sys).
# Outputs:   INFO when the governor is "performance", ERROR with the current
#            value otherwise, WARNING when the file cannot be read.
# Only the one governor file is inspected.
#######################################
CHECK_CPU_MODE() {
  local governor_file=${1:-/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor}
  local governor

  # stderr is discarded rather than captured, so the value never contains
  # an error message.
  if ! governor=$(cat -- "$governor_file" 2>/dev/null); then
    WARNING "当前环境无法查询 CPU 模式!"
    return
  fi

  # Quoted comparison: an empty value no longer breaks the test expression.
  if [[ "$governor" == "performance" ]]; then
    INFO "CPU已设置性能模式:$governor"
  else
    ERROR "当前CPU模式:$governor"
  fi
}
#######################################
# Check that the open-files limit has been raised above 1024.
# Outputs: INFO with the open-files and max-user-processes limits when the
#          open-files limit is greater than 1024 or "unlimited"; ERROR
#          otherwise.
#######################################
CHECK_ULIMIT() {
  local open_files max_procs
  open_files=$(ulimit -n)
  # The original message printed $P, which this function never assigned, so
  # the process limit was always reported empty.
  max_procs=$(ulimit -u)

  # "unlimited" is not a number; handle it before the arithmetic comparison.
  if [[ "$open_files" == "unlimited" ]] || (( open_files > 1024 )); then
    INFO "文件描述符已优化,open files:${open_files},max user processes:${max_procs}"
  else
    ERROR "文件描述符未优化!"
  fi
}
#######################################
# Check that no lsmod line mentions "nouveau".
# Globals: R (written) - number of lsmod lines containing "nouveau".
# Outputs: INFO when the count is zero, ERROR otherwise.
#######################################
CHECK_NOUVEAU() {
  R=$(lsmod | grep -c nouveau)
  if [ "$R" -ne 0 ]; then
    ERROR "nouveau 驱动未禁用!"
  else
    INFO "nouveau 驱动已禁用!"
  fi
}
#######################################
# Private helper: print the value of the first "<key> : <value>" line read
# from stdin whose key equals $1 (case-insensitive, surrounding blanks
# ignored). Prints nothing when the key is not found.
#######################################
_NVIDIA_SMI_FIELD() {
  awk -F ': ' -v want="$1" '
    BEGIN { want = tolower(want) }
    {
      key = tolower($1)
      gsub(/^[ \t]+|[ \t]+$/, "", key)
      if (key == want) { print $2; exit }
    }
  '
}

#######################################
# Report the NVIDIA driver details of GPU 0 and its persistence mode.
# Outputs: ERROR when nvidia-smi is not available; otherwise an INFO line
#          with product name, driver version and CUDA version, followed by
#          INFO or ERROR depending on whether persistence mode is "Enabled".
# Fields are looked up by key instead of by their position in the output,
# so the report does not depend on the order in which nvidia-smi prints them.
#######################################
CHECK_NVIDIA_DRIVER() {
  if ! command -v "nvidia-smi" >/dev/null 2>&1; then
    ERROR "英伟达驱动未安装!"
    return
  fi

  local report name driver cuda persistence
  report=$(nvidia-smi -q -i 0)
  name=$(_NVIDIA_SMI_FIELD "Product Name" <<<"$report")
  driver=$(_NVIDIA_SMI_FIELD "Driver Version" <<<"$report")
  cuda=$(_NVIDIA_SMI_FIELD "CUDA Version" <<<"$report")
  persistence=$(_NVIDIA_SMI_FIELD "Persistence Mode" <<<"$report")

  INFO "英伟达 $name 驱动已安装,版本:$driver,cuda toolkit 最高支持:$cuda"

  # Quoted comparison: a missing field no longer breaks the test expression.
  if [[ "$persistence" == "Enabled" ]]; then
    INFO "GPU 持续模式已开启!"
  else
    ERROR "GPU持续模式未开启!"
  fi
}
#######################################
# Report whether NVIDIA Fabric Manager is installed and running.
# Outputs: WARNING when nv-fabricmanager is not on PATH; otherwise INFO when
#          `systemctl status nvidia-fabricmanager` shows "active (running)"
#          and ERROR when it does not. Both include the last word of each
#          line printed by `nv-fabricmanager -v` as the version.
#######################################
CHECK_NVIDIA_FABRIC_MANAGER() {
  if ! command -v "nv-fabricmanager" >/dev/null 2>&1; then
    WARNING "英伟达Fabric Manager 未安装!"
    return
  fi

  local version
  # Run the binary the presence check just found on PATH; the original
  # called /usr/bin/nv-fabricmanager regardless of where it was found.
  version=$(nv-fabricmanager -v | awk '{print $NF}')

  if systemctl status nvidia-fabricmanager | grep -q "active (running)"; then
    INFO "英伟达Fabric Manager 已安装,运行状态正常,版本:$version"
  else
    ERROR "英伟达Fabric Manager 已安装,运行状态异常,版本:$version"
  fi
}
#######################################
# Report the installed CUDA toolkit version.
# Outputs: ERROR when nvcc is not on PATH; otherwise INFO with the version
#          taken from the "release" line of `nvcc -V` (last word, leading
#          "V" removed).
# The original read field 2 of the last output line, which yields the word
# "compilation" whenever the release line is the last line printed.
#######################################
CHECK_CUDA_TOOLKIT() {
  if ! command -v "nvcc" >/dev/null 2>&1; then
    ERROR "CUDA TOOLKIT 未安装!"
    return
  fi

  local version
  version=$(nvcc -V | awk '/release/ { v = $NF; sub(/^V/, "", v); print v; exit }')
  INFO "CUDA TOOLKIT 已安装,版本:$version"
}
# Report the Mellanox OFED driver version and print one summary line per
# "mlx..." entry derived from show_gids: link state, rate, firmware version
# and RoCE mode. Reports via the ERROR/INFO helpers; the per-entry lines go
# to stdout through echo. Invokes `sudo cma_roce_mode` for every entry.
# NOTE(review): parsing depends on the exact text layout printed by
# ibstatus/ibstat/show_gids — confirm against the installed tool versions.
function CHECK_MELLANOX_DRIVER(){
# Presence of ofed_info on PATH is used as the "driver installed" indicator.
if ! command -v "ofed_info" >/dev/null 2>&1; then
ERROR "Mellanox 驱动未检测到!"
else
R=`ofed_info -s`
# ${R%:*} drops the last ':' of the ofed_info output and everything after it.
INFO "Mellanox 驱动已安装,版本:${R%:*}"
# Join the first two show_gids columns as "<col1>:<col2>", keep entries that
# start with "mlx" (case-insensitive) and collapse adjacent duplicates.
# Presumably col1 is the device and col2 the port — TODO confirm.
show_gids | awk '{print $1":"$2}' | grep -iE '^mlx' | uniq | while read LINE; do
R=`ibstatus ${LINE}`
# The unquoted echo word-splits the captured output onto a single line.
# STAT is the last word before the final occurrence of "phys".
STAT=`echo ${R%phys*} | awk '{print $NF}'`
# RATE is the first two words after the first occurrence of "rate:".
RATE=`echo ${R#*rate:} | awk '{print $1,$2}'`
# ${LINE%:*} is LINE without its last ':' and what follows it.
V=`ibstat ${LINE%:*} | grep 'Firmware version' | awk -F ': ' '{print $NF}'`
M=`sudo cma_roce_mode -d ${LINE%:*}`
echo -e "\t${LINE} \t| state: ${STAT} \t| rate: ${RATE} \t| firmware: ${V} \t| mode: ${M}"
done
fi
}
#######################################
# Check that an lsmod line mentions "nvidia_peermem".
# Globals: R (written) - number of lsmod lines containing "nvidia_peermem".
# Outputs: ERROR when the count is zero, INFO otherwise.
#######################################
CHECK_KERNEL_MODULE() {
  R=$(lsmod | grep -c nvidia_peermem)
  case "$R" in
    0) ERROR "nvidia_peermem 未加载到内核!" ;;
    *) INFO "nvidia_peermem 已加载到内核!" ;;
  esac
}
#######################################
# Check that the system time zone reported by timedatectl contains
# "Asia/Shanghai".
# Globals: ZONE (written) - the expected zone name.
#          R (written)    - third field of the timedatectl "Time zone" line.
# Outputs: INFO with the zone when it contains ZONE, ERROR with it otherwise.
#######################################
CHECK_TIME_ZONE() {
  ZONE="Asia/Shanghai"
  R=$(timedatectl | awk '/Time zone/ {print $3}')
  # Literal substring match, same as a quoted right-hand side of =~.
  if [[ "$R" != *"${ZONE}"* ]]; then
    ERROR "时区:$R"
    return
  fi
  INFO "时区:$R"
}
#######################################
# Report the packages listed by `apt-mark showhold`.
# Globals: R (written) - the listed package names joined with spaces.
# Outputs: INFO with the list when it is non-empty, ERROR when it is empty.
#######################################
CHECK_MARK_SHOWHOLD() {
  R=$(apt-mark showhold | paste -sd' ' -)
  if [[ -n "$R" ]]; then
    INFO "当前禁止更新软件有: $R"
  else
    ERROR "当前没有禁止更新的软件!"
  fi
}
#######################################
# Check whether IPv6 is disabled for all interfaces.
# Arguments: $1 - optional path to the flag file (default: the procfs entry
#                 behind the net.ipv6.conf.all.disable_ipv6 sysctl).
# Outputs:   INFO when the flag is 1; ERROR when it has another value or
#            cannot be read.
# The flag is read directly instead of running `sudo sysctl -a | grep`, so
# no sudo call is made and an empty result no longer breaks the comparison.
#######################################
CHECK_DISABLE_IPV6() {
  local flag_file=${1:-/proc/sys/net/ipv6/conf/all/disable_ipv6}
  local flag
  # An unreadable or missing file leaves the flag empty and is reported as
  # "not disabled", matching what the original did for an empty result.
  flag=$(cat -- "$flag_file" 2>/dev/null)

  if [[ "$flag" == "1" ]]; then
    INFO "IPV6已禁用!"
  else
    ERROR "未禁用IPV6!"
  fi
}
#######################################
# Check the state of sleep.target as shown by `systemctl status`.
# Outputs: INFO when the "Active:" line reads "inactive (dead)", ERROR
#          otherwise (including when no "Active:" line is printed).
# NOTE(review): an unmasked sleep.target is presumably also inactive while
# the machine is awake, so this may not prove that sleep is disabled;
# consider checking `systemctl is-enabled sleep.target` for "masked" —
# confirm the intended criterion.
#######################################
CHECH_SYSTEM_SLEEP() {
  local state
  # Read the state from the "Active:" line only; the original grepped every
  # line containing "inactive" and compared the result unquoted, which
  # raised a test syntax error whenever nothing (or several lines) matched.
  state=$(systemctl status sleep.target | awk '$1 == "Active:" {print $2, $3; exit}')

  if [[ "$state" == "inactive (dead)" ]]; then
    INFO "系统休眠已关闭!"
  else
    ERROR "系统休眠未关闭!"
  fi
}
# Entry point: load the system profile and the user's bashrc, print the
# kernel and glibc versions, then run every check in a fixed order.
source /etc/profile
source ~/.bashrc

INFO "内核版本:$(uname -r)"
INFO "GLIBC版本:$(ldd --version | grep GLIBC)"

# The checks only report; none of their results is acted upon here.
for check in \
  CHECK_OPENSSH \
  CHECK_SSHD \
  CHECH_HOSTS \
  CHECK_NETS \
  CHECK_CPU_MODE \
  CHECK_ULIMIT \
  CHECK_NOUVEAU \
  CHECK_NVIDIA_DRIVER \
  CHECK_NVIDIA_FABRIC_MANAGER \
  CHECK_CUDA_TOOLKIT \
  CHECK_MELLANOX_DRIVER \
  CHECK_KERNEL_MODULE \
  CHECK_TIME_ZONE \
  CHECK_MARK_SHOWHOLD \
  CHECK_DISABLE_IPV6 \
  CHECH_SYSTEM_SLEEP
do
  "$check"
done
unset check
内容版权声明:除非注明,否则皆为本站原创文章。
转载注明出处:https://sulao.cn/post/1139
评论列表