这是我自己编写的算力环境基础配置一键脚本,方便日常使用。普通用户如果尚未配置 sudo 免密,执行时需要输入一次密码;root 用户可直接执行。
#!/bin/bash
#set -e
# Resolve the directory that holds this script and load the shared helpers
# (logging functions, HOST_LIST, NFS paths) from common.sh next to it.
# Every expansion is quoted so a script path containing spaces still works.
CURRENT_PATH="$(readlink -f "$(dirname "$0")")"
if [ -f "${CURRENT_PATH}/common.sh" ]; then
  . "${CURRENT_PATH}/common.sh"
else
  # The logging helpers live in common.sh, so plain echo is used here.
  echo "无法找到公共配置文件!" >&2
  exit 1
fi
#######################################
# Add an active "$nrconf{restart} = 'a';" line to the needrestart config so
# package installs do not stop to ask about service restarts.
# The new line is inserted right after the first commented-out
# "#$nrconf{restart}" line. A "<conf>_bak" copy is taken first; its presence
# is treated as "already handled" and makes the function a no-op.
# Arguments:
#   $1 - optional config path (default /etc/needrestart/needrestart.conf)
# Outputs:
#   progress and warnings through INFO / WARNING
#######################################
function INSTALL_AUTO_RESTART(){
  local conf="${1:-/etc/needrestart/needrestart.conf}"
  local backup="${conf}_bak"
  local line

  INFO "设置安装软件无交互重启..."
  if [ -f "${backup}" ]; then
    return 0
  fi
  if [ ! -f "${conf}" ]; then
    WARNING "未找到安装软件交互配置文件!"
    return 0
  fi

  sudo cp "${conf}" "${backup}"

  # Only the first marker line is used: a multi-line value is not a valid sed
  # address, and an empty one would make sed append after every line.
  line="$(grep -n '^#\$nrconf{restart}' "${conf}" | head -n 1 | cut -d: -f1)"
  if [ -z "${line}" ]; then
    WARNING "未找到 \$nrconf{restart} 默认配置行,跳过!"
    return 0
  fi

  # Leave the file alone when an active setting already exists.
  if ! grep -q '^\$nrconf{restart}' "${conf}"; then
    sudo sed -i "${line}a \$nrconf{restart} = 'a';" "${conf}"
  fi
}
#######################################
# Make sure every "<ip> <hostname>" entry of HOST_LIST is present in the
# hosts file, appending the missing ones.
# Entries are compared as whole, literal lines (grep -xF), so the dots in an
# address are not regex wildcards and "10.0.1.20 node" is not mistaken for
# "10.0.1.208 node1".
# Globals:
#   HOST_LIST (read)
# Arguments:
#   $1 - optional hosts file (default /etc/hosts)
# Outputs:
#   each appended entry is echoed by tee
#######################################
function HOSTS_BIND_IP(){
  local hosts_file="${1:-/etc/hosts}"
  local entry

  INFO "设置主机名地址绑定..."
  for entry in "${HOST_LIST[@]}"; do
    if ! grep -qxF -- "${entry}" "${hosts_file}"; then
      printf '%s\n' "${entry}" | sudo tee -a "${hosts_file}"
    fi
  done
}
#######################################
# Create a passphrase-less RSA key pair under ~/.ssh and authorize it for
# the current account. Does nothing but warn when ~/.ssh/id_rsa exists.
#######################################
function GENERATE_SSH_ID(){
  local ssh_dir
  ssh_dir=~/.ssh

  INFO "生成密钥..."
  if [ -f "${ssh_dir}/id_rsa" ]; then
    WARNING "密钥文件已存在,跳过!"
    return
  fi

  ssh-keygen -t rsa -N '' -f "${ssh_dir}/id_rsa"
  chmod 700 "${ssh_dir}"
  # The append below creates authorized_keys when it is missing.
  cat "${ssh_dir}/id_rsa.pub" >>"${ssh_dir}/authorized_keys"
  chmod 600 "${ssh_dir}/authorized_keys"
}
#######################################
# Write GOVERNOR="performance" to /etc/default/cpufrequtils, then restart
# and enable the cpufrequtils service.
#######################################
function SET_CPU_PERFORMANCE(){
  local action

  INFO "设置CPU Performance性能模式..."
  printf '%s\n' 'GOVERNOR="performance"' | sudo tee /etc/default/cpufrequtils
  for action in restart enable; do
    sudo systemctl "${action}" cpufrequtils
  done
}
#######################################
# Replace /etc/security/limits.conf with raised nofile / nproc / memlock /
# stack limits, written once for "*" and once for "root".
# A one-time copy is kept as limits.conf_bak before the file is overwritten.
#######################################
function SET_SECURITY_LIMITS(){
  local target=/etc/security/limits.conf
  local domain item kind

  INFO "优化文件描述符..."
  [ -f "${target}_bak" ] || sudo cp "${target}" "${target}_bak"

  # Emits "<domain> <soft|hard> <item> <value>" in the same order as the
  # previous literal file: all "*" rows first, then all "root" rows.
  for domain in '*' root; do
    for item in 'nofile 1000000' 'nproc 2000000' 'memlock unlimited' 'stack unlimited'; do
      for kind in soft hard; do
        printf '%s %s %s\n' "${domain}" "${kind}" "${item}"
      done
    done
  done | sudo tee "${target}" > /dev/null
}
#######################################
# Remove the unattended-upgrades apt config, stop and disable its service,
# and put an apt-mark hold on every installed linux-headers / linux-image /
# linux-modules package whose name continues with a version digit.
#######################################
function DISABLE_AUTO_UPDATE(){
  local -a kernel_pkgs
  local pkg action

  INFO "禁用内核自动更新..."
  sudo rm -f /etc/apt/apt.conf.d/50unattended-upgrades >/dev/null 2>&1
  for action in stop disable; do
    sudo systemctl "${action}" unattended-upgrades.service
  done

  # Second column of `dpkg --list` is the package name.
  mapfile -t kernel_pkgs < <(
    dpkg --list \
      | awk '/linux-(headers|image|modules)-[0-9]/ && $2 != "" {print $2}'
  )
  for pkg in "${kernel_pkgs[@]}"; do
    sudo apt-mark hold "${pkg}"
  done
}
#######################################
# Point GRUB_DEFAULT in /etc/default/grub at the "Advanced options for
# Ubuntu" entry of the currently running kernel (uname -r) and regenerate
# the grub configuration.
# Logs an error and changes nothing when no GRUB_DEFAULT= line exists or the
# file cannot be read.
#######################################
function FIXED_KERNEL_VERSION(){
  local grub_file=/etc/default/grub
  local kernel

  INFO "固定内核版本..."
  # grep -q covers "no match" and "unreadable file" with one exit status, so
  # no count has to be parsed (an empty count used to break the `[` test).
  if grep -q '^GRUB_DEFAULT=' "${grub_file}"; then
    kernel="$(uname -r)"
    sudo sed -i "s/^GRUB_DEFAULT=.*/GRUB_DEFAULT=\"Advanced options for Ubuntu>Ubuntu, with Linux ${kernel}\"/" "${grub_file}"
    sudo update-grub
  else
    ERROR "没有找到GRUB_DEFAULT相关配置!"
  fi
}
#######################################
# Switch the system time zone to Asia/Shanghai and run a one-off ntpdate
# against ntp.aliyun.com.
# Globals:
#   ZONE (written; stays set after the call)
#######################################
function SET_TIME_ZONE(){
  local -r ntp_server="ntp.aliyun.com"

  ZONE="Asia/Shanghai"
  INFO "设置时区为 ${ZONE}"
  sudo timedatectl set-timezone "${ZONE}"
  sudo ntpdate -u "${ntp_server}"
}
#######################################
# Disable IPv6 on all interfaces through sysctl.
# Skips when the running value of net.ipv6.conf.all.disable_ipv6 is already
# 1. Otherwise makes sure /etc/sysctl.conf carries the setting (without
# adding it twice) and reloads it with `sysctl -p`.
#######################################
function DISABLE_IPV6(){
  local key=net.ipv6.conf.all.disable_ipv6
  local sysctl_conf=/etc/sysctl.conf
  local current

  INFO "设置禁用IPV6 ..."
  # Read the single key; an empty result (key not available) simply means
  # "not disabled" instead of tripping a numeric test.
  current="$(sudo sysctl -n "${key}" 2>/dev/null)"
  if [ "${current}" = "1" ]; then
    INFO "已配置禁用ipv6,跳过!"
    return 0
  fi

  if ! grep -Eq '^[[:space:]]*net\.ipv6\.conf\.all\.disable_ipv6[[:space:]]*=[[:space:]]*1[[:space:]]*$' "${sysctl_conf}"; then
    printf '%s = 1\n' "${key}" | sudo tee -a "${sysctl_conf}"
  fi
  sudo sysctl -p
}
#######################################
# Mask the sleep, suspend, hibernate and hybrid-sleep systemd targets.
#######################################
function DISABLE_SLEEP(){
  local -a targets=(
    sleep.target
    suspend.target
    hibernate.target
    hybrid-sleep.target
  )

  INFO "禁用系统休眠..."
  sudo systemctl mask "${targets[@]}"
}
#######################################
# Blacklist the nouveau driver and rebuild the initramfs of the running
# kernel. Skips when lsmod shows no loaded nouveau module.
# Each missing line is appended to /etc/modprobe.d/blacklist.conf.
# The module name is spelled "lbm-nouveau" with an ASCII hyphen; the list
# previously carried a typographic hyphen (U+2010), which names a different,
# non-existent module.
# Outputs:
#   each appended line is echoed by tee
#######################################
function DISABLE_NOUVEAU(){
  local blacklist=/etc/modprobe.d/blacklist.conf
  local -a entries=(
    'blacklist nouveau'
    'blacklist lbm-nouveau'
    'options nouveau modeset=0'
    'alias nouveau off'
    'alias lbm-nouveau off'
  )
  local entry pattern

  INFO "禁用系统nouveau驱动..."
  if ! lsmod | grep -q nouveau; then
    INFO "nouveau 驱动已禁用,跳过!"
    return 0
  fi

  for entry in "${entries[@]}"; do
    # Match the entry with any run of whitespace between its words, anchored
    # at line start so a commented-out copy does not count as present.
    pattern="^[[:space:]]*${entry// /[[:space:]]+}([[:space:]]|\$)"
    if ! grep -Eq -- "${pattern}" "${blacklist}"; then
      printf '%s\n' "${entry}" | sudo tee -a "${blacklist}"
    fi
  done
  sudo update-initramfs -k "$(uname -r)" -c
}
#######################################
# Append keep-alive, MaxStartups and AuthorizedKeysFile settings to
# /etc/ssh/sshd_config when no line starting with that exact
# keyword/value pair exists, then restart the ssh service.
# NOTE(review): a keyword that is already set to a different value earlier
# in the file is not rewritten; only the missing exact pair is appended.
# Confirm this is sufficient for the target hosts.
# Outputs:
#   each appended line is echoed by tee
#######################################
function PUBLIC_KEY_LOGIN(){
  local sshd_config=/etc/ssh/sshd_config
  local -a settings=(
    'ClientAliveInterval 60'
    'ClientAliveCountMax 3'
    'MaxStartups 512'
    'AuthorizedKeysFile .ssh/authorized_keys'
  )
  local setting pattern

  INFO "设置密钥登录和优化配置..."
  for setting in "${settings[@]}"; do
    # Keyword at line start, any run of whitespace before the value.
    pattern="^${setting// /[[:space:]]+}"
    if ! grep -Eq -- "${pattern}" "${sshd_config}"; then
      printf '%s\n' "${setting}" | sudo tee -a "${sshd_config}"
    fi
  done
  sudo systemctl restart ssh
}
#######################################
# Install chrony and write /etc/chrony/chrony.conf for a multi-node setup.
# The host whose name equals the hostname field of HOST_LIST[0] becomes the
# time server for the others; every other host syncs from that entry's
# address. With a single HOST_LIST entry nothing is deployed.
# The server side relies on `local stratum 10` alone. The former
# `server 127.127.1.0 iburst` line is ntpd's local-clock pseudo-address,
# which chrony would only treat as an ordinary NTP peer.
# Globals:
#   HOST_LIST (read) - entries of the form "<ip> <hostname>"
#######################################
function TIME_SYNC_SERVER(){
  local conf=/etc/chrony/chrony.conf
  local server_ip server_name

  if [ "${#HOST_LIST[@]}" -le 1 ]; then
    INFO "单节点无需部署时间同步服务!"
    return 0
  fi
  server_ip="${HOST_LIST[0]%% *}"
  server_name="${HOST_LIST[0]##* }"

  INFO "部署时间同步服务..."
  sudo apt -y install chrony
  # Keep one copy of the packaged config; never overwrite an older backup.
  if [ ! -f "${conf}_bak" ]; then
    sudo cp "${conf}" "${conf}_bak"
  fi

  if [ "${server_name}" = "$(hostname)" ]; then
    printf '%s\n' \
      'local stratum 10' \
      'driftfile /var/lib/chrony/drift' \
      'makestep 1.0 3' \
      'rtcsync' \
      'logdir /var/log/chrony' \
      'allow all'
  else
    printf '%s\n' \
      "server ${server_ip} iburst" \
      'driftfile /var/lib/chrony/drift' \
      'makestep 1.0 3' \
      'rtcsync' \
      'logdir /var/log/chrony'
  fi | sudo tee "${conf}" >/dev/null

  sudo systemctl restart chrony
  sudo systemctl enable chrony
  sudo chronyc sourcestats -v
}
#######################################
# Share NFS_SHARE_PATH from the first HOST_LIST host to all the others.
# On the host named in HOST_LIST[0]: install the NFS server, create the
# share directory when missing, add one /etc/exports line per other host
# (only when that exact line is not there yet, so re-runs add no
# duplicates) and restart the server.
# On every other host: install the client and mount the share on
# NFS_MOUNT_PATH, giving up after 10 seconds.
# With a single HOST_LIST entry nothing is deployed.
# Globals:
#   HOST_LIST (read) - entries of the form "<ip> <hostname>"
#   NFS_SHARE_PATH, NFS_MOUNT_PATH (read)
#######################################
function NFS_FILE_SHARE(){
  local exports=/etc/exports
  local server_ip server_name this_host host export_line

  if [ "${#HOST_LIST[@]}" -le 1 ]; then
    INFO "单节点无需部署NFS服务!"
    return 0
  fi
  server_ip="${HOST_LIST[0]%% *}"
  server_name="${HOST_LIST[0]##* }"
  this_host="$(hostname)"

  if [ "${server_name}" = "${this_host}" ]; then
    INFO "部署NFS SERVER..."
    sudo apt install -y nfs-kernel-server
    if [ ! -d "${NFS_SHARE_PATH}" ]; then
      sudo mkdir -p "${NFS_SHARE_PATH}"
      sudo chmod -R 777 "${NFS_SHARE_PATH}"
    fi
    for host in "${HOST_LIST[@]}"; do
      # The server does not export to itself.
      [ "${host##* }" != "${this_host}" ] || continue
      export_line="${NFS_SHARE_PATH} ${host%% *}(rw,sync,no_subtree_check)"
      if ! grep -qxF -- "${export_line}" "${exports}"; then
        printf '%s\n' "${export_line}" | sudo tee -a "${exports}"
      fi
    done
    sudo systemctl restart nfs-server
    sudo systemctl enable nfs-server
    sudo exportfs -arv
  else
    INFO "挂载NFS共享目录..."
    sudo apt install -y nfs-common
    if [ ! -d "${NFS_MOUNT_PATH}" ]; then
      sudo mkdir -p "${NFS_MOUNT_PATH}"
    fi
    if timeout 10 sudo mount -t nfs -o rw,sync,hard,intr,timeo=5,retrans=3 \
      "${server_ip}:${NFS_SHARE_PATH}" "${NFS_MOUNT_PATH}"; then
      INFO "共享目录${NFS_MOUNT_PATH}挂载成功!"
    else
      ERROR "共享目录${NFS_MOUNT_PATH}挂载失败!"
    fi
  fi
}
# Provisioning sequence. Order matters: passwordless sudo first, then the
# package index, the needrestart tweak and the base packages, then the
# individual configuration steps.
CONFIG_SUDOER
sudo apt update -y
INSTALL_AUTO_RESTART
sudo apt install -y build-essential g++ gcc make dkms ntpdate wget sshpass cpufrequtils unzip

# DISABLE_AUTO_UPDATE is defined above but not part of the sequence.
for _step in \
  HOSTS_BIND_IP \
  GENERATE_SSH_ID \
  SET_CPU_PERFORMANCE \
  SET_SECURITY_LIMITS \
  FIXED_KERNEL_VERSION \
  SET_TIME_ZONE \
  DISABLE_IPV6 \
  DISABLE_SLEEP \
  DISABLE_NOUVEAU \
  PUBLIC_KEY_LOGIN \
  TIME_SYNC_SERVER; do
  "${_step}"
done
unset _step
NFS_FILE_SHARE

公共配置文件 common.sh 的内容如下:
#!/bin/bash
#set -e
# Shared settings for the provisioning scripts.
# HOST_LIST entries have the form "<ip> <hostname>". The first entry is
# normally the internal service node, e.g. the time-sync server.
HOST_LIST=(
  "10.0.1.208 master"
  "10.0.2.234 node1"
)
# Directory exported by the NFS server / directory it is mounted on.
NFS_SHARE_PATH='/ssd/nfs'
NFS_MOUNT_PATH='/data'
# Print an informational message on stdout, prefixed with a coloured [I] tag.
# All arguments are joined into one message.
function INFO(){
  local -r badge="\e[104m\e[97m[I]\e[49m\e[39m"
  /bin/echo -e "${badge} ${*}"
}
# Print a warning on stderr, prefixed with a coloured [W] tag.
# All arguments are joined into one message.
function WARNING(){
  local -r badge="\e[105m\e[97m[W]\e[49m\e[39m"
  /bin/echo -e "${badge} ${*}" >&2
}
# Print an error on stderr, prefixed with a coloured [E] tag.
# All arguments are joined into one message; the caller decides whether to exit.
function ERROR(){
  local -r badge="\e[101m\e[97m[E]\e[49m\e[39m"
  /bin/echo -e "${badge} ${*}" >&2
}
#######################################
# Give the current (non-root) user passwordless sudo through a drop-in
# file /etc/sudoers.d/<user>. Skips for root and when the drop-in exists.
# The rule is first written to a temporary file and checked with
# `visudo -cf`; only a rule that parses is installed, owned by root with
# mode 0440. A drop-in with a syntax error would otherwise make sudo refuse
# to run at all.
# Outputs:
#   progress through INFO, validation failure through ERROR
#######################################
function CONFIG_SUDOER(){
  local user dropin staged

  INFO "配置当前用户sudo切换免密..."
  user="$(whoami)"
  if [ "${user}" = "root" ]; then
    INFO "当前用户为root,跳过配置!"
    return 0
  fi

  dropin="/etc/sudoers.d/${user}"
  if [ -f "${dropin}" ]; then
    INFO "当前账户:${user} 免密已配置,跳过!"
    return 0
  fi

  staged="$(mktemp)" || return 1
  printf '%s ALL=(ALL) NOPASSWD:ALL\n' "${user}" >"${staged}"
  if sudo visudo -cf "${staged}" >/dev/null; then
    sudo install -o root -g root -m 0440 "${staged}" "${dropin}"
  else
    ERROR "sudoers 规则校验失败,未写入 ${dropin}!"
  fi
  rm -f -- "${staged}"
}
#######################################
# Require the nvidia-smi command and export GPU_TOTAL as the number of
# lines printed by `nvidia-smi -L`.
# Exits the script with status 1 when nvidia-smi cannot be found.
# Globals:
#   GPU_TOTAL (written, exported)
#######################################
function CHECK_GPU_COMMAND(){
  if command -v "nvidia-smi" >/dev/null 2>&1; then
    GPU_TOTAL="$(nvidia-smi -L | wc -l)"
    export GPU_TOTAL
    return
  fi
  ERROR "英伟达驱动未安装!"
  exit 1
}
#######################################
# Require a CUDA toolkit directory and export its resolved location.
# The path is resolved with `readlink -f`, so CUDA_PATH is always an
# absolute directory: the final target when the directory is a symlink
# (also a relative one), or the directory itself when it is not a link.
# Parsing `ls -l` gave a relative name for relative links and a multi-line
# listing for a real directory.
# Exits the script with status 1 when the directory does not exist.
# Globals:
#   CUDA_PATH (written, exported)
# Arguments:
#   $1 - optional toolkit directory (default /usr/local/cuda)
#######################################
function CHECK_CUDA_PATH(){
  local cuda_home="${1:-/usr/local/cuda}"

  if [ ! -d "${cuda_home}" ]; then
    ERROR "检测没有找到 cuda toolkit 目录,请先安装 cuda toolkit !"
    exit 1
  fi
  CUDA_PATH="$(readlink -f "${cuda_home}")"
  export CUDA_PATH
}
#######################################
# Require a non-empty PASSWORD variable.
# Exits the script with status 1 when it is unset or empty.
# Globals:
#   PASSWORD (read)
#######################################
function CHECK_PASSWD(){
  [ -n "${PASSWORD}" ] && return 0
  ERROR "请先设置密码 PASSWORD 到环境变量!"
  exit 1
}
内容版权声明:除非注明,否则皆为本站原创文章。
转载注明出处:https://sulao.cn/post/1138
相关阅读
- ubuntu22.04安装cuda失败提示"Uninstall manifest corrupt"
- ubuntu使用ssh命令批量设置集群节点具有sudo权限的账户sudo免密切换
- RTX 5090在cuda13.0下gpu-burn编译报错的解决方法
- k8s集群部署gpu-operator支持gpu节点自动发现和gpu上报
- ubuntu22.04算力环境基础配置一键验证脚本
- nccl-tests多机多卡测试环境一键编译部署脚本
- ubuntu22.04忘记root密码进入单用户模式修改密码
- ubuntu22.04屏蔽使用apt安装软件时出现弹窗要求选择重启服务的方法
- nccl-tests英伟达GPU单机多卡一键测试脚本
- ubuntu22.04删除系统中的新内核并回退的方法
评论列表