ubuntu22.04算力环境基础配置一键脚本

自己写的算力环境基础配置一键脚本,方便自己使用。普通用户如果没有配置 sudo 免密,需要输入一次密码;root 用户则可以直接执行。

#!/bin/bash
#set -e

# Resolve the directory that contains this script and load the shared helpers
# (logging functions, HOST_LIST, NFS paths, ...) from common.sh next to it.
# All expansions are quoted so a script path containing spaces still works.
CURRENT_PATH="$(readlink -f "$(dirname "$0")")"
if [ -f "${CURRENT_PATH}/common.sh" ]; then
    # shellcheck source=/dev/null
    . "${CURRENT_PATH}/common.sh"
else
    # The logging helpers live in common.sh, so plain echo is used here;
    # the diagnostic goes to stderr.
    echo "无法找到公共配置文件!" >&2
    exit 1
fi

#######################################
# Make needrestart restart services automatically ($nrconf{restart} = 'a')
# so that package installation does not stop for an interactive prompt.
# The change is applied only once: if the "_bak" backup already exists the
# function does nothing.
# Arguments:
#   $1 - optional path of the needrestart configuration file
#        (default: /etc/needrestart/needrestart.conf)
# Outputs:
#   progress / warning messages through INFO and WARNING
#######################################
function INSTALL_AUTO_RESTART(){
    local conf="${1:-/etc/needrestart/needrestart.conf}"
    local backup="${conf}_bak"
    local setting="\$nrconf{restart} = 'a';"
    local line

    INFO "设置安装软件无交互重启..."
    if [ -f "${backup}" ]; then
        return 0
    fi
    if [ ! -f "${conf}" ]; then
        WARNING "未找到安装软件交互配置文件!"
        return 0
    fi

    sudo cp "${conf}" "${backup}"

    # An active (uncommented) setting already exists: leave it untouched.
    if grep -q '^\$nrconf{restart}' "${conf}"; then
        return 0
    fi

    # Line number of the first commented-out template line, if there is one.
    line="$(grep -n -m 1 '^#\$nrconf{restart}' "${conf}" | cut -d ':' -f 1)"
    if [ -n "${line}" ]; then
        sudo sed -i "${line}a ${setting}" "${conf}"
    else
        # Without a template line an address-less sed "a" would append the
        # setting after EVERY line, so add it once at the end of the file.
        printf '%s\n' "${setting}" | sudo tee -a "${conf}" >/dev/null
    fi
}
#######################################
# Append every "IP hostname" entry of HOST_LIST to the hosts file unless the
# entry is already present.
# Globals:
#   HOST_LIST (read)
# Arguments:
#   $1 - optional hosts file path (default: /etc/hosts)
# Outputs:
#   newly added entries are echoed by tee
#######################################
function HOSTS_BIND_IP(){
    local hosts_file="${1:-/etc/hosts}"
    local entry

    INFO "设置主机名地址绑定..."
    for entry in "${HOST_LIST[@]}"
    do
        # -F: the entry is a literal string (dots are not regex wildcards);
        # -w: it must not be embedded in a longer address or host name;
        # -q: do not leak matching lines to stdout.
        if ! grep -qwF -- "${entry}" "${hosts_file}"; then
            printf '%s\n' "${entry}" | sudo tee -a "${hosts_file}"
        fi
    done
}
#######################################
# Create a passphrase-less RSA key pair for the current user and authorize
# it for login to this same account. Nothing is done when ~/.ssh/id_rsa
# already exists.
# Outputs:
#   progress / warning messages through INFO and WARNING
#######################################
GENERATE_SSH_ID() {
    local ssh_dir=~/.ssh

    INFO "生成密钥..."
    if [ -f "${ssh_dir}/id_rsa" ]; then
        WARNING "密钥文件已存在,跳过!"
        return
    fi

    ssh-keygen -t rsa -N '' -f "${ssh_dir}/id_rsa"
    chmod 700 "${ssh_dir}"
    # Append (never overwrite) so existing authorized keys are preserved.
    cat "${ssh_dir}/id_rsa.pub" >> "${ssh_dir}/authorized_keys"
    chmod 600 "${ssh_dir}/authorized_keys"
}
#######################################
# Select the "performance" CPU frequency governor through cpufrequtils:
# write the governor to /etc/default/cpufrequtils, then restart and enable
# the cpufrequtils service.
#######################################
SET_CPU_PERFORMANCE() {
    local action

    INFO "设置CPU Performance性能模式..."
    printf '%s\n' 'GOVERNOR="performance"' | sudo tee /etc/default/cpufrequtils
    for action in restart enable; do
        sudo systemctl "${action}" cpufrequtils
    done
}
#######################################
# Replace /etc/security/limits.conf with raised nofile / nproc / memlock /
# stack limits, written once for the "*" wildcard domain and once for root.
# The original file is saved as limits.conf_bak on the first run only.
#######################################
SET_SECURITY_LIMITS() {
    local limits_file="/etc/security/limits.conf"
    local domain limit kind

    INFO "优化文件描述符..."
    [ -f "${limits_file}_bak" ] || sudo cp "${limits_file}" "${limits_file}_bak"

    # Emit "<domain> <soft|hard> <item> <value>" for every combination and
    # hand the whole stream to tee in a single write.
    for domain in '*' root; do
        for limit in 'nofile 1000000' 'nproc 2000000' 'memlock unlimited' 'stack unlimited'; do
            for kind in soft hard; do
                printf '%s %s %s\n' "${domain}" "${kind}" "${limit}"
            done
        done
    done | sudo tee "${limits_file}" > /dev/null
}
#######################################
# Turn off unattended upgrades and put every installed versioned
# linux-headers / linux-image / linux-modules package on hold.
#######################################
DISABLE_AUTO_UPDATE() {
    local pkg

    INFO "禁用内核自动更新..."
    sudo rm -f /etc/apt/apt.conf.d/50unattended-upgrades >/dev/null 2>&1
    sudo systemctl stop unattended-upgrades.service
    sudo systemctl disable unattended-upgrades.service

    # Column 2 of "dpkg --list" is the package name.
    for pkg in $(dpkg --list \
        | grep -E 'linux-(headers|image|modules)-[0-9]' \
        | awk '{print $2}'); do
        sudo apt-mark hold $pkg
    done
}
#######################################
# Pin the GRUB default entry to the currently running kernel (uname -r)
# and regenerate the GRUB configuration.
# Arguments:
#   $1 - optional path of the GRUB defaults file (default: /etc/default/grub)
# Outputs:
#   an ERROR message when the file has no GRUB_DEFAULT= line (or is missing)
#######################################
function FIXED_KERNEL_VERSION(){
    local grub_file="${1:-/etc/default/grub}"
    local entry

    INFO "固定内核版本..."
    # grep -q is used instead of comparing a captured count: with a missing
    # file the captured count was empty and "[ $R -ne 0 ]" raised a syntax
    # error instead of reporting the problem cleanly.
    if grep -q '^GRUB_DEFAULT=' "${grub_file}" 2>/dev/null; then
        entry="Advanced options for Ubuntu>Ubuntu, with Linux $(uname -r)"
        sudo sed -i "s|^GRUB_DEFAULT=.*|GRUB_DEFAULT=\"${entry}\"|" "${grub_file}"
        sudo update-grub
    else
        ERROR "没有找到GRUB_DEFAULT相关配置!"
    fi
}
#######################################
# Set the system time zone and run a one-off clock sync with ntpdate.
# Arguments:
#   $1 - optional time zone name (default: Asia/Shanghai)
#   $2 - optional NTP server     (default: ntp.aliyun.com)
#######################################
function SET_TIME_ZONE(){
    # Locals keep the zone name from leaking into the caller's globals.
    local zone="${1:-Asia/Shanghai}"
    local ntp_server="${2:-ntp.aliyun.com}"

    INFO "设置时区为 ${zone}"
    sudo timedatectl set-timezone "${zone}"
    sudo ntpdate -u "${ntp_server}"
}
#######################################
# Disable IPv6 on all interfaces via net.ipv6.conf.all.disable_ipv6.
# Skips everything when the running kernel already reports the value 1.
# Arguments:
#   $1 - optional sysctl configuration file (default: /etc/sysctl.conf)
#######################################
function DISABLE_IPV6(){
    local conf="${1:-/etc/sysctl.conf}"
    local current

    INFO "设置禁用IPV6 ..."
    # "sysctl -n" prints just the value. It is empty when the key cannot be
    # read, so the comparison below is a quoted string test rather than an
    # unquoted integer test that would fail on an empty value.
    current="$(sudo sysctl -n net.ipv6.conf.all.disable_ipv6 2>/dev/null)"
    if [ "${current}" = "1" ]; then
        INFO "已配置禁用ipv6,跳过!"
        return 0
    fi

    # Add the setting only if the file does not contain it yet, so repeated
    # runs never accumulate duplicate lines.
    if ! grep -qE '^[[:space:]]*net\.ipv6\.conf\.all\.disable_ipv6[[:space:]]*=[[:space:]]*1[[:space:]]*$' "${conf}" 2>/dev/null; then
        echo 'net.ipv6.conf.all.disable_ipv6 = 1' | sudo tee -a "${conf}"
    fi
    sudo sysctl -p "${conf}"
}
#######################################
# Mask the systemd sleep, suspend, hibernate and hybrid-sleep targets.
#######################################
DISABLE_SLEEP() {
    local -a targets=(
        sleep.target
        suspend.target
        hibernate.target
        hybrid-sleep.target
    )

    INFO "禁用系统休眠..."
    sudo systemctl mask "${targets[@]}"
}
#######################################
# Blacklist the nouveau kernel driver and rebuild the initramfs for the
# running kernel. Nothing is done when lsmod shows no nouveau module.
# Arguments:
#   $1 - optional modprobe blacklist file
#        (default: /etc/modprobe.d/blacklist.conf)
#######################################
function DISABLE_NOUVEAU(){
    local conf="${1:-/etc/modprobe.d/blacklist.conf}"
    # Each item is an ERE used to detect an existing line; replacing "\s+"
    # with one space yields the line that gets appended.
    # "lbm-nouveau" is spelled with an ASCII hyphen: the earlier U+2010
    # typographic hyphen produced a different, non-ASCII module name.
    local -a check_items=(
        'blacklist\s+nouveau'
        'blacklist\s+lbm-nouveau'
        'options\s+nouveau\s+modeset=0'
        'alias\s+nouveau\s+off'
        'alias\s+lbm-nouveau\s+off'
    )
    local item line

    INFO "禁用系统nouveau驱动..."
    if ! lsmod | grep -q nouveau; then
        INFO "nouveau 驱动已禁用,跳过!"
        return 0
    fi

    for item in "${check_items[@]}"
    do
        # Anchored at line start so a commented-out "#blacklist nouveau"
        # is not mistaken for an active entry.
        if ! grep -qE "^[[:space:]]*${item}" "${conf}" 2>/dev/null; then
            line=${item//\\s+/ }
            printf '%s\n' "${line}" | sudo tee -a "${conf}"
        fi
    done
    sudo update-initramfs -k "$(uname -r)" -c
}
#######################################
# Ensure the sshd keep-alive, MaxStartups and AuthorizedKeysFile settings
# used by this setup are in effect, then restart the ssh service.
# Arguments:
#   $1 - optional sshd configuration file (default: /etc/ssh/sshd_config)
#######################################
function PUBLIC_KEY_LOGIN(){
    local conf="${1:-/etc/ssh/sshd_config}"
    local -a settings=(
        'ClientAliveInterval 60'
        'ClientAliveCountMax 3'
        'MaxStartups 512'
        'AuthorizedKeysFile .ssh/authorized_keys'
    )
    local setting key value

    INFO "设置密钥登录和优化配置..."
    for setting in "${settings[@]}"
    do
        key="${setting%% *}"
        value="${setting#* }"

        # Exact keyword/value comparison: a prefix regex would accept
        # "ClientAliveInterval 600" as if it were "ClientAliveInterval 60".
        if awk -v k="${key}" -v v="${value}" \
            '($1 "") == (k "") && ($2 "") == (v "") { found = 1 } END { exit !found }' \
            "${conf}"; then
            continue
        fi

        if grep -qE "^[[:space:]]*${key}[[:space:]]" "${conf}"; then
            # sshd uses the first value it reads for a keyword, so appending
            # after an existing different value would have no effect:
            # rewrite the existing line in place instead.
            sudo sed -i -E "s|^[[:space:]]*${key}[[:space:]].*|${setting}|" "${conf}"
        else
            printf '%s\n' "${setting}" | sudo tee -a "${conf}"
        fi
    done
    sudo systemctl restart ssh
}
#######################################
# Deploy chrony time synchronisation when HOST_LIST has more than one entry.
# The host whose name matches the first HOST_LIST entry becomes the time
# server for the cluster; every other host is configured as its client.
# Globals:
#   HOST_LIST (read) - entries of the form "IP hostname"
# Returns:
#   1 when chrony cannot be installed, otherwise the status of the last
#   command that ran
#######################################
function TIME_SYNC_SERVER(){
    local conf="/etc/chrony/chrony.conf"
    local master_ip master_name

    if [ "${#HOST_LIST[@]}" -le 1 ]; then
        INFO "单节点无需部署时间同步服务!"
        return 0
    fi
    master_ip="${HOST_LIST[0]%% *}"
    master_name="${HOST_LIST[0]##* }"

    INFO "部署时间同步服务..."
    # apt-get is used because apt(8) does not promise a stable interface for
    # scripts. Stop on failure: without the package there is no
    # /etc/chrony directory to write the configuration into.
    if ! sudo apt-get -y install chrony; then
        ERROR "chrony 安装失败!"
        return 1
    fi
    if [ ! -f "${conf}_bak" ]; then
        sudo cp "${conf}" "${conf}_bak"
    fi

    if [ "${master_name}" == "$(hostname)" ]; then
        # NOTE(review): 127.127.1.0 is ntpd's local-clock pseudo address;
        # confirm chrony needs it at all, since "local stratum 10" already
        # lets this host serve time without an upstream source.
        sudo tee "${conf}" >/dev/null <<EOF
server 127.127.1.0 iburst
local stratum 10
driftfile /var/lib/chrony/drift
makestep 1.0 3
rtcsync
logdir /var/log/chrony
allow all
EOF
    else
        sudo tee "${conf}" >/dev/null <<EOF
server ${master_ip} iburst
driftfile /var/lib/chrony/drift
makestep 1.0 3
rtcsync
logdir /var/log/chrony
EOF
    fi
    sudo systemctl restart chrony
    sudo systemctl enable chrony
    sudo chronyc sourcestats -v
}
#######################################
# Set up NFS file sharing when HOST_LIST has more than one entry.
# The host whose name matches the first HOST_LIST entry exports
# NFS_SHARE_PATH to every other listed host; all other hosts mount that
# export on NFS_MOUNT_PATH.
# Globals:
#   HOST_LIST (read) - entries of the form "IP hostname"
#   NFS_SHARE_PATH (read), NFS_MOUNT_PATH (read)
#######################################
function NFS_FILE_SHARE(){
    local server_ip server_name this_host entry export_line

    if [ "${#HOST_LIST[@]}" -le 1 ]; then
        INFO "单节点无需部署NFS服务!"
        return 0
    fi
    server_ip="${HOST_LIST[0]%% *}"
    server_name="${HOST_LIST[0]##* }"
    this_host="$(hostname)"

    if [ "${server_name}" == "${this_host}" ]; then
        INFO "部署NFS SERVER..."
        sudo apt-get install -y nfs-kernel-server
        if [ ! -d "${NFS_SHARE_PATH}" ]; then
            sudo mkdir -p "${NFS_SHARE_PATH}"
            sudo chmod -R 777 "${NFS_SHARE_PATH}"
        fi
        for entry in "${HOST_LIST[@]}"
        do
            if [ "${entry##* }" != "${this_host}" ]; then
                export_line="${NFS_SHARE_PATH} ${entry%% *}(rw,sync,no_subtree_check)"
                # Only add export lines that are missing, so re-running the
                # script does not pile up duplicates in /etc/exports.
                if ! grep -qxF -- "${export_line}" /etc/exports 2>/dev/null; then
                    printf '%s\n' "${export_line}" | sudo tee -a /etc/exports
                fi
            fi
        done
        sudo systemctl restart nfs-server
        sudo systemctl enable nfs-server
        sudo exportfs -arv
    else
        INFO "挂载NFS共享目录..."
        sudo apt-get install -y nfs-common
        if [ ! -d "${NFS_MOUNT_PATH}" ]; then
            sudo mkdir -p "${NFS_MOUNT_PATH}"
        fi
        # Do not try to mount again when something is already mounted there.
        if mountpoint -q "${NFS_MOUNT_PATH}"; then
            INFO "共享目录${NFS_MOUNT_PATH}已挂载,跳过!"
            return 0
        fi
        # timeout keeps an unreachable server from hanging the whole script.
        if timeout 10 sudo mount -t nfs -o rw,sync,hard,intr,timeo=5,retrans=3 \
            "${server_ip}:${NFS_SHARE_PATH}" "${NFS_MOUNT_PATH}"; then
            INFO "共享目录${NFS_MOUNT_PATH}挂载成功!"
        else
            ERROR "共享目录${NFS_MOUNT_PATH}挂载失败!"
        fi
    fi
}

# ---- Main sequence: the steps run top to bottom, in this order ----
# CONFIG_SUDOER is not defined in this script; it is expected to come from
# the common.sh file sourced at the top.
CONFIG_SUDOER
sudo apt update -y
# Called before the package installation below; it edits the needrestart
# configuration (see INSTALL_AUTO_RESTART).
INSTALL_AUTO_RESTART
# Base packages. ntpdate and cpufrequtils are used later by SET_TIME_ZONE
# and SET_CPU_PERFORMANCE respectively.
sudo apt install build-essential g++ gcc make dkms ntpdate wget sshpass cpufrequtils unzip -y
HOSTS_BIND_IP
GENERATE_SSH_ID
SET_CPU_PERFORMANCE
SET_SECURITY_LIMITS
# Step currently switched off: the DISABLE_AUTO_UPDATE call is commented out.
#DISABLE_AUTO_UPDATE
FIXED_KERNEL_VERSION
SET_TIME_ZONE
DISABLE_IPV6
DISABLE_SLEEP
DISABLE_NOUVEAU
PUBLIC_KEY_LOGIN
# The last two steps report "single node" and do nothing when HOST_LIST
# holds only one entry.
TIME_SYNC_SERVER
NFS_FILE_SHARE

公共配置文件 common.sh 的内容如下:

#!/bin/bash
#set -e

# NOTE: the first host in the list is normally the internal service node,
# e.g. the time-synchronisation server.
# Each entry is written as "<IP> <hostname>", separated by a single space.
HOST_LIST=('10.0.1.208 master' '10.0.2.234 node1')
# NFS settings read by NFS_FILE_SHARE in the main script: the directory the
# first host exports, and the directory the other hosts mount it on.
NFS_SHARE_PATH="/ssd/nfs"
NFS_MOUNT_PATH="/data"

#######################################
# Print an informational message to stdout, prefixed with a coloured "[I]"
# tag. All arguments are joined into one message; backslash escapes in the
# message are interpreted by echo -e.
#######################################
INFO() {
    local tag='\e[104m\e[97m[I]\e[49m\e[39m'
    /bin/echo -e "${tag} ${*}"
}
#######################################
# Print a warning message to stderr, prefixed with a coloured "[W]" tag.
# All arguments are joined into one message; backslash escapes in the
# message are interpreted by echo -e.
#######################################
WARNING() {
    local tag='\e[105m\e[97m[W]\e[49m\e[39m'
    /bin/echo -e "${tag} ${*}" >&2
}
#######################################
# Print an error message to stderr, prefixed with a coloured "[E]" tag.
# All arguments are joined into one message; backslash escapes in the
# message are interpreted by echo -e. The function itself does not exit.
#######################################
ERROR() {
    local tag='\e[101m\e[97m[E]\e[49m\e[39m'
    /bin/echo -e "${tag} ${*}" >&2
}
#######################################
# Give the current (non-root) user password-less sudo through a drop-in
# file /etc/sudoers.d/<user>. Does nothing for root or when the drop-in
# already exists.
# Returns:
#   0 on success or when nothing had to be done,
#   non-zero when the rule could not be validated or installed
#######################################
function CONFIG_SUDOER(){
    local user sudoers_file tmp_file rc=0

    user="$(whoami)"
    sudoers_file="/etc/sudoers.d/${user}"

    INFO "配置当前用户sudo切换免密..."
    if [ "${user}" == "root" ]; then
        INFO "当前用户为root,跳过配置!"
        return 0
    fi
    if [ -f "${sudoers_file}" ]; then
        INFO "当前账户:${user} 免密已配置,跳过!"
        return 0
    fi

    # Build the rule in a temporary file and let visudo check it first: a
    # syntactically broken file inside /etc/sudoers.d can make sudo refuse
    # to run, which would also block any attempt to repair it with sudo.
    tmp_file="$(mktemp)" || { ERROR "无法创建临时文件!"; return 1; }
    printf '%s ALL=(ALL) NOPASSWD:ALL\n' "${user}" > "${tmp_file}"
    if sudo visudo -cf "${tmp_file}" >/dev/null; then
        # Install with root ownership and mode 0440.
        sudo install -o root -g root -m 0440 "${tmp_file}" "${sudoers_file}" || rc=$?
    else
        ERROR "sudoers 配置校验失败,未写入 ${sudoers_file}!"
        rc=1
    fi
    rm -f "${tmp_file}"
    return "${rc}"
}
#######################################
# Require the nvidia-smi command and export GPU_TOTAL, the number of lines
# printed by "nvidia-smi -L".
# Globals:
#   GPU_TOTAL (written, exported)
# Exits the shell with status 1 when nvidia-smi is not available.
#######################################
CHECK_GPU_COMMAND() {
    if ! command -v "nvidia-smi" >/dev/null 2>&1; then
        ERROR "英伟达驱动未安装!"
        exit 1
    fi

    GPU_TOTAL="$(nvidia-smi -L | wc -l)"
    export GPU_TOTAL
}
#######################################
# Require the CUDA toolkit directory and export CUDA_PATH as its fully
# resolved location.
# Globals:
#   CUDA_PATH (written, exported)
# Arguments:
#   $1 - optional toolkit directory or symlink (default: /usr/local/cuda)
# Exits the shell with status 1 when the directory does not exist.
#######################################
function CHECK_CUDA_PATH(){
    local cuda_link="${1:-/usr/local/cuda}"

    if [ ! -d "${cuda_link}" ]; then
        ERROR "检测没有找到 cuda toolkit 目录,请先安装 cuda toolkit !"
        exit 1
    fi

    # readlink -f follows the whole symlink chain and also works for a real
    # directory. Parsing "ls -l" returned only the first hop of the link,
    # and one field per directory entry when the path was not a symlink.
    CUDA_PATH="$(readlink -f "${cuda_link}")"
    export CUDA_PATH
}
#######################################
# Require a non-empty PASSWORD variable.
# Globals:
#   PASSWORD (read)
# Exits the shell with status 1 when PASSWORD is empty or unset.
#######################################
CHECK_PASSWD() {
    [ -n "${PASSWORD}" ] && return 0

    ERROR "请先设置密码 PASSWORD 到环境变量!"
    exit 1
}

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://sulao.cn/post/1138

评论列表

0%