nccl-tests多机多卡测试环境一键编译部署脚本

此前我们做了一个单机多卡的nccl-tests部署测试脚本,多机多卡的场景我们也需要经常测试,所以这次记录了一个自己用的多机多卡openmpi+nccl+nccl-tests编译安装脚本。注意脚本本身不执行测试,编译完成以后会打印出多机测试命令,只需要进行简单修改然后执行即可。单机多卡的可以查看这个笔记:https://sulao.cn/post/1125

需要将如下源码包和脚本(包括脚本依赖的公共配置文件 common.sh)放在同一目录下,确保多机集群做好免密(openmpi多机测试单向免密即可),并确保各机器的cuda库和驱动都已经安装好。

nccl-master.zip
nccl-tests-master.zip
openmpi-4.1.8.tar.gz

脚本内容如下:

#!/bin/bash
#set -xe

# Absolute directory containing this script. Every path used by the build
# functions below is derived from it. Quoted so that a directory name with
# whitespace does not get word-split.
CURRENT_PATH="$(readlink -f -- "$(dirname -- "$0")")"

# Load the shared configuration/helpers that must sit next to this script.
# The functions below use names that are not defined in this file (INFO,
# WARNING, ERROR, CUDA_PATH, HOST_LIST, GPU_TOTAL, CHECK_* ...) — presumably
# they are provided by common.sh.
if [[ -f "${CURRENT_PATH}/common.sh" ]]; then
    # shellcheck source=/dev/null
    . "${CURRENT_PATH}/common.sh"
else
    # Diagnostics belong on stderr.
    echo "无法找到公共配置文件!" >&2
    exit 1
fi

#######################################
# Build NCCL from ${CURRENT_PATH}/nccl-master into ${CURRENT_PATH}/nccl.
# The source tree is extracted from nccl-master.zip when it is missing.
# Globals:
#   CURRENT_PATH (read), CUDA_PATH (read)
#   COMPUTE_CAP, COMPUTE_SM (written)
#   LD_LIBRARY_PATH, PATH (prepended and exported on success)
# Arguments: none
# Returns:   exits the script with status 1 on any fatal error
#######################################
function BUILD_NCCL(){
    # Compute capability of the FIRST GPU only (e.g. "8.6"); the build targets
    # that single architecture.
    COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | tr -d '[:space:]')
    if [[ ! "${COMPUTE_CAP}" =~ ^[0-9]+\.[0-9]$ ]]; then
        ERROR "无法获取GPU算力: '${COMPUTE_CAP}'"
        exit 1
    fi
    # "8.6" -> "86": dropping the dot equals multiplying by ten for a
    # one-decimal value, and avoids depending on bc being installed.
    COMPUTE_SM=${COMPUTE_CAP/./}
    INFO "当前算力:${COMPUTE_SM}"

    cd "${CURRENT_PATH}" || { ERROR "无法进入目录 ${CURRENT_PATH}"; exit 1; }
    if [[ ! -d "${CURRENT_PATH}/nccl-master" ]]; then
        WARNING "${CURRENT_PATH}/nccl-master 目录不存在,开始解压!"
        # unzip's status is not checked (it is non-zero for mere warnings);
        # the cd guard below catches a failed extraction.
        unzip "${CURRENT_PATH}/nccl-master.zip"
    fi
    # Without this guard a failed extraction would run make in the wrong directory.
    cd "${CURRENT_PATH}/nccl-master" || { ERROR "无法进入目录 ${CURRENT_PATH}/nccl-master"; exit 1; }

    mkdir -p "${CURRENT_PATH}/nccl"
    if [[ -d "${CURRENT_PATH}/nccl/lib" ]]; then
        INFO "检测到编译路径 ${CURRENT_PATH}/nccl/lib 存在,开始清理编译文件!"
        # BUILDDIR must match the one used for the build; otherwise make
        # cleans its default build directory and leaves the stale output.
        make clean BUILDDIR="${CURRENT_PATH}/nccl"
    fi

    INFO "开始编译 nccl..."
    if make -j"$(nproc)" src.build BUILDDIR="${CURRENT_PATH}/nccl" CUDA_HOME="${CUDA_PATH}" NVCC_GENCODE="-gencode=arch=compute_${COMPUTE_SM},code=sm_${COMPUTE_SM}"; then
        INFO "nccl 编译完成!"
    else
        ERROR "nccl 编译失败!"
        exit 1
    fi
    export LD_LIBRARY_PATH="${CURRENT_PATH}/nccl/lib:${LD_LIBRARY_PATH}"
    export PATH="${CURRENT_PATH}/nccl/bin:${PATH}"
}
#######################################
# Configure, build and install Open MPI (with CUDA support) from the single
# openmpi-*.tar.gz found in ${CURRENT_PATH} into ${CURRENT_PATH}/openmpi.
# Globals:
#   CURRENT_PATH (read), CUDA_PATH (read)
#   OPENMPI_TARG (written: tarball file name)
#   LD_LIBRARY_PATH (prepended and exported on success)
# Arguments: none
# Returns:   exits the script with status 1 when any step fails
#######################################
function BUILD_OPENMPI(){
    cd "${CURRENT_PATH}" || { ERROR "无法进入目录 ${CURRENT_PATH}"; exit 1; }

    # Use a glob instead of parsing ls, and require exactly one tarball so the
    # derived source-directory name is unambiguous.
    local tarballs=( "${CURRENT_PATH}"/openmpi-*.tar.gz )
    if [[ ${#tarballs[@]} -ne 1 || ! -f "${tarballs[0]}" ]]; then
        ERROR "${CURRENT_PATH} 下需要且只能有一个 openmpi-*.tar.gz 源码包!"
        exit 1
    fi
    OPENMPI_TARG=${tarballs[0]##*/}
    # "openmpi-4.1.8.tar.gz" -> "openmpi-4.1.8"
    local src_dir="${CURRENT_PATH}/${OPENMPI_TARG%%.tar*}"

    if [[ ! -d "${src_dir}" ]]; then
        WARNING "${src_dir} 目录不存在,开始解压!"
        tar -zxvf "${CURRENT_PATH}/${OPENMPI_TARG}"
    fi
    # Without this guard a failed extraction would run configure/make elsewhere.
    cd "${src_dir}" || { ERROR "无法进入目录 ${src_dir}"; exit 1; }

    mkdir -p "${CURRENT_PATH}/openmpi"
    if [[ -d "${CURRENT_PATH}/openmpi/lib" ]]; then
        INFO "检测到编译路径 ${CURRENT_PATH}/openmpi/lib 存在,开始清理编译文件!"
        # Best effort: fails harmlessly when the tree was never configured.
        make clean
    fi

    # Every stage is checked so that a configure or compile failure is reported
    # where it happens instead of surfacing later at "make install".
    INFO "开始配置 openmpi..."
    if ! ./configure --prefix="${CURRENT_PATH}/openmpi" --with-cuda="${CUDA_PATH}"; then
        ERROR "openmpi 配置失败!"
        exit 1
    fi
    INFO "开始编译 openmpi..."
    if ! make -j"$(nproc)"; then
        ERROR "openmpi 编译失败!"
        exit 1
    fi
    INFO "开始安装 openmpi..."
    if make install; then
        INFO "openmpi 编译完成!"
    else
        ERROR "openmpi 编译失败!"
        exit 1
    fi
    export LD_LIBRARY_PATH="${CURRENT_PATH}/openmpi/lib:${LD_LIBRARY_PATH}"
}
#######################################
# Build nccl-tests with MPI support in ${CURRENT_PATH}/nccl-tests-master,
# against the Open MPI and NCCL trees located under ${CURRENT_PATH}.
# The source tree is extracted from nccl-tests-master.zip when it is missing.
# Globals:   CURRENT_PATH (read), CUDA_PATH (read)
# Arguments: none
# Returns:   exits the script with status 1 on any fatal error
#######################################
function BUILD_NCCL_TESTS(){
    local src_dir="${CURRENT_PATH}/nccl-tests-master"

    cd "${CURRENT_PATH}" || { ERROR "无法进入目录 ${CURRENT_PATH}"; exit 1; }
    if [[ ! -d "${src_dir}" ]]; then
        WARNING "${src_dir} 目录不存在,开始解压!"
        # unzip's status is not checked (it is non-zero for mere warnings);
        # the cd guard below catches a failed extraction.
        unzip "${CURRENT_PATH}/nccl-tests-master.zip"
    fi
    # Without this guard a failed extraction would run make in ${CURRENT_PATH}.
    cd "${src_dir}" || { ERROR "无法进入目录 ${src_dir}"; exit 1; }

    if [[ -d "${src_dir}/build" ]]; then
        INFO "检测到编译路径 ${src_dir}/build 存在,开始清理编译文件!"
        make clean
    fi

    INFO "开始编译 nccl-tests..."
    if make MPI=1 MPI_HOME="${CURRENT_PATH}/openmpi" CUDA_HOME="${CUDA_PATH}" NCCL_HOME="${CURRENT_PATH}/nccl"; then
        INFO "nccl-tests 编译完成!"
    else
        ERROR "nccl-tests 编译失败!"
        exit 1
    fi
}
#######################################
# Print a ready-to-edit mpirun command line for a multi-node all_reduce_perf
# run. Nothing is executed; the command is only written to stdout.
# Globals:
#   CURRENT_PATH (read)
#   HOST_LIST    (read: array; the text before the first space of each
#                 element is used as the host address)
#   GPU_TOTAL    (read: slots per host)
#   NET_DEVICE, PROCESS_NUM, SLOT, RATE, MLN_LIST (written)
# Arguments: none
# Outputs:   the mpirun command on stdout
#######################################
function GENERATE_TEST_COMMAND(){
    cd "${CURRENT_PATH}" || { ERROR "无法进入目录 ${CURRENT_PATH}"; exit 1; }

    # Find the local interface that carries the address of the first host.
    # The address is matched as a fixed whole word, so 10.0.0.1 does not match
    # 10.0.0.10 and dots are not treated as regex wildcards.
    local first_host_ip=${HOST_LIST[0]%% *}
    local detected_dev=""
    if [[ -n "${first_host_ip}" ]]; then
        detected_dev=$(ip a | grep -E 'mtu|inet' | grep -F -w -B 1 -- "${first_host_ip}" | head -n 1 | awk -F ': ' '{print $2}')
    fi
    if [[ -n "${detected_dev}" ]]; then
        # The result must land in NET_DEVICE, the name the command template
        # below expands; storing it under another name leaves the
        # NCCL_SOCKET_IFNAME / btl_tcp_if_include values empty.
        NET_DEVICE=${detected_dev}
    else
        WARNING "未能根据 ${first_host_ip} 检测到网卡,请手动填写 NCCL_SOCKET_IFNAME 和 btl_tcp_if_include!"
    fi

    PROCESS_NUM=$(( ${#HOST_LIST[@]} * GPU_TOTAL ))
    INFO "请修改以下命令进行多机多卡测试,如果是ROCE网络注意修改GID!"

    # Build "host1:N,host2:N,..."; the leading comma is dropped at use.
    SLOT=""
    for HOST in "${HOST_LIST[@]}"
    do
        SLOT="${SLOT},${HOST%% *}:${GPU_TOTAL}"
    done

    # Highest link rate reported by ibstatus; only the devices running at that
    # rate are listed in NCCL_IB_HCA.
    RATE=$(ibstatus | grep rate | awk '{print $2}' | sort -nr | head -n 1)
    MLN_LIST=""
    if [[ -n "${RATE}" ]]; then
        # The "device" header is at most 6 lines above its "rate" line in the
        # ibstatus output. Entries are emitted as 'name':port.
        MLN_LIST=$(ibstatus | grep -E -B 6 "(^|[[:space:]])${RATE}[[:space:]]+Gb/sec" | grep device | awk '{print $3":"$5}' | paste -sd, -)
    fi
    # ibstatus prints device names wrapped in single quotes; strip them.
    local hca_list
    hca_list=${MLN_LIST//"'"/}

    # printf '%s' prints the text verbatim (echo -e would also interpret any
    # backslash sequence that happens to appear in a path).
    printf '%s\n' "${CURRENT_PATH}/openmpi/bin/mpirun -np ${PROCESS_NUM} \\
    -H ${SLOT:1} \\
    --allow-run-as-root -bind-to numa -map-by slot \\
    -x NCCL_DEBUG=INFO \\
    -x NCCL_ALGO=Ring \\
    -x NCCL_MAX_NCHANNELS=16 \\
    -x NCCL_MIN_NCHANNELS=16 \\
    -x NCCL_IB_HCA=${hca_list} \\
    -x NCCL_IB_GID_INDEX=3 \\
    -x NCCL_IB_DISABLE=0 \\
    -x NCCL_IB_RETRY_CNT=7 \\
    -x NCCL_IB_TIMEOUT=23 \\
    -x NCCL_SOCKET_IFNAME=${NET_DEVICE} \\
    -x NCCL_NET_GDR_LEVEL=2 \\
    -x NCCL_IB_QPS_PER_CONNECTION=4 \\
    -x NCCL_IB_TC=160 \\
    -x NCCL_CHECKS_DISABLE=1 \\
    -x LD_LIBRARY_PATH=${CURRENT_PATH}/openmpi/lib:${CURRENT_PATH}/nccl/lib:\$LD_LIBRARY_PATH \\
    -x PATH=\$PATH \\
    -mca coll_hcoll_enable 0 \\
    -mca pml ob1 \\
    -mca btl_tcp_if_include ${NET_DEVICE} \\
    -mca btl ^openib \\
    ${CURRENT_PATH}/nccl-tests-master/build/all_reduce_perf -b 1M -e 4G -f 2 -g 1"
}

# Entry point: run the pre-flight checks, then build each component in
# dependency order, and finally print the multi-node test command.
# CHECK_GPU_COMMAND and CHECK_CUDA_PATH are not defined in this script —
# presumably they come from the sourced common.sh (verify there).
CHECK_GPU_COMMAND
CHECK_CUDA_PATH
# nccl-tests is built against both the NCCL and Open MPI trees produced by the
# two preceding steps, so it has to come after them.
BUILD_NCCL
BUILD_OPENMPI
BUILD_NCCL_TESTS
# Only prints the mpirun command; it does not launch the test.
GENERATE_TEST_COMMAND

上述脚本执行完成以后会自动生成多机多卡nccl-tests测试命令,我们只需要核对命令,然后进行执行即可,注意每个节点都需要执行此脚本进行环境的编译配置。

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://sulao.cn/post/1136

评论列表

0%