nccl-tests英伟达GPU单机多卡一键测试脚本

由于经常需要做测试,所以写了一个测试单机多卡的bash脚本。使用前提:环境中已经安装nvidia驱动和cuda库,且cuda库安装在默认目录/usr/local/cuda下。nccl使用的是下载的zip包,名字是nccl-master.zip,nccl-tests也是下载的zip包,名字是nccl-tests-master.zip,这两个包的名字在脚本中写死了。另外,脚本还会调用同目录下的cublasMatmulBench可执行文件来做cuBLAS算力测试。将下面的内容保存为脚本,把前面提到的文件和脚本放在同一目录,然后使用bash命令执行即可。

脚本内容如下:

#!/bin/bash
# Single-node multi-GPU test driver: cuBLAS matmul benchmark per GPU, then a
# local NCCL + nccl-tests build and collective-communication runs.
#
# Abort on the first unhandled command failure.
set -e

# Absolute directory that contains this script. Every expansion is quoted so
# that a script path containing spaces or glob characters still resolves.
CURRENT_PATH=$(readlink -f -- "$(dirname -- "$0")")
# Per-host output directory, so results from several machines can share a
# common parent directory without overwriting each other.
RESULT_PATH="${CURRENT_PATH}/result_$(hostname)"

# Print an informational message on stdout.
# The "[INFO]" badge is drawn with ANSI SGR codes 104/97 and the colors are
# reset with 49/39 before the message text.
# Arguments: $* - message words, joined by the first character of IFS.
INFO() {
    local badge="\e[104m\e[97m[INFO]\e[49m\e[39m"
    /bin/echo -e "${badge} ${*}"
}

# Print a warning message on stderr.
# The "[WARNING]" badge is drawn with ANSI SGR codes 101/97 and the colors are
# reset with 49/39 before the message text.
# Arguments: $* - message words, joined by the first character of IFS.
WARNING() {
    local badge="\e[101m\e[97m[WARNING]\e[49m\e[39m"
    /bin/echo -e "${badge} ${*}" >&2
}

# Print an error message on stderr. Does not terminate the script; callers
# decide whether to exit afterwards.
# The "[ERROR]" badge is drawn with ANSI SGR codes 101/97 and the colors are
# reset with 49/39 before the message text.
# Arguments: $* - message words, joined by the first character of IFS.
ERROR() {
    local badge="\e[101m\e[97m[ERROR]\e[49m\e[39m"
    /bin/echo -e "${badge} ${*}" >&2
}

# Verify that nvidia-smi is usable and count the GPUs it reports.
# Globals:  GPU_TOTAL (written, exported) - number of GPUs listed.
# Outputs:  error messages through ERROR.
# Exits:    1 when nvidia-smi is missing, fails to run, or lists no GPU.
CHECK_GPU_COMMAND() {
    local gpu_list

    if ! command -v nvidia-smi >/dev/null 2>&1; then
        ERROR "英伟达驱动未安装!"
        exit 1
    fi

    # Capture the listing separately from the counting so that a failing
    # nvidia-smi (e.g. driver/library mismatch) is detected instead of its
    # diagnostic text being counted as a GPU.
    if ! gpu_list=$(nvidia-smi -L); then
        ERROR "nvidia-smi 执行失败,请检查英伟达驱动!"
        exit 1
    fi

    # Count only lines that start with "GPU "; indented sub-device lines
    # (such as MIG instances) must not inflate the total.
    # grep -c exits 1 on zero matches, hence the `|| true` under `set -e`.
    GPU_TOTAL=$(grep -c '^GPU ' <<< "${gpu_list}" || true)
    if (( GPU_TOTAL < 1 )); then
        ERROR "没有检测到可用的 GPU!"
        exit 1
    fi
    export GPU_TOTAL
}

# Locate the CUDA toolkit and export its resolved absolute path.
# Arguments: $1 - optional toolkit directory or symlink
#                 (default: /usr/local/cuda).
# Globals:   CUDA_PATH (written, exported) - canonical toolkit directory.
# Exits:     1 when the directory does not exist.
CHECK_CUDA_PATH() {
    local cuda_root=${1:-/usr/local/cuda}

    if [[ ! -d ${cuda_root} ]]; then
        ERROR "检测没有找到 cuda toolkit 目录,请先安装 cuda toolkit !"
        exit 1
    fi

    # readlink -f resolves the symlink chain to an absolute path. Parsing
    # `ls -l` output instead yields a relative path for relative symlink
    # targets and a multi-line listing when the path is a real directory.
    CUDA_PATH=$(readlink -f -- "${cuda_root}")
    export CUDA_PATH
}

# Determine the SM (compute capability) number of GPU 0, e.g. 80 or 90, for
# use in the nvcc -gencode flags of the NCCL build.
#
# Detection order:
#   1. Ask the driver: `nvidia-smi --query-gpu=compute_cap` ("8.0" -> 80).
#   2. Fall back to a product-name table when that query is unavailable.
#   3. Fall back to a COMPUTE_SM value already present in the environment.
#
# Globals:  COMPUTE_SM (read as last-resort fallback; written, exported).
# Outputs:  the detected value through INFO, failures through ERROR.
# Exits:    1 when no compute capability can be determined.
CHECK_GPU_SM() {
    local cap="" sm="" gpu_name="" entry name_re
    # "<sm>:<alternation of model tokens>", same models as before.
    local -a sm_table=(
        "80:A100|A800"
        "86:3090"
        "89:4090D?"
        "90:H100|H800|H200|H20"
        "100:G?B200"
        "120:5090D?"
    )

    # The query fails on drivers that do not know the compute_cap field;
    # swallow that failure here and use the name table instead.
    cap=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 2>/dev/null) || cap=""
    cap=${cap//[[:space:]]/}

    if [[ ${cap} =~ ^[0-9]+\.[0-9]+$ ]]; then
        sm=${cap/./}
    else
        gpu_name=$(nvidia-smi -q -i 0 | awk -F ': ' '/Product Name/ && !seen++ {print $2}')
        for entry in "${sm_table[@]}"; do
            # Match the model as a whole token anywhere in the name, so
            # "NVIDIA A100-SXM4-80GB" and "NVIDIA H100 80GB HBM3" are
            # recognized while "RTX A1000" is not mistaken for an A100.
            name_re="(^|[^[:alnum:]])(${entry#*:})([^[:alnum:]]|\$)"
            if [[ ${gpu_name} =~ ${name_re} ]]; then
                sm=${entry%%:*}
            fi
        done
    fi

    # Keep honoring a value supplied by the caller's environment when
    # automatic detection found nothing.
    if [[ -z ${sm} ]]; then
        sm=${COMPUTE_SM-}
    fi

    if [[ -z ${sm} ]]; then
        ERROR "没有获取到算力,请检查!"
        exit 1
    fi

    export COMPUTE_SM=${sm}
    INFO "当前算力:${COMPUTE_SM}"
}

# Run the cublasMatmulBench binary once per precision mode on every GPU and
# append one summary line per GPU to ${RESULT_PATH}/cublas.log.
#
# For each run the value recorded is the 10th whitespace-separated field of
# the output line(s) containing "CUDA" (case-insensitive).
# NOTE(review): presumably that field is the measured throughput - confirm
# against the benchmark's actual output format.
#
# Globals:  GPU_TOTAL (read), RESULT_PATH (read), CURRENT_PATH (read; the
#           binary is looked up there, falling back to the working directory
#           when CURRENT_PATH is unset).
# Outputs:  progress through INFO; results appended to cublas.log.
CUBLAS_BENCHMARK_TEST() {
    local bench="${CURRENT_PATH:-.}/cublasMatmulBench"
    # "<log label>:<value for -P>:<value for -T>"
    local -a bench_cases=(
        "FP64:ddd:8"
        "FP32:sss:500"
        "TF32:sss_fast_tf32:500"
        "HHH:hhh:1000"
        "HSH:hsh:1000"
        "HSS:hss:1000"
        "BF16:tst:1000"
        "FP8:qqssq:1000"
    )
    local gpu bench_case label precision iterations value row

    chmod +x "${bench}" || exit 1
    INFO "总共有 ${GPU_TOTAL} 张GPU卡!"

    for (( gpu = 0; gpu < GPU_TOTAL; gpu++ )); do
        INFO "开始测试第 ${gpu} 号卡..."
        row="GPU ${gpu}"
        for bench_case in "${bench_cases[@]}"; do
            IFS=: read -r label precision iterations <<< "${bench_case}"
            # CUDA_VISIBLE_DEVICES is set for this one process only. Exporting
            # it in the shell would leave the last GPU index in the
            # environment and hide every other GPU from later multi-GPU steps.
            value=$(CUDA_VISIBLE_DEVICES=${gpu} "${bench}" -P="${precision}" \
                -m=15360 -n=18176 -k=8192 -T="${iterations}" -ta=1 -B=0 -p=0 \
                | grep -i CUDA | awk '{print $10}')
            row+=$' \t| '"${label}: ${value}"
        done
        printf '%s\n' "${row}" >> "${RESULT_PATH}/cublas.log"
        INFO "第 ${gpu} 号卡测试完成!"
    done
}

# Build NCCL and nccl-tests from the zip archives next to the script.
#
# Steps: unpack nccl-master.zip if needed, build NCCL into
# ${CURRENT_PATH}/nccl for the detected SM only, put that build on
# LD_LIBRARY_PATH/PATH, then unpack and build nccl-tests against it.
#
# Globals:  CURRENT_PATH, CUDA_PATH, COMPUTE_SM (read);
#           LD_LIBRARY_PATH, PATH (extended, exported);
#           NCCL_TESTS_PATH (written, exported) - nccl-tests source tree.
# Outputs:  progress through INFO/WARNING, failures through ERROR.
# Exits:    1 when unpacking or either build fails.
# Note:     leaves the working directory inside the nccl-tests tree.
NCCL_TESTS_BUILD() {
    local nccl_src="${CURRENT_PATH}/nccl-master"
    local nccl_out="${CURRENT_PATH}/nccl"
    local tests_src="${CURRENT_PATH}/nccl-tests-master"

    INFO "开始编译 nccl 和 nccl-tests 测试工具!"

    if [[ ! -d ${nccl_src} ]]; then
        WARNING "${nccl_src} 目录不存在,开始解压!"
        # Extract next to the script, not into whatever the cwd happens to be.
        if ! unzip "${CURRENT_PATH}/nccl-master.zip" -d "${CURRENT_PATH}"; then
            ERROR "nccl-master.zip 解压失败!"
            exit 1
        fi
    fi
    cd "${nccl_src}" || exit 1
    mkdir -p "${nccl_out}"
    if [[ -d ${nccl_out}/lib ]]; then
        INFO "检测到编译路径 ${nccl_out}/lib 存在,开始清理编译文件!"
        # Pass BUILDDIR so the custom output directory is the one cleaned.
        make clean BUILDDIR="${nccl_out}"
    fi
    # Test make directly: under `set -e` a separate `$?` check afterwards
    # would never run on failure, so the error message would be lost.
    if make -j"$(nproc)" src.build BUILDDIR="${nccl_out}" CUDA_HOME="${CUDA_PATH}" \
        NVCC_GENCODE="-gencode=arch=compute_${COMPUTE_SM},code=sm_${COMPUTE_SM}"; then
        INFO "nccl 编译完成!"
    else
        ERROR "nccl 编译失败!"
        exit 1
    fi
    # Append the previous value only when it is non-empty; a trailing ":"
    # would add the current directory to the library search path.
    export LD_LIBRARY_PATH="${nccl_out}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    export PATH="${nccl_out}/bin:${PATH}"

    cd "${CURRENT_PATH}" || exit 1
    if [[ ! -d ${tests_src} ]]; then
        WARNING "${tests_src} 目录不存在,开始解压!"
        if ! unzip "${CURRENT_PATH}/nccl-tests-master.zip" -d "${CURRENT_PATH}"; then
            ERROR "nccl-tests-master.zip 解压失败!"
            exit 1
        fi
    fi
    cd "${tests_src}" || exit 1
    if [[ -d ${tests_src}/build ]]; then
        INFO "检测到编译路径 ${tests_src}/build 存在,开始清理编译文件!"
        make clean
    fi
    if make CUDA_HOME="${CUDA_PATH}" NCCL_HOME="${nccl_out}"; then
        INFO "nccl-tests 编译完成!"
    else
        ERROR "nccl-tests 编译失败!"
        exit 1
    fi
    export NCCL_TESTS_PATH="${tests_src}"
}

# Run the nccl-tests collectives (all_reduce, all_gather, alltoall) across all
# GPUs and store each tool's stdout in ${RESULT_PATH}/<tool>.log.
#
# A failing tool is reported through ERROR and the remaining tools still run;
# the failure is reflected in the return status.
#
# Globals:  NCCL_TESTS_PATH, RESULT_PATH, GPU_TOTAL (read).
# Outputs:  progress through INFO, failures through ERROR.
# Returns:  0 when every tool succeeded, 1 otherwise.
NCCL_COMM_TESTS() {
    local -a perf_tests=(all_reduce_perf all_gather_perf alltoall_perf)
    local perf_test
    local failed=0
    local first=1

    cd "${NCCL_TESTS_PATH}" || return 1

    for perf_test in "${perf_tests[@]}"; do
        # Pause between consecutive runs (not before the first one).
        if (( first )); then
            first=0
        else
            sleep 5s
        fi

        INFO "开始测试: ./build/${perf_test} -b 8 -e 4G -f 2 -g ${GPU_TOTAL}"
        # Test the command directly: under `set -e` a separate `$?` check
        # would never be reached when the tool fails.
        if "./build/${perf_test}" -b 8 -e 4G -f 2 -g "${GPU_TOTAL}" \
            > "${RESULT_PATH}/${perf_test}.log"; then
            INFO "${perf_test} 测试完成!"
        else
            ERROR "${perf_test} 测试失败,请检查安装配置!"
            failed=1
        fi
    done

    return "${failed}"
}

# ---- Entry point ----------------------------------------------------------
# Install unzip only when it is missing, so hosts that already have it do not
# need root privileges or network access for this step.
if ! command -v unzip >/dev/null 2>&1; then
    apt-get install -y unzip
fi
INFO "当前路径:${CURRENT_PATH}"

# Pick up environment settings from the login/profile scripts. They are not
# written with `set -e` in mind, so errexit is suspended while they are
# sourced, and files that do not exist are skipped.
set +e
if [[ -r /etc/profile ]]; then
    source /etc/profile
fi
if [[ -r ~/.bashrc ]]; then
    source ~/.bashrc
fi
set -e

mkdir -p "${RESULT_PATH}"

# Order matters: detection steps export the variables that the benchmark,
# build and communication tests read.
CHECK_GPU_COMMAND
CHECK_CUDA_PATH
CHECK_GPU_SM
CUBLAS_BENCHMARK_TEST
NCCL_TESTS_BUILD
NCCL_COMM_TESTS

将上面的脚本内容保存为single_nccl_test.sh文件,然后执行以下命令:

bash single_nccl_test.sh

测试结果会写入脚本所在目录下的result_$(hostname)目录中。

202508272103537538358890.png

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://sulao.cn/post/1125

评论列表

0%