nccl-tests英伟达GPU单机多卡一键测试脚本

由于经常需要做测试,所以写了一个用于单机多卡测试的 bash 脚本。使用前提:环境中已安装 NVIDIA 驱动和 CUDA 库,且 CUDA 安装在默认目录 /usr/local/ 下。nccl 和 nccl-tests 均使用下载的 zip 源码包,文件名分别为 nccl-master.zip 和 nccl-tests-master.zip(这两个文件名在脚本中写死了)。将下面的内容保存为脚本文件,并把上述两个 zip 包放在与脚本相同的目录下,然后使用 bash 命令执行。

脚本内容如下:

#!/bin/bash
# Single-node, multi-GPU NCCL test driver.
# Abort on the first unhandled command failure.
set -e

# Absolute directory that contains this script. Every expansion is quoted so
# that a script path containing spaces still resolves correctly.
# Assignment and export are split so a readlink failure is not masked by the
# always-successful `export` builtin.
CURRENT_PATH=$(readlink -f "$(dirname "$0")")
export CURRENT_PATH

# Per-host output directory for the result logs.
RESULT_PATH="${CURRENT_PATH}/result_$(hostname)"
export RESULT_PATH

#######################################
# Verify that nvidia-smi is usable and count the GPUs it lists.
# Globals:
#   GPU_TOTAL (written, exported) - number of "GPU N: ..." lines printed by
#                                   `nvidia-smi -L`
# Outputs:
#   An error message on stderr when the check fails.
# Exits:
#   1 when nvidia-smi is missing, fails to run, or lists no GPU.
#######################################
function CHECK_GPU_COMMAND(){
    local gpu_list

    if ! command -v "nvidia-smi" >/dev/null 2>&1; then
        echo "英伟达驱动未安装!" >&2
        exit 1
    fi

    # Capture the listing first so a failing nvidia-smi (e.g. driver not
    # loaded) is detected instead of having its error text counted as GPUs.
    if ! gpu_list=$(nvidia-smi -L); then
        echo "nvidia-smi 执行失败,请检查驱动!" >&2
        exit 1
    fi

    # Count only lines that start with "GPU "; indented sub-device lines
    # (such as MIG instances) and any other text are ignored.
    # grep -c exits 1 on zero matches, hence the `|| true`.
    GPU_TOTAL=$(grep -c '^GPU ' <<<"${gpu_list}" || true)
    if (( GPU_TOTAL == 0 )); then
        echo "未检测到GPU卡!" >&2
        exit 1
    fi
    export GPU_TOTAL
}

#######################################
# Determine the compute capability number (e.g. 80, 89, 120) of GPU 0, which
# is later used to build the NVCC -gencode flags.
#
# Detection order:
#   1. Ask the driver directly via `nvidia-smi --query-gpu=compute_cap`
#      ("8.9" becomes 89). Older nvidia-smi builds that do not know this
#      field fail the query and fall through to step 2.
#   2. Look the product name up in a built-in model table. Model tokens are
#      matched as whole words, so "NVIDIA A100-SXM4-80GB" matches A100 while
#      "NVIDIA RTX A1000" does not.
#   3. If neither works, a non-empty COMPUTE_SM already present in the
#      environment is kept as a manual override.
#
# Globals:
#   GPU_NAME   (written, exported) - "Product Name" of GPU 0
#   COMPUTE_SM (written, exported) - detected compute capability number
# Outputs:
#   The detected value on stdout; an error message on stderr on failure.
# Exits:
#   1 when the compute capability cannot be determined.
#######################################
function CHECK_GPU_SM(){
    local cap entry pattern detected=""
    # Each entry is "<sm>:<ERE alternation of model tokens>".
    local -a sm_table=(
        "80:A100|A800"
        "86:3090"
        "89:4090|4090D"
        "90:H100|H800|H200|H20"
        "100:B200|GB200"
        "120:5090|5090D"
    )

    GPU_NAME=$(nvidia-smi -q -i 0 | grep 'Product Name' | awk -F ': ' '{print $2}')
    export GPU_NAME

    # Preferred path: let the driver report the capability itself.
    cap=$(nvidia-smi -i 0 --query-gpu=compute_cap --format=csv,noheader 2>/dev/null) || cap=""
    cap=${cap//[[:space:]]/}
    if [[ ${cap} =~ ^[0-9]+\.[0-9]+$ ]]; then
        detected=${cap/./}
    else
        # Fallback: whole-word match of known model tokens in the name.
        for entry in "${sm_table[@]}"; do
            pattern="(^|[^[:alnum:]])(${entry#*:})(\$|[^[:alnum:]])"
            if [[ ${GPU_NAME} =~ ${pattern} ]]; then
                detected=${entry%%:*}
            fi
        done
    fi

    if [[ -n ${detected} ]]; then
        COMPUTE_SM=${detected}
    fi
    if [[ -z "${COMPUTE_SM:-}" ]]; then
        echo "没有获取到算力,请检查!" >&2
        exit 1
    fi
    echo "当前算力:${COMPUTE_SM}"
    export COMPUTE_SM
}

#######################################
# Run cublasMatmulBench once on a single GPU and print the 10th
# whitespace-separated field of every output line containing "CUDA"
# (presumably the measured throughput - confirm against the tool's output).
# CUDA_VISIBLE_DEVICES is set for this one invocation only, so the caller's
# environment is left untouched.
# Arguments:
#   $1 - path to the cublasMatmulBench binary
#   $2 - GPU index to expose to the benchmark
#   $3 - value for the -P option
#   $4 - value for the -T option
#######################################
function _CUBLAS_RUN_ONE(){
    CUDA_VISIBLE_DEVICES="$2" "$1" -P="$3" -m=15360 -n=18176 -k=8192 \
        -T="$4" -ta=1 -B=0 -p=0 | grep -i CUDA | awk '{print $10}'
}

#######################################
# Benchmark every GPU with cublasMatmulBench in eight -P modes and append
# one summary line per GPU to ${RESULT_PATH}/cublas.log.
# The binary is always taken from ${CURRENT_PATH}, independent of the
# caller's working directory.
# Globals:
#   CURRENT_PATH (read)              - directory holding cublasMatmulBench
#   RESULT_PATH  (read)              - directory receiving cublas.log
#   GPU_TOTAL    (written, exported) - number of GPUs listed by nvidia-smi
#######################################
function CUBLAS_BENCHMARK_TEST(){
    local bench="${CURRENT_PATH}/cublasMatmulBench"
    local i fp64 fp32 tf32 hhh hsh hss bf16 fp8

    chmod +x "${bench}"
    mkdir -p "${RESULT_PATH}"

    # Count only "GPU N: ..." lines; grep -c exits 1 on zero matches.
    GPU_TOTAL=$(nvidia-smi -L | grep -c '^GPU ' || true)
    export GPU_TOTAL
    echo "总共有 ${GPU_TOTAL} 张GPU卡!"

    for ((i = 0; i < GPU_TOTAL; i++)); do
        echo "开始测试第 $i 号卡..."
        fp64=$(_CUBLAS_RUN_ONE "${bench}" "$i" ddd 8)
        fp32=$(_CUBLAS_RUN_ONE "${bench}" "$i" sss 500)
        tf32=$(_CUBLAS_RUN_ONE "${bench}" "$i" sss_fast_tf32 500)
        hhh=$(_CUBLAS_RUN_ONE "${bench}" "$i" hhh 1000)
        hsh=$(_CUBLAS_RUN_ONE "${bench}" "$i" hsh 1000)
        hss=$(_CUBLAS_RUN_ONE "${bench}" "$i" hss 1000)
        bf16=$(_CUBLAS_RUN_ONE "${bench}" "$i" tst 1000)
        fp8=$(_CUBLAS_RUN_ONE "${bench}" "$i" qqssq 1000)
        echo -e "GPU $i \t| FP64: ${fp64} \t| FP32: ${fp32} \t| TF32: ${tf32} \t| HHH: ${hhh} \t| HSH: ${hsh} \t| HSS: ${hss} \t| BF16: ${bf16} \t| FP8: ${fp8}" >> "${RESULT_PATH}/cublas.log"
        echo "第 $i 号卡测试完成!"
    done
}

#######################################
# Build NCCL and nccl-tests from the source trees (or zip archives) located
# in ${CURRENT_PATH}, then run all_reduce_perf, all_gather_perf and
# alltoall_perf across ${GPU_TOTAL} GPUs, writing one log per test into
# ${RESULT_PATH}.
# Globals:
#   CURRENT_PATH    (read) - directory with nccl-master(.zip) and
#                            nccl-tests-master(.zip)
#   RESULT_PATH     (read) - directory receiving the single_*_perf.log files
#   COMPUTE_SM      (read) - compute capability used for NVCC_GENCODE
#   GPU_TOTAL       (read) - value passed to the tests' -g option
#   CUDA_PATH       (written, exported) - resolved /usr/local/cuda
#   LD_LIBRARY_PATH, PATH (written, exported) - prefixed with the NCCL build
# Side effects:
#   Changes the working directory; on return it is the nccl-tests source tree.
# Exits:
#   1 when either build fails.
#######################################
function NCCL_COMM_TESTS(){
    local nccl_src="${CURRENT_PATH}/nccl-master"
    local nccl_build="${CURRENT_PATH}/nccl"
    local tests_src="${CURRENT_PATH}/nccl-tests-master"
    local perf_name need_pause=0

    echo "${CURRENT_PATH}"

    # Resolve the CUDA install to an absolute path. Parsing `ls -l` output
    # yields a relative symlink target or a directory listing instead.
    CUDA_PATH=$(readlink -f /usr/local/cuda) || CUDA_PATH=/usr/local/cuda
    export CUDA_PATH

    if [[ ! -d ${nccl_src} ]]; then
        echo "${nccl_src} 目录不存在,开始解压!"
        # Extract next to the script regardless of the caller's cwd.
        unzip "${CURRENT_PATH}/nccl-master.zip" -d "${CURRENT_PATH}"
    fi
    cd "${nccl_src}"
    mkdir -p "${nccl_build}"
    if [[ -d ${nccl_build}/lib ]]; then
        echo "检测到编译路径 ${nccl_build}/lib 存在,开始清理编译文件!"
        # BUILDDIR must be repeated here, otherwise `make clean` targets the
        # default build directory and leaves the custom one untouched.
        make clean BUILDDIR="${nccl_build}"
    fi
    # Test make directly: under `set -e` a separate `$?` check would never
    # reach its failure branch.
    if make -j"$(nproc)" src.build BUILDDIR="${nccl_build}" CUDA_HOME="${CUDA_PATH}" \
        NVCC_GENCODE="-gencode=arch=compute_${COMPUTE_SM},code=sm_${COMPUTE_SM}"; then
        echo "nccl 编译完成!"
    else
        echo "nccl 编译失败!" >&2
        exit 1
    fi
    # Avoid a trailing ':' (which would mean "current directory") when
    # LD_LIBRARY_PATH was previously empty.
    export LD_LIBRARY_PATH="${nccl_build}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    export PATH="${nccl_build}/bin:${PATH}"

    if [[ ! -d ${tests_src} ]]; then
        echo "${tests_src} 目录不存在,开始解压!"
        unzip "${CURRENT_PATH}/nccl-tests-master.zip" -d "${CURRENT_PATH}"
    fi
    cd "${tests_src}"
    if [[ -d ${tests_src}/build ]]; then
        echo "检测到编译路径 ${tests_src}/build 存在,开始清理编译文件!"
        make clean
    fi
    if make CUDA_HOME="${CUDA_PATH}" NCCL_HOME="${nccl_build}"; then
        echo "nccl-tests 编译完成!"
    else
        echo "nccl-tests 编译失败!" >&2
        exit 1
    fi

    for perf_name in all_reduce_perf all_gather_perf alltoall_perf; do
        # Pause between consecutive tests, but not before the first one.
        if (( need_pause )); then
            sleep 5s
        fi
        need_pause=1
        echo "开始测试 ${perf_name}"
        "${tests_src}/build/${perf_name}" -b 8 -e 4G -f 2 -g "${GPU_TOTAL}" \
            > "${RESULT_PATH}/single_${perf_name}.log"
    done
    echo -e "nccl-tests 测试完成!"
}

# ---- Entry point ----------------------------------------------------------
# unzip is needed to extract the NCCL archives. Install it only when it is
# missing, so the script still works offline or without root when unzip is
# already present. apt-get is used because it is the script-stable front end.
if ! command -v unzip >/dev/null 2>&1; then
    apt-get install -y unzip
fi
echo "当前路径:${CURRENT_PATH}"

# Load the system and user shell profiles. A missing file is skipped instead
# of aborting the run under `set -e`.
if [[ -f /etc/profile ]]; then
    source /etc/profile
fi
if [[ -f ~/.bashrc ]]; then
    source ~/.bashrc
fi

mkdir -p "${RESULT_PATH}"
CHECK_GPU_COMMAND
CHECK_GPU_SM
# Optional cuBLAS matmul benchmark; uncomment to run it before the NCCL tests.
#CUBLAS_BENCHMARK_TEST
NCCL_COMM_TESTS

将上述内容保存为 single_nccl_test.sh 文件,然后运行以下命令:

bash single_nccl_test.sh

测试结果会写入脚本所在目录下的 result_<主机名> 目录中(<主机名> 为 hostname 命令的输出)。

202508272103537538358890.png

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://sulao.cn/post/1125

评论列表

0%