由于经常需要做测试,所以写了一个测试单机多卡的 bash 脚本。使用前提:环境中已经安装 nvidia 驱动和 cuda 库,且 cuda 库安装在默认目录 /usr/local/ 下。nccl 和 nccl-tests 都是下载的 zip 包,文件名分别是 nccl-master.zip 和 nccl-tests-master.zip,这两个文件名在脚本中是写死的。此外,脚本还会调用 cublasMatmulBench 可执行文件来做 cuBLAS 算力测试。将下面的内容保存为脚本,把上述两个 zip 包和 cublasMatmulBench 与脚本放在同一目录,然后使用 bash 命令执行。
脚本内容如下:
#!/bin/bash
# Single-node multi-GPU test driver: per-GPU cuBLAS matmul benchmark, then
# build NCCL + nccl-tests from local zip archives and run collective tests.

# Abort on the first unhandled command failure.
set -e

# Absolute, symlink-resolved directory that contains this script. Quoted so a
# script path containing spaces or glob characters is handled correctly.
CURRENT_PATH=$(readlink -f -- "$(dirname -- "$0")")

# Per-host directory that receives every log file written by this script.
RESULT_PATH="${CURRENT_PATH}/result_$(hostname)"
# Print an informational message on stdout.
# The line starts with an "[INFO]" tag wrapped in ANSI colour codes
# (SGR 104 background / 97 foreground, then reset to defaults).
# Arguments: all arguments are joined into a single message; backslash
#            escapes inside the message are interpreted.
INFO() {
  local tag='\e[104m\e[97m[INFO]\e[49m\e[39m'
  printf '%b\n' "${tag} $*"
}
# Print a warning message on stderr.
# The line starts with a "[WARNING]" tag wrapped in ANSI colour codes
# (SGR 101 background / 97 foreground, then reset to defaults).
# Arguments: all arguments are joined into a single message; backslash
#            escapes inside the message are interpreted.
WARNING() {
  local tag='\e[101m\e[97m[WARNING]\e[49m\e[39m'
  printf '%b\n' "${tag} $*" >&2
}
# Print an error message on stderr. Does not exit; callers decide that.
# The line starts with an "[ERROR]" tag wrapped in ANSI colour codes
# (SGR 101 background / 97 foreground, then reset to defaults).
# Arguments: all arguments are joined into a single message; backslash
#            escapes inside the message are interpreted.
ERROR() {
  local tag='\e[101m\e[97m[ERROR]\e[49m\e[39m'
  printf '%b\n' "${tag} $*" >&2
}
# Verify that nvidia-smi is usable and count the GPUs it reports.
# Globals:  GPU_TOTAL (written, exported) - number of GPUs listed by
#           `nvidia-smi -L`.
# Exits:    1 when nvidia-smi is missing, fails to run, or lists no GPU.
CHECK_GPU_COMMAND() {
  local gpu_list

  if ! command -v nvidia-smi >/dev/null 2>&1; then
    ERROR "英伟达驱动未安装!"
    exit 1
  fi

  # Capture the listing on its own: `export VAR=$(cmd)` would hide a failing
  # nvidia-smi behind export's always-zero exit status.
  if ! gpu_list=$(nvidia-smi -L); then
    ERROR "nvidia-smi 执行失败,请检查英伟达驱动!"
    exit 1
  fi

  # Count only lines of the form "GPU <n>: ..." so that any other line printed
  # by nvidia-smi is not mistaken for a device. `|| true` because grep exits
  # non-zero when the count is 0, which must not trip `set -e` here.
  GPU_TOTAL=$(grep -c '^GPU ' <<<"${gpu_list}" || true)

  if ((GPU_TOTAL < 1)); then
    ERROR "未检测到 GPU 卡!"
    exit 1
  fi
  export GPU_TOTAL
}
# Locate the CUDA toolkit and export its canonical installation path.
# Arguments: $1 - (optional) toolkit directory or symlink to check;
#                 defaults to /usr/local/cuda.
# Globals:   CUDA_PATH (written, exported) - absolute, symlink-resolved path.
# Exits:     1 when the directory does not exist.
CHECK_CUDA_PATH() {
  local cuda_dir=${1:-/usr/local/cuda}

  if [[ ! -d "${cuda_dir}" ]]; then
    ERROR "检测没有找到 cuda toolkit 目录,请先安装 cuda toolkit !"
    exit 1
  fi

  # readlink -f handles every layout: a symlink with an absolute or relative
  # target, a chain of symlinks, or a plain directory. Parsing `ls -l` output
  # only worked for a symlink with an absolute target; for a real directory it
  # returned the last column of every entry inside it.
  CUDA_PATH=$(readlink -f -- "${cuda_dir}")
  export CUDA_PATH
}
# Determine the compute capability ("SM" number, e.g. 80, 90, 120) of GPU 0.
# Globals:  COMPUTE_SM (written, exported) - capability with the dot removed.
#           GPU_NAME   (written)           - product name, only set when the
#                                            name-based fallback is used.
# Exits:    1 when the capability cannot be determined and COMPUTE_SM was not
#           already provided by the environment.
CHECK_GPU_SM() {
  local cap detected=""

  # Preferred: ask the driver directly. The answer looks like "8.0" or "12.0".
  # NOTE(review): the compute_cap query field is assumed to exist on the
  # installed driver; when it does not, the output fails the regex below and
  # the name table is used instead.
  cap=$(nvidia-smi -i 0 --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \
    | head -n 1 | tr -d '[:space:]')

  if [[ ${cap} =~ ^([0-9]+)\.([0-9]+)$ ]]; then
    detected="${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
  else
    # Fallback: map the product name to a capability. Patterns match anywhere
    # in the name because drivers report names with suffixes, e.g.
    # "NVIDIA A100-SXM4-80GB", "NVIDIA H100 80GB HBM3",
    # "NVIDIA GeForce RTX 4090 D".
    GPU_NAME=$(nvidia-smi -q -i 0 | awk -F ': ' '/Product Name/ { print $2; exit }')
    case "${GPU_NAME}" in
      *A100* | *A800*) detected=80 ;;
      *3090*) detected=86 ;;
      *4090*) detected=89 ;;
      *H100* | *H800* | *H200* | *H20*) detected=90 ;;
      *GB200* | *B200*) detected=100 ;;
      *5090*) detected=120 ;;
    esac
  fi

  # A detected value wins; otherwise a COMPUTE_SM preset by the caller is kept.
  if [[ -n "${detected}" ]]; then
    COMPUTE_SM=${detected}
  fi

  if [[ -z "${COMPUTE_SM:-}" ]]; then
    ERROR "没有获取到算力,请检查!"
    exit 1
  fi

  export COMPUTE_SM
  INFO "当前算力:${COMPUTE_SM}"
}
# Run one cublasMatmulBench configuration on one GPU and print its result.
# Arguments: $1 - path to the cublasMatmulBench executable
#            $2 - GPU index, passed to the benchmark via CUDA_VISIBLE_DEVICES
#            $3 - value for the benchmark's -P option (e.g. ddd, sss, hhh)
#            $4 - value for the benchmark's -T option
# Outputs:   the 10th whitespace-separated field of the benchmark's output
#            line(s) containing "CUDA" (presumably the throughput figure;
#            confirm against the cublasMatmulBench output format).
_CUBLAS_RUN_ONE() {
  local bench=$1 device=$2 precision=$3 iterations=$4
  # The variable is set for this single command only, so it never leaks into
  # the rest of the script.
  CUDA_VISIBLE_DEVICES="${device}" "${bench}" -P="${precision}" \
    -m=15360 -n=18176 -k=8192 -T="${iterations}" -ta=1 -B=0 -p=0 \
    | grep -i CUDA | awk '{print $10}'
}

# Benchmark every GPU with cublasMatmulBench in eight precision modes and
# append one summary line per GPU to ${RESULT_PATH}/cublas.log.
# Globals:  GPU_TOTAL (read), RESULT_PATH (read), CURRENT_PATH (read; the
#           benchmark binary is expected next to the script, falling back to
#           the working directory when CURRENT_PATH is unset).
# Note:     CUDA_VISIBLE_DEVICES is NOT exported; exporting it here would leave
#           only the last GPU visible to the NCCL tests that run afterwards.
CUBLAS_BENCHMARK_TEST() {
  local bench="${CURRENT_PATH:-.}/cublasMatmulBench"
  local log_file="${RESULT_PATH}/cublas.log"
  local i fp64 fp32 tf32 hhh hsh hss bf16 fp8

  chmod +x "${bench}"
  mkdir -p "${RESULT_PATH}"
  INFO "总共有 ${GPU_TOTAL} 张GPU卡!"

  for ((i = 0; i < GPU_TOTAL; i++)); do
    INFO "开始测试第 $i 号卡..."
    fp64=$(_CUBLAS_RUN_ONE "${bench}" "$i" ddd 8)
    fp32=$(_CUBLAS_RUN_ONE "${bench}" "$i" sss 500)
    tf32=$(_CUBLAS_RUN_ONE "${bench}" "$i" sss_fast_tf32 500)
    hhh=$(_CUBLAS_RUN_ONE "${bench}" "$i" hhh 1000)
    hsh=$(_CUBLAS_RUN_ONE "${bench}" "$i" hsh 1000)
    hss=$(_CUBLAS_RUN_ONE "${bench}" "$i" hss 1000)
    bf16=$(_CUBLAS_RUN_ONE "${bench}" "$i" tst 1000)
    fp8=$(_CUBLAS_RUN_ONE "${bench}" "$i" qqssq 1000)
    printf 'GPU %s \t| FP64: %s \t| FP32: %s \t| TF32: %s \t| HHH: %s \t| HSH: %s \t| HSS: %s \t| BF16: %s \t| FP8: %s\n' \
      "$i" "${fp64}" "${fp32}" "${tf32}" "${hhh}" "${hsh}" "${hss}" "${bf16}" "${fp8}" \
      >>"${log_file}"
    INFO "第 $i 号卡测试完成!"
  done
}
# Build NCCL and nccl-tests from the zip archives next to the script.
# Globals:  CURRENT_PATH, CUDA_PATH, COMPUTE_SM (read)
#           LD_LIBRARY_PATH, PATH (extended and exported with the NCCL build)
#           NCCL_TESTS_PATH (written, exported) - nccl-tests source directory
# Effects:  changes the working directory to the nccl-tests source directory.
# Exits:    1 when extraction or either build fails.
NCCL_TESTS_BUILD() {
  local nccl_src="${CURRENT_PATH}/nccl-master"
  local nccl_out="${CURRENT_PATH}/nccl"
  local tests_src="${CURRENT_PATH}/nccl-tests-master"

  INFO "开始编译 nccl 和 nccl-tests 测试工具!"

  # ---- NCCL ---------------------------------------------------------------
  if [[ ! -d "${nccl_src}" ]]; then
    WARNING "${nccl_src} 目录不存在,开始解压!"
    # -d: extract next to the script regardless of the caller's working dir.
    if ! unzip "${CURRENT_PATH}/nccl-master.zip" -d "${CURRENT_PATH}"; then
      ERROR "nccl-master.zip 解压失败!"
      exit 1
    fi
  fi
  cd "${nccl_src}" || exit 1
  mkdir -p "${nccl_out}"
  if [[ -d "${nccl_out}/lib" ]]; then
    INFO "检测到编译路径 ${nccl_out}/lib 存在,开始清理编译文件!"
    # Pass the same BUILDDIR as the build below so the clean step targets the
    # directory that was actually built into.
    make clean BUILDDIR="${nccl_out}"
  fi
  # Test the build inside `if`: with `set -e` active, a separate `$?` check
  # after a failing make would never be reached.
  if make -j"$(nproc)" src.build BUILDDIR="${nccl_out}" CUDA_HOME="${CUDA_PATH}" \
    NVCC_GENCODE="-gencode=arch=compute_${COMPUTE_SM},code=sm_${COMPUTE_SM}"; then
    INFO "nccl 编译完成!"
  else
    ERROR "nccl 编译失败!"
    exit 1
  fi
  # Avoid a trailing ':' (which the dynamic loader treats as the current
  # directory) when LD_LIBRARY_PATH was previously empty.
  export LD_LIBRARY_PATH="${nccl_out}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  export PATH="${nccl_out}/bin:${PATH}"

  # ---- nccl-tests ---------------------------------------------------------
  cd "${CURRENT_PATH}" || exit 1
  if [[ ! -d "${tests_src}" ]]; then
    WARNING "${tests_src} 目录不存在,开始解压!"
    if ! unzip "${CURRENT_PATH}/nccl-tests-master.zip" -d "${CURRENT_PATH}"; then
      ERROR "nccl-tests-master.zip 解压失败!"
      exit 1
    fi
  fi
  cd "${tests_src}" || exit 1
  if [[ -d "${tests_src}/build" ]]; then
    INFO "检测到编译路径 ${tests_src}/build 存在,开始清理编译文件!"
    make clean
  fi
  if make CUDA_HOME="${CUDA_PATH}" NCCL_HOME="${nccl_out}"; then
    INFO "nccl-tests 编译完成!"
  else
    ERROR "nccl-tests 编译失败!"
    exit 1
  fi
  export NCCL_TESTS_PATH="${tests_src}"
}
# Run the nccl-tests collectives (all_reduce, all_gather, alltoall) across all
# GPUs, writing each benchmark's stdout to ${RESULT_PATH}/<name>.log.
# Globals:  NCCL_TESTS_PATH, RESULT_PATH, GPU_TOTAL (read)
# Effects:  changes the working directory to ${NCCL_TESTS_PATH}.
# Returns:  0 when every benchmark succeeded, 1 when at least one failed.
#           A failure is reported and the remaining benchmarks still run.
NCCL_COMM_TESTS() {
  local -a benchmarks=(all_reduce_perf all_gather_perf alltoall_perf)
  local name failed=0 first=1

  cd "${NCCL_TESTS_PATH}" || exit 1

  for name in "${benchmarks[@]}"; do
    # 5-second pause between consecutive benchmarks (kept from the original
    # script; none before the first or after the last).
    if ((first)); then
      first=0
    else
      sleep 5s
    fi

    INFO "开始测试: ./build/${name} -b 8 -e 4G -f 2 -g ${GPU_TOTAL}"
    # Run inside `if` so that, with `set -e` active, a failing benchmark is
    # reported instead of silently terminating the whole script.
    if "./build/${name}" -b 8 -e 4G -f 2 -g "${GPU_TOTAL}" \
      >"${RESULT_PATH}/${name}.log"; then
      INFO "${name} 测试完成!"
    else
      ERROR "${name} 测试失败,请检查安装配置!"
      failed=1
    fi
  done

  return "${failed}"
}
# ---- Entry point ----------------------------------------------------------
# unzip is needed to extract the NCCL archives; install it only when missing
# so the script also works where it is already present. apt-get is used
# because it is the apt front end intended for scripts.
if ! command -v unzip >/dev/null 2>&1; then
  apt-get install -y unzip
fi
INFO "当前路径:${CURRENT_PATH}"

# Load the system-wide and per-user shell environment. The files may be
# missing or contain commands that return non-zero, so errexit is suspended
# while sourcing them.
set +e
[[ -f /etc/profile ]] && source /etc/profile
[[ -f ~/.bashrc ]] && source ~/.bashrc
set -e

mkdir -p "${RESULT_PATH}"

# Environment checks first, then the benchmarks, in dependency order.
CHECK_GPU_COMMAND
CHECK_CUDA_PATH
CHECK_GPU_SM
CUBLAS_BENCHMARK_TEST
NCCL_TESTS_BUILD
NCCL_COMM_TESTS
然后存为 single_nccl_test.sh 文件,使用以下命令执行:
bash single_nccl_test.sh
测试结果会写入脚本所在目录下的 result_$(hostname) 目录。
内容版权声明:除非注明,否则皆为本站原创文章。
转载注明出处:https://sulao.cn/post/1125
评论列表