From dab2938937507f8bbdb2d058e4f989ed7094eac1 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 27 Jun 2019 15:46:09 -0500 Subject: [PATCH] tested on P100 --- driver/src/driver.cpp | 30 +++++++++++++++--------------- script/cmake-cuda_docker.sh | 30 ++++++++++++++++++++++++++++++ script/extract_asm-cuda.sh | 4 +++- 3 files changed, 48 insertions(+), 16 deletions(-) create mode 100755 script/cmake-cuda_docker.sh diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index e2da32bb4e..8cb0d27abb 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -597,7 +597,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 8x8 image - // cuDNN 68%, ck:nvidia: 72.6%, ck:amd 34% + // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@MI50 34% constexpr index_t N = 64; constexpr index_t C = 1536; constexpr index_t HI = 8; @@ -613,7 +613,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 8x8 image - // cuDNN 77%, ck:nvidia 76.4%, ck:amd 47% + // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@MI50 47% constexpr index_t N = 128; constexpr index_t C = 2048; constexpr index_t HI = 8; @@ -629,7 +629,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 7x7 image - // cuDNN 82%, ck:nvidia 76.6%, ck:amd 54% + // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@MI50 54% constexpr index_t N = 128; constexpr index_t C = 832; constexpr index_t HI = 7; @@ -645,7 +645,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 8x8 image - // cuDNN 83%, ck:nvidia 75.4%, ck:amd 58% + // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@MI50 58% constexpr index_t N = 128; constexpr index_t C = 1280; constexpr index_t HI = 8; @@ -659,9 +659,9 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 1x1 filter, 14x14 image - // cuDNN 62%, ck:nvidia 68.4%, ck:amd 44% + // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@MI50 44% constexpr index_t N = 128; constexpr index_t C = 512; constexpr index_t HI = 14; @@ -677,7 +677,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 8x8 image - // cuDNN 74%, ck:nvidia 57.1%, ck:amd 52% + // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@MI50 52% constexpr index_t N = 64; constexpr index_t C = 1536; constexpr index_t HI = 8; @@ -693,7 +693,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 28x28 image - // cuDNN 86%, ck:nvidia 84.6%, ck:amd 64% + // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@MI50 64% constexpr index_t N = 128; constexpr index_t C = 256; constexpr index_t HI = 28; @@ -709,7 +709,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 7x7 image - // cuDNN 71%, ck:55.9%, ck:amd 54% + // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@MI50 54% constexpr index_t N = 128; constexpr index_t C = 832; constexpr index_t HI = 7; @@ -725,7 +725,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output - // cuDNN 90%, ck:nvidia 93%, ck:amd 73% + // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@MI50 73% constexpr index_t N = 128; constexpr index_t C = 288; constexpr index_t HI = 35; @@ -741,7 +741,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 17x17 input - // cuDNN 81%, ck:nvidia 76.8%, ck:amd 66% + // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@MI50 66% constexpr index_t N = 128; constexpr index_t C = 768; constexpr index_t HI = 17; @@ -755,9 +755,9 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 14x14 image - // cuDNN 73%, ck:nvidia 72.7%, ck:amd 65% + // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@MI50 65% constexpr index_t N = 128; constexpr index_t C = 528; constexpr index_t HI = 14; @@ -773,7 +773,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 14x14 image - // cuDNN 73%, ck:nvidia 72.7%, ck:amd 65% + // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@MI50 65% constexpr index_t N = 128; constexpr index_t C = 528; constexpr index_t HI = 14; @@ -789,7 +789,7 @@ int main(int argc, char* argv[]) constexpr index_t WPad = 0; #elif 0 // 1x1 filter, 7x7 image - // cuDNN 49%, ck:nvidia 52.8%, ck:amd 45% + // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@MI50 45% constexpr index_t N = 128; constexpr index_t C = 832; constexpr index_t HI = 7; diff --git a/script/cmake-cuda_docker.sh b/script/cmake-cuda_docker.sh new file mode 100755 index 0000000000..d5ccaac761 --- /dev/null +++ b/script/cmake-cuda_docker.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +rm -f CMakeCache.txt +rm -f *.cmake +rm -rf CMakeFiles + +MY_PROJECT_SOURCE=../../../ +MY_PROJECT_INSTALL=../install.dir + +export CUDA_ROOT=/usr/local/cuda +export CPATH=$CPATH:$CUDA_ROOT/include +export LIBRARY_PATH=$LIBRARY_PATH:$CUDA_ROOT/lib64 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_ROOT/lib64 + +cmake \ +-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ +-D CMAKE_CXX_COMPILER=clang++-6.0 \ +-D CMAKE_BUILD_TYPE=Release \ +-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +-D DEVICE_BACKEND=NVIDIA \ +-D CUDA_COMMON_INCLUDE_DIR="/root/workspace/NVIDIA_CUDA-10.1_Samples/common/inc" \ +-D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ +${MY_PROJECT_SOURCE} + + +#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70 -Xptxas -v -maxrregcount=128" \ diff --git a/script/extract_asm-cuda.sh b/script/extract_asm-cuda.sh index 84a0af76dd..4041ddc0ef 100755 --- a/script/extract_asm-cuda.sh +++ b/script/extract_asm-cuda.sh @@ -1 +1,3 @@ -cuobjdump -xelf all ./driver/driver && nvdisasm --print-code -g driver.sm_61.cubin > driver.sm_61.asm && nvdisasm --print-code -g driver.sm_70.cubin > driver.sm_70.asm +cuobjdump -xelf sm_60 ./driver/driver && nvdisasm --print-code -g driver.sm_60.cubin > driver.sm_60.asm +cuobjdump -xelf sm_61 ./driver/driver && nvdisasm --print-code -g driver.sm_61.cubin > driver.sm_61.asm +cuobjdump -xelf sm_70 ./driver/driver && nvdisasm --print-code -g driver.sm_70.cubin > driver.sm_70.asm