tested on P100

[ROCm/composable_kernel commit: dab2938937]
This commit is contained in:
Chao Liu
2019-06-27 15:46:09 -05:00
parent c37a237f00
commit 99fc474d24
3 changed files with 48 additions and 16 deletions

View File

@@ -597,7 +597,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cuDNN 68%, ck:nvidia: 72.6%, ck:amd 34%
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@MI50 34%
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
@@ -613,7 +613,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cuDNN 77%, ck:nvidia 76.4%, ck:amd 47%
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@MI50 47%
constexpr index_t N = 128;
constexpr index_t C = 2048;
constexpr index_t HI = 8;
@@ -629,7 +629,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
// cuDNN 82%, ck:nvidia 76.6%, ck:amd 54%
// cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@MI50 54%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
@@ -645,7 +645,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cuDNN 83%, ck:nvidia 75.4%, ck:amd 58%
// cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@MI50 58%
constexpr index_t N = 128;
constexpr index_t C = 1280;
constexpr index_t HI = 8;
@@ -659,9 +659,9 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
#elif 1
// 1x1 filter, 14x14 image
// cuDNN 62%, ck:nvidia 68.4%, ck:amd 44%
// cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@MI50 44%
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
@@ -677,7 +677,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cuDNN 74%, ck:nvidia 57.1%, ck:amd 52%
// cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@MI50 52%
constexpr index_t N = 64;
constexpr index_t C = 1536;
constexpr index_t HI = 8;
@@ -693,7 +693,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 28x28 image
// cuDNN 86%, ck:nvidia 84.6%, ck:amd 64%
// cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@MI50 64%
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 28;
@@ -709,7 +709,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
// cuDNN 71%, ck:55.9%, ck:amd 54%
// cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@MI50 54%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;
@@ -725,7 +725,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cuDNN 90%, ck:nvidia 93%, ck:amd 73%
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@MI50 73%
constexpr index_t N = 128;
constexpr index_t C = 288;
constexpr index_t HI = 35;
@@ -741,7 +741,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 17x17 input
// cuDNN 81%, ck:nvidia 76.8%, ck:amd 66%
// cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@MI50 66%
constexpr index_t N = 128;
constexpr index_t C = 768;
constexpr index_t HI = 17;
@@ -755,9 +755,9 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
#elif 0
// 1x1 filter, 14x14 image
// cuDNN 73%, ck:nvidia 72.7%, ck:amd 65%
// cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@MI50 65%
constexpr index_t N = 128;
constexpr index_t C = 528;
constexpr index_t HI = 14;
@@ -773,7 +773,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 14x14 image
// cuDNN 73%, ck:nvidia 72.7%, ck:amd 65%
// cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@MI50 65%
constexpr index_t N = 128;
constexpr index_t C = 528;
constexpr index_t HI = 14;
@@ -789,7 +789,7 @@ int main(int argc, char* argv[])
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
// cuDNN 49%, ck:nvidia 52.8%, ck:amd 45%
// cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@MI50 45%
constexpr index_t N = 128;
constexpr index_t C = 832;
constexpr index_t HI = 7;

30
script/cmake-cuda_docker.sh Executable file
View File

@@ -0,0 +1,30 @@
#!/bin/bash
# Configure an out-of-source CMake build of the project for the NVIDIA backend.
#
# Run this from the build directory: it deletes any previous CMake
# configuration found in the current directory, adds the CUDA toolkit to the
# compiler / linker / loader search paths, and then invokes cmake on the
# project source tree.

# Start from a clean configuration so stale cache entries cannot leak into
# the new one. "./" keeps a file name starting with '-' from being parsed as
# an option to rm.
rm -f CMakeCache.txt
rm -f ./*.cmake
rm -rf CMakeFiles

# Source and install locations, relative to the build directory.
MY_PROJECT_SOURCE=../../../
MY_PROJECT_INSTALL=../install.dir

export CUDA_ROOT=/usr/local/cuda

# Append the CUDA directories to the search paths. The ${VAR:+$VAR:} form only
# emits the separating ':' when VAR is already non-empty; a plain "$VAR:..."
# would leave an empty leading element when VAR is unset, and both the
# compiler and the dynamic loader treat an empty element as "the current
# working directory".
export CPATH="${CPATH:+$CPATH:}$CUDA_ROOT/include"
export LIBRARY_PATH="${LIBRARY_PATH:+$LIBRARY_PATH:}$CUDA_ROOT/lib64"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$CUDA_ROOT/lib64"

# Release build with verbose makefiles. CMAKE_CUDA_FLAGS uses clang++-6.0 as
# the host compiler, keeps intermediate files and PTX source annotations,
# prints ptxas resource usage, and generates code for sm_60 and sm_70.
cmake \
    -D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}" \
    -D CMAKE_CXX_COMPILER=clang++-6.0 \
    -D CMAKE_BUILD_TYPE=Release \
    -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
    -D DEVICE_BACKEND=NVIDIA \
    -D CUDA_COMMON_INCLUDE_DIR="/root/workspace/NVIDIA_CUDA-10.1_Samples/common/inc" \
    -D CMAKE_CUDA_FLAGS="-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
    "${MY_PROJECT_SOURCE}"

# Alternative settings kept for reference; to use one, move it into the cmake
# invocation above (before the source directory argument).
#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70 -Xptxas -v -maxrregcount=128" \

View File

@@ -1 +1,3 @@
cuobjdump -xelf all ./driver/driver && nvdisasm --print-code -g driver.sm_61.cubin > driver.sm_61.asm && nvdisasm --print-code -g driver.sm_70.cubin > driver.sm_70.asm
# For each GPU architecture, extract its ELF image from the driver binary and
# disassemble the resulting cubin into a matching .asm listing. If extraction
# fails for one architecture, only that architecture's disassembly is skipped;
# the remaining architectures are still attempted.
for arch in sm_60 sm_61 sm_70; do
    cuobjdump -xelf "${arch}" ./driver/driver && nvdisasm --print-code -g "driver.${arch}.cubin" > "driver.${arch}.asm"
done