From b6d7e5fd48e87af46ebf43b93f4e19b3156732bc Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 26 Apr 2019 15:55:26 -0500 Subject: [PATCH] ds_read_offset [ROCm/composable_kernel commit: 49d5af1002c37e19db071271b7560ae6c64fefd5] --- script/ds_read_offset.sh | 12 + src/include/amd_inline_asm.hip.hpp | 518 +++++++++++++++++++++-------- 2 files changed, 393 insertions(+), 137 deletions(-) create mode 100755 script/ds_read_offset.sh diff --git a/script/ds_read_offset.sh b/script/ds_read_offset.sh new file mode 100755 index 0000000000..22e756bff7 --- /dev/null +++ b/script/ds_read_offset.sh @@ -0,0 +1,12 @@ +for((i=0;i<=4096;i=i+64)) +do + OFFSET=$i + echo "if(offset == $OFFSET)" + echo "{" + echo " asm volatile(\"\\n \\" + echo " ds_read_b128 %0, %1 offset:$OFFSET\n \\" + echo " \"" + echo " : \"=v\"(r)" + echo " : \"v\"(__to_local(lds)));" + echo "}" +done diff --git a/src/include/amd_inline_asm.hip.hpp b/src/include/amd_inline_asm.hip.hpp index 4a8be241ba..44b480f542 100644 --- a/src/include/amd_inline_asm.hip.hpp +++ b/src/include/amd_inline_asm.hip.hpp @@ -201,278 +201,522 @@ __device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, in if(offset == 0) { asm volatile("\n \ - ds_read_b128 %0, %1 \n \ + ds_read_b128 %0, %1 offset:0\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 128) + if(offset == 64) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:128 \n \ + ds_read_b128 %0, %1 offset:64\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 256) + if(offset == 128) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:256 \n \ + ds_read_b128 %0, %1 offset:128\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 384) + if(offset == 192) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:384 \n \ + ds_read_b128 %0, %1 offset:192\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 512) + if(offset == 256) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:512 \n \ + ds_read_b128 %0, %1 offset:256\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 640) + if(offset == 320) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:640 \n \ + ds_read_b128 %0, %1 offset:320\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 768) + if(offset == 384) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:768 \n \ + ds_read_b128 %0, %1 offset:384\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 896) + if(offset == 448) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:896 \n \ + ds_read_b128 %0, %1 offset:448\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1024) + if(offset == 512) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1024 \n \ + ds_read_b128 %0, %1 offset:512\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1152) + if(offset == 576) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1152 \n \ + ds_read_b128 %0, %1 offset:576\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1280) + if(offset == 640) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1280 \n \ + ds_read_b128 %0, %1 offset:640\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1408) + if(offset == 704) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1408 \n \ + ds_read_b128 %0, %1 offset:704\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1536) + if(offset == 768) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1536 \n \ + ds_read_b128 %0, %1 offset:768\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1664) + if(offset == 832) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1664 \n \ + ds_read_b128 %0, %1 offset:832\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1792) + if(offset == 896) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1792 \n \ + ds_read_b128 %0, %1 offset:896\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 1920) + if(offset == 960) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:1920 \n \ + ds_read_b128 %0, %1 offset:960\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2048) + if(offset == 1024) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2048 \n \ + ds_read_b128 %0, %1 offset:1024\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2176) + if(offset == 1088) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2176 \n \ + ds_read_b128 %0, %1 offset:1088\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2304) + if(offset == 1152) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2304 \n \ + ds_read_b128 %0, %1 offset:1152\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2432) + if(offset == 1216) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2432 \n \ + ds_read_b128 %0, %1 offset:1216\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2560) + if(offset == 1280) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2560 \n \ + ds_read_b128 %0, %1 offset:1280\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2688) + if(offset == 1344) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2688 \n \ + ds_read_b128 %0, %1 offset:1344\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2816) + if(offset == 1408) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2816 \n \ + ds_read_b128 %0, %1 offset:1408\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 2944) + if(offset == 1472) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:2944 \n \ + ds_read_b128 %0, %1 offset:1472\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3072) + if(offset == 1536) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3072 \n \ + ds_read_b128 %0, %1 offset:1536\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3200) + if(offset == 1600) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3200 \n \ + ds_read_b128 %0, %1 offset:1600\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3328) + if(offset == 1664) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3328 \n \ + ds_read_b128 %0, %1 offset:1664\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3456) + if(offset == 1728) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3456 \n \ + ds_read_b128 %0, %1 offset:1728\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3584) + if(offset == 1792) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3584 \n \ + ds_read_b128 %0, %1 offset:1792\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3712) + if(offset == 1856) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3712 \n \ + ds_read_b128 %0, %1 offset:1856\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3840) + if(offset == 1920) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3840 \n \ + ds_read_b128 %0, %1 offset:1920\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 3968) + if(offset == 1984) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:3968 \n \ + ds_read_b128 %0, %1 offset:1984\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 4096) + if(offset == 2048) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:4096 \n \ + ds_read_b128 %0, %1 offset:2048\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else if(offset == 4352) + if(offset == 2112) { asm volatile("\n \ - ds_read_b128 %0, %1 offset:4352 \n \ + ds_read_b128 %0, %1 offset:2112\n \ " - : "=v"(r) - : "v"(__to_local(lds))); + : "=v"(r) + : "v"(__to_local(lds))); } - else + if(offset == 2176) { - assert(false); + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2176\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2240) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2240\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2304) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2304\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2368) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2368\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2432) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2432\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2496) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2496\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2560) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2560\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2624) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2624\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2688) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2688\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2752) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2752\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2816) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2816\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2880) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2880\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 2944) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:2944\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3008) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3008\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3072) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3072\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3136) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3136\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3200) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3200\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3264) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3264\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3328) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3328\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3392) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3392\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3456) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3456\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3520) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3520\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3584) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3584\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3648) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3648\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3712) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3712\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3776) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3776\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3840) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3840\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3904) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3904\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 3968) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:3968\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 4032) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:4032\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); + } + if(offset == 4096) + { + asm volatile("\n \ + ds_read_b128 %0, %1 offset:4096\n \ + " + : "=v"(r) + : "v"(__to_local(lds))); } #endif }