diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 15a9df2c0c..c191fff7d0 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -4,14 +4,22 @@ #include template -constexpr const char* DataTypeToString() { - if constexpr (std::is_same_v) { +constexpr const char* DataTypeToString() +{ + if constexpr(std::is_same_v) + { return "fp16"; - } else if constexpr (std::is_same_v) { + } + else if constexpr(std::is_same_v) + { return "fp8"; - } else if constexpr (std::is_same_v) { + } + else if constexpr(std::is_same_v) + { return "bf8"; - } else { + } + else + { return "unknown"; } } @@ -112,8 +120,9 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, args.stride_B = stride_B; args.stride_C = stride_C; - float ave_time = flatmm_calc( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + float ave_time = + flatmm_calc( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_byte = @@ -121,18 +130,15 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf, float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString() << " M =" << M << " N =" << N << " K =" << K - << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C - << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " - << std::endl; + std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString() + << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A + << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time + << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; return ave_time; } -template +template int run_flatmm_example_with_layouts(int argc, char* argv[], const ALayout a_layout = ALayout{}, @@ -147,7 +153,7 @@ int run_flatmm_example_with_layouts(int argc, using BDataType = typename GemmBasicTypeConfig::BDataType; using CDataType = typename GemmBasicTypeConfig::CDataType; using AccDataType = typename GemmBasicTypeConfig::AccDataType; - + ck_tile::index_t M = arg_parser.get_int("m"); ck_tile::index_t N = arg_parser.get_int("n"); ck_tile::index_t K = arg_parser.get_int("k"); @@ -182,7 +188,7 @@ int run_flatmm_example_with_layouts(int argc, c_rslt_host.SetZero(); // do pre-shuffle - std::string mfma = arg_parser.get_str("prec"); + std::string mfma = arg_parser.get_str("prec"); #if defined(USING_MFMA_16x16x32) && defined(ENABLE_FP8) ck_tile::index_t mfma_type = 1; #else @@ -193,18 +199,18 @@ int run_flatmm_example_with_layouts(int argc, b_shuffle_dev_buf.ToDevice(b_shuffle_host.data()); invoke_flatmm( - a_dev_buf, - b_shuffle_dev_buf, - c_dev_buf, - M, - N, - K, - stride_A, - stride_B, - stride_C, - kbatch, - n_warmup, - n_repeat); + a_dev_buf, + b_shuffle_dev_buf, + c_dev_buf, + M, + N, + K, + stride_A, + stride_B, + stride_C, + kbatch, + n_warmup, + n_repeat); c_dev_buf.FromDevice(c_rslt_host.data()); bool pass = true; @@ -219,8 +225,9 @@ int run_flatmm_example_with_layouts(int argc, a_host, b_origin_host, c_ref_host); const float max_accumulated_value = *std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end()); - const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); - pass = ck_tile::check_err(c_rslt_host, + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_rslt_host, c_ref_host, "Error: Incorrect results!", rtol_atol.at(ck_tile::number<0>{}), @@ -277,8 +284,9 @@ int run_flatmm_example_with_layouts(int argc, c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data()); const float max_accumulated_value = *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end()); - const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value); - pass = ck_tile::check_err(c_rslt_host, + const auto rtol_atol = calculate_rtol_atol( + K, kbatch, max_accumulated_value); + pass = ck_tile::check_err(c_rslt_host, c_gpu_ref_host, "Error: Incorrect results!", rtol_atol.at(ck_tile::number<0>{}), diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp index 19869906dc..296c1d44d7 100644 --- a/include/ck/utility/amd_buffer_addressing_builtins.hpp +++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp @@ -80,7 +80,7 @@ __device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"); // buffer atomic-add i32 __device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( @@ -88,7 +88,7 @@ __device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32"); // buffer atomic-add fp32 __device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32( @@ -96,15 +96,15 @@ __device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32"); // buffer atomic-add fp32 -__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64( - double vdata, - int32x4_t rsrc, // dst_wave_buffer_resource - int voffset, // dst_thread_addr_offset - int soffset, // dst_wave_addr_offset - int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"); +__device__ double +llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, + int32x4_t rsrc, // dst_wave_buffer_resource + int voffset, // dst_thread_addr_offset + int soffset, // dst_wave_addr_offset + int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); // memory coherency bit for buffer store/load instruction // check ISA manual for each GFX target @@ -827,7 +827,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t voffset, index_t soffset, index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); + index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); #ifndef __HIPCC_RTC__ template diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 0b9956cd01..53a344c7b0 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -881,95 +881,95 @@ CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8"); CK_TILE_DEVICE_EXTERN int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8"); CK_TILE_DEVICE_EXTERN int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8"); // buffer load i16 CK_TILE_DEVICE_EXTERN int16_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16"); CK_TILE_DEVICE_EXTERN int16x2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16"); CK_TILE_DEVICE_EXTERN int16x4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16"); // buffer load i32 CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32"); CK_TILE_DEVICE_EXTERN int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32"); CK_TILE_DEVICE_EXTERN int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32"); // buffer load fp16 CK_TILE_DEVICE_EXTERN _Float16 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16"); -CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16.v4i32"); +CK_TILE_DEVICE_EXTERN fp16x2_t +llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16"); -CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16.v4i32"); +CK_TILE_DEVICE_EXTERN fp16x4_t +llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16"); // buffer load fp32 CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32"); -CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32.v4i32"); +CK_TILE_DEVICE_EXTERN fp32x2_t +llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32"); -CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4( - int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32.v4i32"); +CK_TILE_DEVICE_EXTERN fp32x4_t +llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32"); // buffer store i8 CK_TILE_DEVICE_EXTERN void @@ -977,21 +977,21 @@ llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8"); CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8"); CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8"); // buffer store i16 CK_TILE_DEVICE_EXTERN void @@ -999,21 +999,21 @@ llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2( - int16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_i16x2(int16x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4( - int16x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_i16x4(int16x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16"); // buffer store i32 CK_TILE_DEVICE_EXTERN void @@ -1021,7 +1021,7 @@ llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32"); // buffer store ui16 CK_TILE_DEVICE_EXTERN void @@ -1029,35 +1029,35 @@ llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2( - uint16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_ui16x2(uint16x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4( - uint16x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_ui16x4(uint16x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2( - int32x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4( - int32x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32"); // buffer store fp16 CK_TILE_DEVICE_EXTERN void @@ -1065,21 +1065,21 @@ llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2( - fp16x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_fp16x2(fp16x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4( - fp16x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_fp16x4(fp16x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16"); // buffer store fp32 CK_TILE_DEVICE_EXTERN void @@ -1087,21 +1087,21 @@ llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2( - fp32x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_fp32x2(fp32x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32"); -CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4( - fp32x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32.v4i32"); +CK_TILE_DEVICE_EXTERN void +llvm_amdgcn_raw_buffer_store_fp32x4(fp32x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32"); // buffer atomic-add fp16 CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( @@ -1109,7 +1109,7 @@ CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"); // buffer atomic-add i32 CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( @@ -1117,7 +1117,7 @@ CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32"); // buffer atomic-add fp32 CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32( @@ -1125,15 +1125,15 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32( int32x4_t rsrc, index_t voffset, index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32"); + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32"); // buffer atomic-max fp64 -CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64( - double vdata, - int32x4_t rsrc, // dst_wave_buffer_resource - int voffset, // dst_thread_addr_offset - int soffset, // dst_wave_addr_offset - int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"); +CK_TILE_DEVICE_EXTERN double +llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, + int32x4_t rsrc, // dst_wave_buffer_resource + int voffset, // dst_thread_addr_offset + int soffset, // dst_wave_addr_offset + int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); // Direct loads from global to LDS. CK_TILE_DEVICE_EXTERN void @@ -1143,7 +1143,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, index_t voffset, index_t soffset, index_t offset, - index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32"); + index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); template CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,