Mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-11 17:00:18 +00:00
996 lines · 46 KiB · C++
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "data_type.hpp"

namespace ck {
template <typename T>
union BufferResource
{
    __device__ constexpr BufferResource() : content{} {}

    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
    int32x4_t content;
    StaticallyIndexedArray<T*, 2> address;
    StaticallyIndexedArray<int32_t, 4> range;
    StaticallyIndexedArray<int32_t, 4> config;
};
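
// Descriptor layout sketch (simplified view of the GCN ISA docs linked above):
//   dwords 0-1: 64-bit base address   (written through 'address')
//   dword  2  : buffer range in bytes (written through 'range')
//   dword  3  : format/config dword   (written through 'config', i.e. CK_BUFFER_RESOURCE_3RD_DWORD)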

template <typename T>
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
{
    BufferResource<T> wave_buffer_resource;

    // wavewise base address (64 bit)
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
    // wavewise range (32 bit)
    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
    // wavewise setting (32 bit)
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;

    return wave_buffer_resource.content;
}
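
// Note: the range written above is in bytes (element count times sizeof(T)). Buffer
// instructions bounds-check per-thread offsets against this range, which is what the
// out-of-bounds handling used further below relies on.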

template <typename T>
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave)
{
    BufferResource<T> wave_buffer_resource;

    // wavewise base address (64 bit)
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
    // wavewise range (32 bit)
    wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
    // wavewise setting (32 bit)
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;

    return wave_buffer_resource.content;
}

// buffer load i8
__device__ int8_t
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
                               index_t voffset,
                               index_t soffset,
                               index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");

__device__ int8x2_t
llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");

__device__ int8x4_t
llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");

// buffer load i16
__device__ bhalf_t
llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");

__device__ bhalf2_t
llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");

__device__ bhalf4_t
llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");

// buffer load i32
__device__ int32_t
llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");

__device__ int32x2_t
llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");

__device__ int32x4_t
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");

// buffer load fp16
__device__ half_t
llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");

__device__ half2_t
llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");

__device__ half4_t
llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");

// buffer load fp32
__device__ float
llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");

__device__ float2_t
llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");

__device__ float4_t
llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");

// buffer store i8
__device__ void
llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
                                int32x4_t rsrc,
                                index_t voffset,
                                index_t soffset,
                                index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");

__device__ void
llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");

__device__ void
llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");

// buffer store i16
__device__ void
llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");

__device__ void
llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");

__device__ void
llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");

// buffer store i32
__device__ void
llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
                                 int32x4_t rsrc,
                                 index_t voffset,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");

__device__ void
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");

__device__ void
llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
                                   int32x4_t rsrc,
                                   index_t voffset,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");

// buffer store fp16
__device__ void
llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");

__device__ void
llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");

__device__ void
llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");

// buffer store fp32
__device__ void
llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                  int32x4_t rsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");

__device__ void
llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");

__device__ void
llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    int32x4_t rsrc,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");

// buffer atomic-add fp16
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
    half2_t vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");

// buffer atomic-add i32
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
    int32_t vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");

// buffer atomic-add fp32
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    float vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");

// buffer atomic-max fp64
__device__ double
llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                       int32x4_t rsrc, // dst_wave_buffer_resource
                                       int voffset,    // dst_thread_addr_offset
                                       int soffset,    // dst_wave_addr_offset
                                       int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");

// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct AmdBufferCoherenceEnum
{
    DefaultCoherence = 0, // default value
    GLC              = 1,
    SLC              = 2,
    GLC_SLC          = 3,
};
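
// Rough reading of the bits above (see the ISA manual for the authoritative definition):
// GLC requests a globally-coherent access (e.g. forcing a miss in L1 on loads), SLC marks
// the access as system-level coherent / streaming with respect to L2. The enum value is
// forwarded verbatim as the glc_slc operand of the intrinsics above, so a caller may, for
// example, instantiate amd_buffer_load_impl<float, 4, AmdBufferCoherenceEnum::GLC>(...)
// (hypothetical call, shown only to illustrate the template parameter).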

template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<int8_t, N>::type
amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
                         index_t src_thread_addr_offset,
                         index_t src_wave_addr_offset)
{
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");

    if constexpr(N == 1)
    {
        return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
                                              src_thread_addr_offset,
                                              src_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 2)
    {
        int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset,
                                                      static_cast<index_t>(coherence));

        return bit_cast<int8x2_t>(tmp);
    }
    else if constexpr(N == 4)
    {
        int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset,
                                                      static_cast<index_t>(coherence));

        return bit_cast<int8x4_t>(tmp);
    }
    else if constexpr(N == 8)
    {
        int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
                                                          src_thread_addr_offset,
                                                          src_wave_addr_offset,
                                                          static_cast<index_t>(coherence));

        return bit_cast<int8x8_t>(tmp);
    }
    else if constexpr(N == 16)
    {
        int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                          src_thread_addr_offset,
                                                          src_wave_addr_offset,
                                                          static_cast<index_t>(coherence));
        return bit_cast<int8x16_t>(tmp);
    }
    else if constexpr(N == 32)
    {
        int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
        int32x4_t tmp1 =
            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                              src_thread_addr_offset,
                                              src_wave_addr_offset + 4 * sizeof(int32_t),
                                              static_cast<index_t>(coherence));
        vector_type<int32_t, 8> tmp;

        tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
        tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;

        return bit_cast<int8x32_t>(tmp);
    }
    else if constexpr(N == 64)
    {
        int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
        int32x4_t tmp1 =
            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                              src_thread_addr_offset,
                                              src_wave_addr_offset + 4 * sizeof(int32_t),
                                              static_cast<index_t>(coherence));
        int32x4_t tmp2 =
            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                              src_thread_addr_offset,
                                              src_wave_addr_offset + 8 * sizeof(int32_t),
                                              static_cast<index_t>(coherence));
        int32x4_t tmp3 =
            llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
                                              src_thread_addr_offset,
                                              src_wave_addr_offset + 12 * sizeof(int32_t),
                                              static_cast<index_t>(coherence));

        vector_type<int32_t, 16> tmp;

        tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
        tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
        tmp.AsType<int32x4_t>()(Number<2>{}) = tmp2;
        tmp.AsType<int32x4_t>()(Number<3>{}) = tmp3;

        return bit_cast<int8x64_t>(tmp);
    }
}
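
// Note on the N == 32 and N == 64 branches above: the widest intrinsic declared in this file
// moves 16 bytes (dwordx4), so wider per-thread requests are assembled from multiple dwordx4
// loads whose wave address (soffset) advances in 16-byte steps.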

template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                  index_t src_thread_addr_offset,
                                                                  index_t src_wave_addr_offset)
{
    static_assert(
        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");

    using r_t = typename vector_type<T, N>::type;
    auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
    return bit_cast<r_t>(raw_data);
}
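
// Usage sketch (hypothetical values only): load 8 half_t per lane from a wavewise pointer
// p_src with a per-lane byte offset thread_offset_bytes:
//     int32x4_t rsrc = make_wave_buffer_resource(p_src, element_space_size);
//     auto v = amd_buffer_load_impl<half_t, 8>(rsrc, thread_offset_bytes, 0);
// The typed load simply forwards sizeof(T) * N bytes to amd_buffer_load_impl_raw and
// bit_casts the raw bytes back to vector_type<T, N>::type.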

template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void
amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread_data,
                          int32x4_t dst_wave_buffer_resource,
                          index_t dst_thread_addr_offset,
                          index_t dst_wave_addr_offset)
{
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");

    if constexpr(N == 1)
    {
        llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
                                        dst_wave_buffer_resource,
                                        dst_thread_addr_offset,
                                        dst_wave_addr_offset,
                                        static_cast<index_t>(coherence));
    }
    else if constexpr(N == 2)
    {
        llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
                                         dst_wave_buffer_resource,
                                         dst_thread_addr_offset,
                                         dst_wave_addr_offset,
                                         static_cast<index_t>(coherence));
    }
    else if constexpr(N == 4)
    {
        llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
                                         dst_wave_buffer_resource,
                                         dst_thread_addr_offset,
                                         dst_wave_addr_offset,
                                         static_cast<index_t>(coherence));
    }
    else if constexpr(N == 8)
    {
        llvm_amdgcn_raw_buffer_store_i32x2(bit_cast<int32x2_t>(src_thread_data),
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset,
                                           static_cast<index_t>(coherence));
    }
    else if constexpr(N == 16)
    {
        llvm_amdgcn_raw_buffer_store_i32x4(bit_cast<int32x4_t>(src_thread_data),
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset,
                                           static_cast<index_t>(coherence));
    }
    else if constexpr(N == 32)
    {
        vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};

        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset,
                                           static_cast<index_t>(coherence));

        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset + sizeof(int32_t) * 4,
                                           static_cast<index_t>(coherence));
    }
    else if constexpr(N == 64)
    {
        vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};

        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset,
                                           static_cast<index_t>(coherence));

        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset + sizeof(int32_t) * 4,
                                           static_cast<index_t>(coherence));

        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<2>{}],
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset + sizeof(int32_t) * 8,
                                           static_cast<index_t>(coherence));

        llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<3>{}],
                                           dst_wave_buffer_resource,
                                           dst_thread_addr_offset,
                                           dst_wave_addr_offset + sizeof(int32_t) * 12,
                                           static_cast<index_t>(coherence));
    }
}

template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
                                      int32x4_t dst_wave_buffer_resource,
                                      index_t dst_thread_addr_offset,
                                      index_t dst_wave_addr_offset)
{
    static_assert(
        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");

    using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;

    amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
                                                        dst_wave_buffer_resource,
                                                        dst_thread_addr_offset,
                                                        dst_wave_addr_offset);
}

template <typename T, index_t N>
__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<float, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<float, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(float),
                                                   0);
        }
    }
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data,
                                                     dst_wave_buffer_resource,
                                                     dst_thread_addr_offset,
                                                     dst_wave_addr_offset,
                                                     0);
        }
        else if constexpr(N == 4)
        {
            vector_type<half_t, 4> tmp{src_thread_data};

            static_for<0, 2, 1>{}([&](auto i) {
                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
                                                         dst_wave_buffer_resource,
                                                         dst_thread_addr_offset,
                                                         dst_wave_addr_offset + i * sizeof(half2_t),
                                                         0);
            });
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

            static_for<0, 4, 1>{}([&](auto i) {
                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
                                                         dst_wave_buffer_resource,
                                                         dst_thread_addr_offset,
                                                         dst_wave_addr_offset + i * sizeof(half2_t),
                                                         0);
            });
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
        }
        else if constexpr(N == 2)
        {
            vector_type<int32_t, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);
        }
        else if constexpr(N == 4)
        {
            vector_type<int32_t, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<2>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<3>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
                                                  0);
        }
    }
}

template <typename T, index_t N>
__device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");
    if constexpr(is_same<T, double>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<double, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(double),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<double, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(double),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(double),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(double),
                                                   0);
        }
    }
}

// buffer_load requires:
// 1) p_src_wave must point to global memory space
// 2) p_src_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
                                            index_t src_thread_element_offset,
                                            bool src_thread_element_valid,
                                            index_t src_element_space_size)
{
    const int32x4_t src_wave_buffer_resource =
        make_wave_buffer_resource(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, 0);
    return src_thread_element_valid ? tmp : vector_t(0);
#endif
}
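
// A note on the OOB-check offset trick used above (and in the store/atomic wrappers below):
// when the element is flagged invalid, 0x80000000 is added to the per-thread byte offset,
// pushing the access far past the byte range programmed into the buffer resource. The
// hardware range check of buffer instructions then returns zero for such loads and drops
// the corresponding stores/atomics, so no divergent branch is needed on the invalid path.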

// buffer_load requires:
// 1) p_src_wave must point to global memory space
// 2) p_src_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                        index_t src_thread_element_offset,
                                                        bool src_thread_element_valid,
                                                        index_t src_element_space_size,
                                                        T customized_value)
{
    const int32x4_t src_wave_buffer_resource =
        make_wave_buffer_resource(p_src_wave, src_element_space_size);

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, 0);

    return src_thread_element_valid ? tmp : vector_t(customized_value);
}

// buffer_store requires:
// 1) p_dst_wave must point to global memory
// 2) p_dst_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                 T* p_dst_wave,
                                 const index_t dst_thread_element_offset,
                                 const bool dst_thread_element_valid,
                                 const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}
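
// Usage sketch (hypothetical values only): store 4 floats per lane, masking lanes whose
// element offset falls outside the destination tensor:
//     float4_t v = ...;
//     amd_buffer_store<float, 4>(v, p_dst, thread_element_offset, is_valid, element_space_size);
// Note that this wrapper takes an element offset and scales it by sizeof(T) internally,
// whereas the *_impl functions above take byte offsets.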

// buffer_atomic_add requires:
// 1) p_dst_wave must point to global memory
// 2) p_dst_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;

    amd_buffer_atomic_add_impl<scalar_t, vector_size>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

// buffer_atomic_max requires:
// 1) p_dst_wave must point to global memory
// 2) p_dst_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_max(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;

    amd_buffer_atomic_max_impl<scalar_t, vector_size>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_atomic_max_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

// Direct loads from global to LDS.
__device__ void
llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                __attribute__((address_space(3))) uint32_t* lds_ptr,
                                index_t size,
                                index_t voffset,
                                index_t soffset,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");

template <typename T, index_t NumElemsPerThread>
__device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                              const index_t global_offset,
                                              T* lds_base_ptr,
                                              const index_t lds_offset,
                                              const bool is_valid,
                                              const index_t src_element_space_size)
{
    // Direct loads require that each thread reads and writes exactly a single DWORD.
    constexpr auto dword_bytes      = 4;
    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
    static_assert(bytes_per_thread == dword_bytes);

    const uint32_t* global_ptr =
        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
    const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;

#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
    T* lds_ptr = lds_base_ptr + lds_offset;
    auto const lds_ptr_sgpr =
        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
    asm volatile("s_mov_b32 m0, %0; \n\t"
                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
                 "v"(global_offset_bytes),
                 "s"(src_resource));
#else
    // LDS pointer must be attributed with the LDS address space.
    __attribute__((address_space(3))) uint32_t* lds_ptr =
        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));

    llvm_amdgcn_raw_buffer_load_lds(
        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
#endif
}
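
// Background on the inline-asm path above: for direct-to-LDS buffer loads the LDS write
// address is not an instruction operand; the hardware takes the LDS base from the m0
// register. That is why the code broadcasts the LDS pointer to an SGPR with
// __builtin_amdgcn_readfirstlane and writes it to m0 before issuing
// "buffer_load_dword ... lds". The same invalid-element handling as elsewhere in this file
// applies: an invalid lane uses the 0x80000000 offset so its load is dropped by the
// buffer range check.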

} // namespace ck