mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
clean up
This commit is contained in:
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr index_t HPad = 0;
|
||||
constexpr index_t WPad = 0;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x1 filter, 14x14 image, C = 512
|
||||
constexpr index_t N = 128;
|
||||
constexpr index_t C = 512;
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
typedef float Float4 __attribute__((ext_vector_type(4)));
|
||||
|
||||
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
|
||||
#include "common.hip.hpp"
|
||||
|
||||
#define NO_VM_WAIT 0
|
||||
#define NO_LGKM_WAIT 0
|
||||
@@ -10,7 +7,10 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
|
||||
#define NO_DS_WRITE 0
|
||||
#define NO_GLB_READ 0
|
||||
|
||||
inline __device__ void vmcnt(index_t cnt)
|
||||
// cast a pointer of LDS to its address
|
||||
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
|
||||
|
||||
__device__ void vmcnt(index_t cnt)
|
||||
{
|
||||
#if !NO_VM_WAIT
|
||||
if(cnt == 0)
|
||||
@@ -44,7 +44,7 @@ inline __device__ void vmcnt(index_t cnt)
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void lgkmcnt(index_t cnt)
|
||||
__device__ void lgkmcnt(index_t cnt)
|
||||
{
|
||||
#if !NO_LGKM_WAIT
|
||||
if(cnt == 0)
|
||||
@@ -84,7 +84,7 @@ inline __device__ void lgkmcnt(index_t cnt)
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
|
||||
__device__ void outerProduct1x4(const float* a, const float* b, float* c)
|
||||
{
|
||||
asm volatile("\n \
|
||||
v_mac_f32 %0, %4, %5 \n \
|
||||
@@ -104,7 +104,9 @@ inline __device__ void outerProduct1x4(const float* a, const float* b, float* c)
|
||||
"3"(c[3]));
|
||||
}
|
||||
|
||||
inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4& c)
|
||||
__device__ void outerProduct1x4(const float& a,
|
||||
const vector_type<float, 4>::MemoryType& b,
|
||||
vector_type<float, 4>::MemoryType& c)
|
||||
{
|
||||
#if 0
|
||||
asm volatile(
|
||||
@@ -123,8 +125,12 @@ inline __device__ void outerProduct1x4(const float& a, const Float4& b, Float4&
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void
|
||||
outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4& c2, Float4& c3)
|
||||
__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
|
||||
const vector_type<float, 4>::MemoryType& b,
|
||||
vector_type<float, 4>::MemoryType& c0,
|
||||
vector_type<float, 4>::MemoryType& c1,
|
||||
vector_type<float, 4>::MemoryType& c2,
|
||||
vector_type<float, 4>::MemoryType& c3)
|
||||
{
|
||||
#if 0
|
||||
asm volatile(
|
||||
@@ -179,7 +185,9 @@ outerProduct4x4(const Float4& a, const Float4& b, Float4& c0, Float4& c1, Float4
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* c)
|
||||
__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
|
||||
const vector_type<float, 4>::MemoryType* b,
|
||||
vector_type<float, 4>::MemoryType* c)
|
||||
{
|
||||
outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
|
||||
outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
|
||||
@@ -187,7 +195,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
|
||||
outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
|
||||
}
|
||||
|
||||
inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
|
||||
__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
|
||||
{
|
||||
#if !NO_DS_READ
|
||||
if(offset == 0)
|
||||
@@ -413,7 +421,9 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0)
|
||||
__device__ void global_load(vector_type<float, 4>::MemoryType& r,
|
||||
const vector_type<float, 4>::MemoryType* ptr,
|
||||
index_t offset = 0)
|
||||
{
|
||||
#if !NO_GLB_READ
|
||||
if(offset == 0)
|
||||
@@ -431,7 +441,8 @@ inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0)
|
||||
__device__ void
|
||||
ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
|
||||
{
|
||||
#if !NO_DS_WRITE
|
||||
if(offset == 0)
|
||||
@@ -130,6 +130,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
const FloatB* __restrict__ p_b_block,
|
||||
FloatC* __restrict__ p_c_thread) const
|
||||
{
|
||||
static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
|
||||
is_same<FloatC, float>::value,
|
||||
"Run_asm only deal with float\n");
|
||||
|
||||
constexpr auto True = integral_constant<bool, true>{};
|
||||
constexpr auto False = integral_constant<bool, false>{};
|
||||
|
||||
@@ -158,6 +162,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
|
||||
Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
|
||||
|
||||
static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
|
||||
MPerThread == 8 && NPerThread == 8,
|
||||
"Run_asm cannot deal with this GEMM shape yet\n");
|
||||
|
||||
using Float4 = vector_type<float, 4>::MemoryType;
|
||||
|
||||
float p_thread[a_thread_mtx.GetElementSpace() + b_thread_mtx.GetElementSpace()];
|
||||
|
||||
FloatA* p_a_thread = p_thread;
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
#include "Array.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
|
||||
#if DEVICE_BACKEDN_HIP
|
||||
#include "inline_asm.hpp"
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#include "amd_inline_asm.hip.hpp"
|
||||
#endif
|
||||
|
||||
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
|
||||
@@ -67,8 +67,3 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
|
||||
return x < y ? x : y;
|
||||
}
|
||||
} // namespace mod_conv
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
// cast a pointer of LDS to its address
|
||||
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
|
||||
#endif
|
||||
|
||||
@@ -16,12 +16,14 @@ template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store
|
||||
// instruction
|
||||
typedef float MemoryType __attribute__((ext_vector_type(2)));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
// For some reason, CUDA need this definition to, otherwise
|
||||
// compiler won't generate optimal load and store instruction, and
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct
|
||||
// instruction,
|
||||
using MemoryType = float2;
|
||||
#endif
|
||||
|
||||
@@ -43,12 +45,14 @@ template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
#if DEVICE_BACKEND_HIP
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store
|
||||
// instruction
|
||||
typedef float MemoryType __attribute__((ext_vector_type(4)));
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
// For some reason, CUDA need this definition to, otherwise
|
||||
// compiler won't generate optimal load and store instruction, and
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct
|
||||
// instruction,
|
||||
using MemoryType = float4;
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -239,9 +239,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,11 +34,6 @@ template <index_t GridSize,
|
||||
index_t WeiBlockCopyDataPerRead>
|
||||
struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
__host__
|
||||
__device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer()
|
||||
{
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
@@ -203,7 +198,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
|
||||
// preload data into LDS
|
||||
{
|
||||
#if 1
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
@@ -212,19 +206,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_double);
|
||||
#if 0
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_double);
|
||||
#elif 0
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
p_in_register_clipboard);
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
|
||||
p_wei_block_double);
|
||||
#elif 1
|
||||
vmcnt(0);
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_double);
|
||||
@@ -278,28 +264,26 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 1
|
||||
#if 0
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_now + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_now + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_next);
|
||||
#elif 0
|
||||
// if work with RunLoadRegisterClipboard_asm, need to wait
|
||||
#elif 1
|
||||
vmcnt(0);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
|
||||
@@ -329,32 +313,29 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 1
|
||||
#if 0
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_double + in_block_space);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_double + wei_block_space);
|
||||
#else
|
||||
// if work with RunLoadRegisterClipboard_asm, need to wait
|
||||
vmcnt(0);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_double + in_block_space);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
|
||||
p_wei_block_double + wei_block_space);
|
||||
#endif
|
||||
@@ -366,17 +347,17 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 1
|
||||
#if 0
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_block_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + in_block_space + y * Wi + x,
|
||||
p_out_thread);
|
||||
(p_wei_block_double + wei_block_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + in_block_space + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user