mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 19:09:59 +00:00
debugged: CUDA should use its own float4 definition
[ROCm/composable_kernel commit: 7a251a0922]
This commit is contained in:
@@ -15,9 +15,13 @@ struct vector_type<float, 1>
|
||||
template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
#if 1
|
||||
#if DEVICE_BACKEND_HIP
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
|
||||
typedef float MemoryType __attribute__((ext_vector_type(2)));
|
||||
#else
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
// For some reason, CUDA need this definition to, otherwise
|
||||
// compiler won't generate optimal load and store instruction, and
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
|
||||
using MemoryType = float2;
|
||||
#endif
|
||||
|
||||
@@ -38,9 +42,13 @@ struct vector_type<float, 2>
|
||||
template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
#if 1
|
||||
#if DEVICE_BACKEND_HIP
|
||||
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
|
||||
typedef float MemoryType __attribute__((ext_vector_type(4)));
|
||||
#else
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
// For some reason, CUDA need this definition to, otherwise
|
||||
// compiler won't generate optimal load and store instruction, and
|
||||
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
|
||||
using MemoryType = float4;
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -204,8 +204,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
// preload data into LDS
|
||||
{
|
||||
#if 1
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double);
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
p_in_register_clipboard);
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_double);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_double);
|
||||
#elif 0
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
@@ -363,9 +373,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + in_block_space +
|
||||
(p_wei_block_double + wei_block_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + wei_block_space + y * Wi + x,
|
||||
p_in_block_double + in_block_space + y * Wi + x,
|
||||
p_out_thread);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user