mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
debugging
This commit is contained in:
@@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead = 4;
|
||||
|
||||
constexpr index_t BlockSize = 256;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
|
||||
constexpr index_t BPerBlock = 64;
|
||||
constexpr index_t KPerBlock = 128;
|
||||
@@ -266,7 +266,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
constexpr auto gridwise_conv =
|
||||
#if 0
|
||||
#if 1
|
||||
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
#else
|
||||
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "inline_asm.hpp"
|
||||
|
||||
template <index_t BlockSize, class Float, class DstDesc, class F>
|
||||
__device__ void
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "threadwise_gemm.hip.hpp"
|
||||
|
||||
// if following number are power of 2, index calculation shall be greatly reduced:
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
#include "Array.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
|
||||
#if DEVICE_BACKEDN_HIP
|
||||
#include "inline_asm.hpp"
|
||||
#endif
|
||||
|
||||
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
|
||||
|
||||
__device__ index_t get_block_1d_id() { return blockIdx.x; }
|
||||
|
||||
@@ -15,7 +15,11 @@ struct vector_type<float, 1>
|
||||
template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
#if 1
|
||||
typedef float MemoryType __attribute__((ext_vector_type(2)));
|
||||
#else
|
||||
using MemoryType = float2;
|
||||
#endif
|
||||
|
||||
__host__ __device__ static MemoryType Pack(float s0, float s1)
|
||||
{
|
||||
@@ -34,7 +38,11 @@ struct vector_type<float, 2>
|
||||
template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
#if 1
|
||||
typedef float MemoryType __attribute__((ext_vector_type(4)));
|
||||
#else
|
||||
using MemoryType = float4;
|
||||
#endif
|
||||
};
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -222,21 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block);
|
||||
#elif 1
|
||||
Float4 tmp_in, tmp_wei;
|
||||
Float4* glb_in_p =
|
||||
(Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
|
||||
Float4* loc_in_p = (Float4*)(p_in_block + blockwise_in_copy.mDstMyThreadOffset);
|
||||
|
||||
Float4* glb_wei_p =
|
||||
(Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
|
||||
Float4* loc_wei_p = (Float4*)(p_wei_block + blockwise_wei_copy.mDstMyThreadOffset);
|
||||
|
||||
global_load(tmp_in, glb_in_p);
|
||||
global_load(tmp_wei, glb_wei_p);
|
||||
vmcnt(0);
|
||||
ds_write_b128(tmp_in, loc_in_p);
|
||||
ds_write_b128(tmp_wei, loc_wei_p);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
@@ -247,11 +232,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
{
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
#elif 0
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include "inline_asm.hpp"
|
||||
|
||||
template <class Float, class SrcMatrix, class DstMatrix, index_t NRow, index_t NCol>
|
||||
__device__ void threadwise_matrix_copy(SrcMatrix,
|
||||
const Float* __restrict__ p_src,
|
||||
|
||||
Reference in New Issue
Block a user