mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-12 01:10:17 +00:00)
use ford/for instead of static_ford/static_for in threadwise copy; somehow register spill is greatly reduced on AMD
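The gist of the change: static_ford/static_for walk the copy's index space at compile time, instantiating every iteration as separate straight-line code, while ford/for iterate at run time. Below is a minimal stand-alone sketch of the two styles; ford_like and static_for_like are simplified stand-ins written for this note, not composable_kernel's actual ford/static_ford API.

#include <array>
#include <cstddef>
#include <cstdio>
#include <type_traits>

// Runtime multi-index loop, in the spirit of ck's ford: a single odometer
// loop, so the index state is a small array rather than unrolled code.
template <std::size_t N, class F>
void ford_like(const std::array<int, N>& lengths, F f)
{
    std::array<int, N> id{}; // multi-index; last dimension varies fastest
    bool done = false;
    while(!done)
    {
        f(id);
        done = true;
        for(std::size_t d = N; d-- > 0;)
        {
            if(++id[d] < lengths[d]) { done = false; break; }
            id[d] = 0;
        }
    }
}

// Compile-time loop, in the spirit of ck's static_for: each iteration is a
// distinct instantiation, so the loop is emitted fully unrolled. Deep
// unrolling of copy loops is the plausible source of the register spills
// this commit works around, though the commit message itself only observes
// the effect ("somehow").
template <int I, int N, class F>
void static_for_like(F f)
{
    if constexpr(I < N)
    {
        f(std::integral_constant<int, I>{});
        static_for_like<I + 1, N>(f);
    }
}

int main()
{
    ford_like<2>({2, 3}, [](const std::array<int, 2>& id) {
        std::printf("ford %d %d\n", id[0], id[1]);
    });
    static_for_like<0, 4>([](auto i) { std::printf("static_for %d\n", int(i)); });
    return 0;
}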
@@ -155,7 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
 
-#if 1
+#if 0
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
@@ -178,7 +178,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 #else
         auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
             BlockSize,
-            Float,
             decltype(in_e_n1_b_n2_global_merged_desc),
             decltype(in_e_n1_b_n2_block_desc),
             MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
@@ -200,7 +199,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Sequence<EPerBlock, KPerBlock>{},
             Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
 
-#if 1
+#if 0
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
@@ -223,7 +222,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 #else
         auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
             BlockSize,
-            Float,
             decltype(wei_e_k_global_desc),
             decltype(wei_e_k_block_desc),
             NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
@@ -326,7 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
         Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
             // True);
@@ -358,7 +356,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
                 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
                 // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
                 p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -439,17 +437,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
 #if 0
-            threadwise_generic_tensor_slice_copy_v1(
-                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
-                p_out_thread,
-                {0, 0, 0, 0, 0, 0, 0, 0},
-                out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
-                p_out_thread_on_global,
-                {0, 0, 0, 0, 0, 0, 0, 0},
-                out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-                arithmetic_sequence_gen<0, 8, 1>::type{},
-                Number<1>{});
-#elif 0
             ThreadwiseGenericTensorSliceCopy_v1r1<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -461,7 +448,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 1,
                 1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
                 .Run(p_out_thread, p_out_thread_on_global);
-#elif 1
+#elif 0
             ThreadwiseGenericTensorSliceCopy_v1r2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -471,7 +458,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 1,
                 1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
                 .Run(p_out_thread, p_out_thread_on_global);
-#elif 0
+#elif 1
             ThreadwiseGenericTensorSliceCopy_v2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
@@ -22,7 +22,6 @@ namespace ck {
 // repeat-length on the merged dimension need to be 1. These sanity checks are performed
 // in constructor of BlockwiseGenericTensorSliceCopy_v1
 template <index_t BlockSize,
-          class Float,
           class SrcDesc,
           class DstDesc,
           class SliceLengths,
@@ -202,8 +201,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
         return GetRegisterBufferDescriptor().GetElementSpace();
     }
 
-    __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
-                                          Float* __restrict__ p_buffer) const
+    template <class TData>
+    __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
+                                          TData* __restrict__ p_buffer) const
     {
         constexpr auto thread_sub_tensor_lengths = SubLengths{};
 
@@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             thread_sub_tensor_lengths,
             SrcDimAccessOrder{},
             Number<SrcDataPerAccess>{});
-#elif 0
+#elif 1
         ThreadwiseGenericTensorSliceCopy_v1r1<
             SrcDesc,
             decltype(thread_buffer_desc),
@@ -281,8 +281,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer,
-                                           Float* __restrict__ p_dst) const
+    template <class TData>
+    __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
+                                           TData* __restrict__ p_dst) const
     {
         constexpr auto thread_sub_tensor_lengths = SubLengths{};
 
@@ -333,7 +334,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             thread_sub_tensor_lengths,
             DstDimAccessOrder{},
             Number<DstDataPerAccess>{});
-#elif 0
+#elif 1
         ThreadwiseGenericTensorSliceCopy_v1r1<
             decltype(thread_buffer_desc),
             DstDesc,
@@ -360,9 +361,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });
     }
 
-    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
+    template <class TData>
+    __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
     {
-        Float p_buffer[GetRegisterBufferSize()];
+        TData p_buffer[GetRegisterBufferSize()];
 
         RunLoadRegisterBuffer(p_src, p_buffer);
         RunStoreRegisterBuffer(p_buffer, p_dst);
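The three hunks above share one pattern: the element type moves from the class template parameter Float to a per-method template <class TData>, so the copy plan (descriptors, cluster layout) is baked into the class while the data type is supplied at the call site. A hedged sketch of the shape of this refactor, with toy names invented for illustration:

#include <cstddef>

// Before (toy reduction): element type fixed at class level.
template <class Float>
struct CopierBefore
{
    void Run(const Float* src, Float* dst, std::size_t n) const
    {
        for(std::size_t i = 0; i < n; ++i)
            dst[i] = src[i];
    }
};

// After: one copier instance can service buffers of any element type.
struct CopierAfter
{
    template <class TData>
    void Run(const TData* src, TData* dst, std::size_t n) const
    {
        for(std::size_t i = 0; i < n; ++i)
            dst[i] = src[i];
    }
};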
@@ -10,10 +10,18 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 #endif
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#endif
+
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#endif
+
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#endif
+
 namespace ck {
 
 // user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
@@ -216,6 +224,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
             src_vector_access_dim,
             SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
         static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
             constexpr auto src_data_begin_id = src_access_id.Modify(
                 src_vector_access_dim,
@@ -239,6 +248,31 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
                 p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
             });
         });
+#else
+        ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
+            auto src_data_begin_id = src_access_id;
+            src_data_begin_id(src_vector_access_dim) =
+                src_access_id[src_vector_access_dim] * src_data_per_access;
+
+            const index_t src_offset =
+                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
+
+            // load vector from src
+            const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+
+            // unpack vector into buffer
+            for(index_t i = 0; i < SrcDataPerAccess; ++i)
+            {
+                auto scalar_id = make_zero_array<index_t, nDim>();
+                scalar_id(src_vector_access_dim) = i;
+
+                const index_t buffer_offset =
+                    buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
+
+                p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+            }
+        });
+#endif
     }
 
     // copy data from buffer to dst
@@ -252,6 +286,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
             dst_vector_access_dim,
             SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
         static_ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
             constexpr auto dst_data_begin_id = dst_access_id.Modify(
                 dst_vector_access_dim,
@@ -277,6 +312,33 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1
             // store vector into dst
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
         });
+#else
+        ford<decltype(dst_access_lengths), DstDimAccessOrder>{}([&](auto dst_access_id) {
+            auto dst_data_begin_id = dst_access_id;
+            dst_data_begin_id(dst_vector_access_dim) =
+                dst_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+            vector_t vector_data;
+
+            // pack vector from buffer
+            for(index_t i = 0; i < DstDataPerAccess; ++i)
+            {
+                auto scalar_id = make_zero_array<index_t, nDim>();
+                scalar_id(dst_vector_access_dim) = i;
+
+                const index_t buffer_offset =
+                    buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
+
+                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+            }
+
+            const index_t dst_offset =
+                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
+
+            // store vector into dst
+            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
+        });
+#endif
     }
 }
@@ -373,7 +435,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
         constexpr auto long_vector_access_lengths = SliceLengths::Modify(
             vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
 
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
         static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
             auto long_vector_access_id) {
 
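The hunk above is a guard fix rather than a path change: #ifndef tests only whether the macro is defined, and this commit's headers now always define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 (to 0 or 1), so with #ifndef the value could never select the static branch. #if tests the value itself. A minimal stand-alone demonstration, using a hypothetical short macro name:

#include <cstdio>

#define USE_STATIC 1 // stand-in for the CK_EXPERIMENTAL_..._V1R2 macro, always defined

int main()
{
#ifndef USE_STATIC
    std::puts("unreachable once the macro is defined, even as 1");
#else
    std::puts("#ifndef ignores the value: this branch is taken regardless");
#endif

#if USE_STATIC
    std::puts("#if keys on the value: selected when the macro is 1");
#else
    std::puts("selected when the macro is 0");
#endif
    return 0;
}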
@@ -524,6 +586,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
         using SrcNormalDimSliceLengthsHack =
             decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
         static_ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
             constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
 
@@ -541,6 +604,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
                 p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
             });
         });
+#else
+        ford<SrcMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
+            const TData* p_src_tmp = p_src + (mSrcSliceOrigin + merged_dim_data_id).GetOffset();
+
+            ford<SrcNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
+                const index_t buffer_offset =
+                    buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
+
+                const index_t src_normal_offset =
+                    SrcDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
+
+                p_buffer[buffer_offset] = p_src_tmp[src_normal_offset];
+            });
+        });
+#endif
 
         // DstMergedDimSliceLengthsHack has entry same as SliceLengths on dst merged dimensions,
         // but 1 on normal dimensions;
@@ -553,6 +631,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2
         using DstNormalDimSliceLengthsHack =
             decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
 
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2
         static_ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id_) {
             constexpr auto merged_dim_data_id = decltype(merged_dim_data_id_){};
 
@@ -570,6 +649,21 @@ struct ThreadwiseGenericTensorSliceCopy_v2
                 p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
             });
         });
+#else
+        ford<DstMergedDimSliceLengthsHack>{}([&](auto merged_dim_data_id) {
+            TData* p_dst_tmp = p_dst + (mDstSliceOrigin + merged_dim_data_id).GetOffset();
+
+            ford<DstNormalDimSliceLengthsHack>{}([&](auto normal_dim_data_id) {
+                const index_t buffer_offset =
+                    buffer_desc.GetOffsetFromMultiIndex(merged_dim_data_id + normal_dim_data_id);
+
+                const index_t dst_normal_offset =
+                    DstDesc::GetOffsetFromMultiIndex(normal_dim_data_id);
+
+                p_dst_tmp[dst_normal_offset] = p_buffer[buffer_offset];
+            });
+        });
+#endif
     }
 
     // T can be Sequence or Array
@@ -8,7 +8,9 @@
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 
 namespace ck {
@@ -10,7 +10,9 @@
 #define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 
 namespace ck {
@@ -112,14 +112,14 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmDataPerReadA = 4;
     constexpr index_t GemmDataPerReadB = 4;
 
-    using InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 1, 4, 1>;
-    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>;
+    using InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 1, 2, 2>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>;
     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
 
-    constexpr index_t InBlockCopySrcDataPerRead_B   = 4;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 2;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2;
 
     using WeiBlockCopySubLengths_E_K     = Sequence<2, 2>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
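A sanity check on the retuned input-blockwise-copy parameters above: the per-thread SubLengths multiplied elementwise by the thread ClusterLengths must cover the same [E, N1, B, N2] block slice before and after. Both settings tile an 8 x 2 x 16 x 4 slice with 256 threads; only the per-thread shape and vector widths change (B-reads narrow from 4 to 2, N2-writes widen from 1 to 2). A stand-alone compile-time check written for this note, not repo code:

// Illustrative only: verify the old and new tunables tile the same slice.
constexpr int old_sub[4]     = {1, 1, 4, 1}; // InBlockCopySubLengths_E_N1_B_N2
constexpr int old_cluster[4] = {8, 2, 4, 4}; // InBlockCopyClusterLengths_E_N1_B_N2
constexpr int new_sub[4]     = {1, 1, 2, 2};
constexpr int new_cluster[4] = {8, 2, 8, 2};

constexpr bool same_slice()
{
    for(int d = 0; d < 4; ++d)
        if(old_sub[d] * old_cluster[d] != new_sub[d] * new_cluster[d])
            return false; // slice shape would have changed
    return true;
}

static_assert(same_slice(), "retuned parameters must cover the same block slice");
// Per dimension the covered slice is {8, 2, 16, 4} both before and after;
// the thread count 8*2*4*4 = 8*2*8*2 = 256 is likewise unchanged.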
@@ -16,7 +16,7 @@
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 
 struct GeneratorTensor_1
 {
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,