mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
behavior has changed (better and worse), figuring out why
This commit is contained in:
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
|
||||
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
// for 3x3, 34x34, v1r3, Pascal
|
||||
constexpr index_t BlockSize = 128;
|
||||
|
||||
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
|
||||
@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor
|
||||
"wrong! dimensions to be unfolded need to be packed");
|
||||
|
||||
// check ranks
|
||||
static_assert(GetMemoryRank(IDim_p1) = GetMemoryRank(IDim) + 1,
|
||||
"wrong! ranks of dimensions to be "
|
||||
"unfolded need to be in increasing "
|
||||
"and continuous ranks");
|
||||
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
|
||||
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
|
||||
"continuous ranks");
|
||||
});
|
||||
|
||||
// left and right
|
||||
|
||||
@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
constexpr auto thread_cluster_lengths =
|
||||
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
|
||||
|
||||
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
|
||||
constexpr auto thread_cluster_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
|
||||
|
||||
// sanity check: data type
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
|
||||
@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
|
||||
return thread_tensor_desc.GetElementSpace();
|
||||
}
|
||||
@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
|
||||
@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
|
||||
|
||||
static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) { // fwd do nothing but
|
||||
// perfect forwarding.
|
||||
// Using this trick to
|
||||
// perfect forwarding.
|
||||
// Using this trick to
|
||||
// make this lambda a generic lambda, so it won't be compiled until
|
||||
// instantiated
|
||||
static_assert(
|
||||
|
||||
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
|
||||
// choose GEMM implementation here
|
||||
const auto run_blockwise_batch_gemm = [&](auto... Xs) {
|
||||
#if 1
|
||||
#if 0
|
||||
return blockwise_batch_gemm.Run(Xs...);
|
||||
#elif 0
|
||||
return blockwise_batch_gemm.Run_asm(Xs...);
|
||||
|
||||
Reference in New Issue
Block a user