mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
revert contraction deviceop changes
This commit is contained in:
@@ -656,26 +656,16 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
|
||||
{
|
||||
bool valid_as_access = true;
|
||||
static_for<0, NumATensor, 1>{}([&](auto i) {
|
||||
// vector memory access of A: could be on M or AK1 dimension
|
||||
if constexpr(ABlockTransferSrcVectorDim == 1)
|
||||
const bool valid_a_vector_size =
|
||||
arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
|
||||
const bool valid_a_access_dim_m =
|
||||
ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
|
||||
const bool valid_a_access_dim_k =
|
||||
ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
|
||||
const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
|
||||
if(!(valid_a_vector_size && valid_a_access_dim))
|
||||
{
|
||||
if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) %
|
||||
ABlockTransferSrcScalarPerVector ==
|
||||
0) &&
|
||||
ABlockTransferSrcScalarPerVector != 1)
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) %
|
||||
ABlockTransferSrcScalarPerVector ==
|
||||
0) &&
|
||||
ABlockTransferSrcScalarPerVector != 1)
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
valid_as_access = false;
|
||||
}
|
||||
});
|
||||
if(!valid_as_access)
|
||||
@@ -694,23 +684,7 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
|
||||
const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
|
||||
if(!(valid_b_vector_size && valid_b_access_dim))
|
||||
{
|
||||
if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) %
|
||||
BBlockTransferSrcScalarPerVector ==
|
||||
0) &&
|
||||
BBlockTransferSrcScalarPerVector != 1)
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) %
|
||||
BBlockTransferSrcScalarPerVector ==
|
||||
0) &&
|
||||
BBlockTransferSrcScalarPerVector != 1)
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
valid_bs_access = false;
|
||||
}
|
||||
});
|
||||
if(!valid_bs_access)
|
||||
@@ -720,22 +694,16 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
|
||||
|
||||
bool valid_ds_access = true;
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
if(!(arg.ds_nz_stride_[i] == 1 &&
|
||||
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
|
||||
CDEBlockTransferScalarPerVector_NPerBlock ==
|
||||
0) &&
|
||||
CDEBlockTransferScalarPerVector_NPerBlock != 1)
|
||||
const bool valid_d_vector_size =
|
||||
arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
|
||||
// Vector read of Ds is always on N dimension.
|
||||
const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
|
||||
if(!(valid_d_vector_size && valid_d_access_dim))
|
||||
{
|
||||
valid_ds_access = false;
|
||||
}
|
||||
});
|
||||
|
||||
// vector memory access of E: always on NPerBlock dimension
|
||||
if(!(arg.e_nz_stride_ == 1 &&
|
||||
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
|
||||
CDEBlockTransferScalarPerVector_NPerBlock ==
|
||||
0) &&
|
||||
CDEBlockTransferScalarPerVector_NPerBlock != 1)
|
||||
if(!valid_ds_access)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -169,78 +169,6 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
static constexpr auto I3 = Number<3>{};
|
||||
|
||||
#if 0
|
||||
static constexpr auto matrix_padder =
|
||||
MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
|
||||
|
||||
static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideAs)
|
||||
{
|
||||
const auto a_grid_desc_mraw_kraw = [&]() {
|
||||
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, AsLayout>)
|
||||
{
|
||||
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
|
||||
make_tuple(StrideAs, I1));
|
||||
}
|
||||
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, AsLayout>)
|
||||
{
|
||||
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
|
||||
make_tuple(I1, StrideAs));
|
||||
}
|
||||
}();
|
||||
|
||||
return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
|
||||
}
|
||||
|
||||
static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideBs)
|
||||
{
|
||||
const auto b_grid_desc_nraw_kraw = [&]() {
|
||||
if constexpr(is_same<tensor_layout::gemm::RowMajor, BsLayout>::value)
|
||||
{
|
||||
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
|
||||
make_tuple(I1, StrideBs));
|
||||
}
|
||||
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BsLayout>::value)
|
||||
{
|
||||
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
|
||||
make_tuple(StrideBs, I1));
|
||||
}
|
||||
}();
|
||||
|
||||
return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
|
||||
}
|
||||
|
||||
template <typename ELay>
|
||||
static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
|
||||
{
|
||||
const auto e_grid_desc_mraw_nraw = [&]() {
|
||||
if constexpr(is_same<tensor_layout::gemm::RowMajor, ELay>::value)
|
||||
{
|
||||
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
|
||||
make_tuple(StrideE, I1));
|
||||
}
|
||||
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELay>::value)
|
||||
{
|
||||
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
|
||||
make_tuple(I1, StrideE));
|
||||
}
|
||||
}();
|
||||
|
||||
return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
|
||||
}
|
||||
|
||||
static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
|
||||
const std::array<index_t, NumDTensor>& NRaws,
|
||||
const std::array<index_t, NumDTensor>& DsStride)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
|
||||
|
||||
return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
|
||||
},
|
||||
Number<NumDTensor>{});
|
||||
}
|
||||
#endif
|
||||
using ComputeDataType = EDataType;
|
||||
|
||||
// GridwiseGemm
|
||||
@@ -424,15 +352,6 @@ struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayou
|
||||
}
|
||||
}
|
||||
|
||||
void Print() const
|
||||
{
|
||||
// std::cout << "A[M, K]: " << as_grid_desc_m_k_ << std::endl;
|
||||
// std::cout << "B[N, K]: " << bs_grid_desc_n_k_ << std::endl;
|
||||
// static_for<0, NumDTensor, 1>{}(
|
||||
//[&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
|
||||
// std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
|
||||
}
|
||||
|
||||
// private:
|
||||
// pointers
|
||||
typename GridwiseGemm::AsGridPointer p_as_grid_;
|
||||
|
||||
@@ -10,38 +10,9 @@
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
|
||||
// and sometimes useless instructions:
|
||||
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
|
||||
// instead
|
||||
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
|
||||
// tensor coordinate instead
|
||||
// 3. Don't use a pointer to VGPR buffer, use vector instead
|
||||
|
||||
namespace detail {
|
||||
// TODO: How to fix this? It uses an struct instead of lambda because lambda
|
||||
// doesn't have constructor
|
||||
template <index_t VectorDim, index_t ScalarPerVector>
|
||||
struct lambda_scalar_per_access
|
||||
{
|
||||
__host__ __device__ constexpr auto operator()(index_t i) const
|
||||
{
|
||||
return (i == VectorDim) ? ScalarPerVector : 1;
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t VectorDim>
|
||||
struct lambda_scalar_step_in_vector
|
||||
{
|
||||
__host__ __device__ constexpr auto operator()(index_t i) const
|
||||
{
|
||||
return (i == VectorDim) ? 1 : 0;
|
||||
}
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
// Assume:
|
||||
// 1. src:
|
||||
// 1. SrcDesc is known at compile-time
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -7,44 +7,13 @@
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
|
||||
#include "ck/tensor/static_tensor.hpp"
|
||||
#include "ck/utility/is_detected.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
namespace detail {
|
||||
// TODO: How to fix this? It uses an struct instead of lambda because lambda
|
||||
// doesn't have constructor
|
||||
template <index_t SrcVectorDim,
|
||||
index_t SrcScalarPerVector,
|
||||
index_t DstVectorDim,
|
||||
index_t DstScalarPerVector>
|
||||
struct lambda_scalar_per_access_for_src_and_dst
|
||||
{
|
||||
__host__ __device__ constexpr auto operator()(index_t i) const
|
||||
{
|
||||
if(i == SrcVectorDim && i == DstVectorDim)
|
||||
{
|
||||
return math::lcm(SrcScalarPerVector, DstScalarPerVector);
|
||||
}
|
||||
else if(i == SrcVectorDim)
|
||||
{
|
||||
return SrcScalarPerVector;
|
||||
}
|
||||
else if(i == DstVectorDim)
|
||||
{
|
||||
return DstScalarPerVector;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Assume:
|
||||
// 1. src_desc and dst_desc are not known at compile-time
|
||||
// 2. SrcBuffer and DstBuffer are DynamicBuffer
|
||||
|
||||
@@ -10,40 +10,9 @@
|
||||
#include "ck/utility/is_detected.hpp"
|
||||
#include "ck/tensor/static_tensor.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_util.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
namespace detail {
|
||||
// TODO: How to fix this? It uses an struct instead of lambda because lambda
|
||||
// doesn't have constructor
|
||||
template <index_t SrcVectorDim,
|
||||
index_t SrcScalarPerVector,
|
||||
index_t DstVectorDim,
|
||||
index_t DstScalarPerVector>
|
||||
struct lambda_scalar_per_access_for_src_and_dst
|
||||
{
|
||||
__host__ __device__ constexpr auto operator()(index_t i) const
|
||||
{
|
||||
if(i == SrcVectorDim && i == DstVectorDim)
|
||||
{
|
||||
return math::lcm(SrcScalarPerVector, DstScalarPerVector);
|
||||
}
|
||||
else if(i == SrcVectorDim)
|
||||
{
|
||||
return SrcScalarPerVector;
|
||||
}
|
||||
else if(i == DstVectorDim)
|
||||
{
|
||||
return DstScalarPerVector;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Thread-level multi-source, multi-destination tensor slice data movement
|
||||
// Assume:
|
||||
// 1. All sources and destinations are DynamicBuffer
|
||||
|
||||
Reference in New Issue
Block a user