mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
adding implicit gemm v3
This commit is contained in:
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
|
||||
|
||||
// reorder weight
|
||||
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
|
||||
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
|
||||
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
|
||||
|
||||
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
|
||||
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
std::thread::hardware_concurrency());
|
||||
|
||||
// reorder input
|
||||
auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
|
||||
auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
|
||||
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
|
||||
|
||||
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
|
||||
@@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
|
||||
std::thread::hardware_concurrency());
|
||||
|
||||
// output
|
||||
auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
|
||||
auto out_khwn_desc =
|
||||
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
|
||||
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
|
||||
|
||||
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
|
||||
|
||||
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
|
||||
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
|
||||
|
||||
// reorder weight
|
||||
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
|
||||
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
|
||||
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
|
||||
|
||||
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
|
||||
@@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
|
||||
std::thread::hardware_concurrency());
|
||||
|
||||
// output
|
||||
auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
|
||||
auto out_khwn_desc =
|
||||
make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
|
||||
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
|
||||
|
||||
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
|
||||
|
||||
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
|
||||
|
||||
// reorder weight
|
||||
auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
|
||||
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
|
||||
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
|
||||
|
||||
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
|
||||
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
|
||||
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
// for 3x3, 34x34, v1r3, Pascal
|
||||
constexpr index_t BlockSize = 128;
|
||||
|
||||
@@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
|
||||
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
|
||||
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
|
||||
|
||||
// reorder weight
|
||||
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
|
||||
auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
|
||||
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
|
||||
|
||||
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
|
||||
@@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
|
||||
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
|
||||
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
|
||||
|
||||
constexpr index_t N1 = 2;
|
||||
constexpr index_t N2 = 4;
|
||||
|
||||
constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
|
||||
|
||||
#if 1
|
||||
// for 3x3, 28x28, v3, Pascal
|
||||
constexpr index_t BlockSize = 128;
|
||||
// for 3x3, 28x28, v3
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
constexpr index_t BPerBlock = 16;
|
||||
constexpr index_t KPerBlock = 128;
|
||||
constexpr index_t CPerBlock = 8;
|
||||
|
||||
constexpr index_t BPerThread = 1;
|
||||
constexpr index_t KPerThread = 8;
|
||||
|
||||
constexpr index_t GemmMPerThreadSubC = 4;
|
||||
constexpr index_t GemmNPerThreadSubC = 4;
|
||||
constexpr index_t GemmMLevel0Cluster = 4;
|
||||
constexpr index_t GemmNLevel0Cluster = 2;
|
||||
constexpr index_t GemmNLevel0Cluster = 4;
|
||||
constexpr index_t GemmMLevel1Cluster = 4;
|
||||
constexpr index_t GemmNLevel1Cluster = 2;
|
||||
constexpr index_t GemmNLevel1Cluster = 4;
|
||||
constexpr index_t GemmKPerThreadLoop = 1;
|
||||
constexpr index_t GemmDataPerReadA = 4;
|
||||
constexpr index_t GemmDataPerReadB = 4;
|
||||
|
||||
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
|
||||
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
|
||||
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
|
||||
using InBlockCopySubLengths_N1_N2_C_B = Sequence<1, 4, 1, 1>;
|
||||
using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
|
||||
constexpr index_t InBlockCopySrcDataPerRead_B = 1;
|
||||
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
|
||||
|
||||
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
|
||||
#endif
|
||||
|
||||
constexpr index_t GridSize =
|
||||
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
|
||||
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
|
||||
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
|
||||
|
||||
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
|
||||
|
||||
@@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
|
||||
decltype(in_nchw_desc),
|
||||
decltype(wei_cyxk_desc),
|
||||
decltype(out_nkhw_desc),
|
||||
NPerBlock,
|
||||
BPerBlock,
|
||||
KPerBlock,
|
||||
CPerBlock,
|
||||
HoPerBlock,
|
||||
WoPerBlock,
|
||||
NPerThread,
|
||||
KPerThread,
|
||||
HoPerThread,
|
||||
WoPerThread,
|
||||
N1,
|
||||
N2,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
@@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
|
||||
GemmKPerThreadLoop,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB,
|
||||
InBlockReorderSrcSubLengths_NCHW,
|
||||
InBlockReorderSrcClusterLengths_NCHW,
|
||||
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
|
||||
InBlockReorderDataPerRead_W,
|
||||
InBlockReorderDataPerWrite_N,
|
||||
WeiBlockCopyClusterLengths,
|
||||
WeiBlockCopyDataPerRead_K,
|
||||
OutThreadCopyDataPerWrite_W>{};
|
||||
InBlockCopySubLengths_N1_N2_C_B,
|
||||
InBlockCopyClusterLengths_N1_N2_C_B,
|
||||
InBlockCopySrcDataPerRead_B,
|
||||
InBlockCopyDstDataPerWrite_N2,
|
||||
WeiBlockCopyDataPerAccess_K>{};
|
||||
|
||||
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
|
||||
dim3(GridSize),
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
|
||||
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
|
||||
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
|
||||
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
|
||||
|
||||
struct GeneratorTensor_1
|
||||
{
|
||||
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
|
||||
auto lower_pads = Sequence<HPad, WPad>{};
|
||||
auto upper_pads = Sequence<HPad, WPad>{};
|
||||
|
||||
auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
|
||||
auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
|
||||
auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
|
||||
auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});
|
||||
auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
|
||||
in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
|
||||
|
||||
@@ -612,11 +612,11 @@ int main(int argc, char* argv[])
|
||||
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
|
||||
#elif 0
|
||||
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
|
||||
#elif 1
|
||||
#elif 0
|
||||
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
|
||||
#elif 0
|
||||
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
|
||||
#elif 0
|
||||
#elif 1
|
||||
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
|
||||
#endif
|
||||
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
|
||||
|
||||
@@ -12,7 +12,7 @@ struct Array
|
||||
index_t mData[nSize];
|
||||
|
||||
template <class... Xs>
|
||||
__host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
|
||||
__host__ __device__ constexpr Array(Xs... xs) : mData{static_cast<TData>(xs)...}
|
||||
{
|
||||
}
|
||||
|
||||
@@ -37,6 +37,25 @@ struct Array
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t... Is>
|
||||
__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
|
||||
{
|
||||
return Array<index_t, sizeof...(Is)>{Is...};
|
||||
}
|
||||
|
||||
template <class TData, index_t NSize>
|
||||
__host__ __device__ constexpr auto make_zero_array()
|
||||
{
|
||||
Array<TData, NSize> a;
|
||||
|
||||
static_for<0, NSize, 1>{}([&](auto I) {
|
||||
constexpr index_t i = I.Get();
|
||||
a[i] = static_cast<TData>(0);
|
||||
});
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
template <class TData, index_t NSize, index_t... IRs>
|
||||
__host__ __device__ auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
|
||||
Sequence<IRs...> new2old)
|
||||
@@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, Ext
|
||||
|
||||
static_for<0, new_size, 1>{}([&](auto I) {
|
||||
constexpr index_t i = I.Get();
|
||||
new_array[i] = old_array[ExtractSeq{}.Get(I)];
|
||||
new_array[i] = old_array[ExtractSeq::Get(I)];
|
||||
});
|
||||
|
||||
return new_array;
|
||||
}
|
||||
|
||||
template <class TData, index_t NSize>
|
||||
__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
|
||||
const Array<TData, NSize>& b)
|
||||
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
|
||||
{
|
||||
Array<TData, NSize> result;
|
||||
|
||||
@@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Array = Array * Sequence
|
||||
template <class TData, index_t NSize, index_t... Is>
|
||||
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
|
||||
{
|
||||
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
|
||||
|
||||
Array<TData, NSize> result;
|
||||
|
||||
static_for<0, NSize, 1>{}([&](auto I) {
|
||||
constexpr index_t i = I.Get();
|
||||
|
||||
result[i] = a[i] + b.Get(I);
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor
|
||||
static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr index_t NRow() const { return NRow_; }
|
||||
__host__ __device__ static constexpr index_t NRow() { return NRow_; }
|
||||
|
||||
__host__ __device__ constexpr index_t NCol() const { return NCol_; }
|
||||
__host__ __device__ static constexpr index_t NCol() { return NCol_; }
|
||||
|
||||
__host__ __device__ constexpr index_t RowStride() const { return RowStride_; }
|
||||
__host__ __device__ static constexpr index_t RowStride() { return RowStride_; }
|
||||
|
||||
__host__ __device__ constexpr auto GetLengths() const { return Sequence<NRow_, NCol_>{}; }
|
||||
__host__ __device__ static constexpr auto GetLengths() { return Sequence<NRow_, NCol_>{}; }
|
||||
|
||||
__host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; }
|
||||
__host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; }
|
||||
|
||||
__host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
|
||||
__host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; }
|
||||
|
||||
__host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
|
||||
__host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol)
|
||||
{
|
||||
return irow * RowStride_ + icol;
|
||||
}
|
||||
|
||||
template <index_t SubNRow, index_t SubNCol>
|
||||
__host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
|
||||
Number<SubNCol>) const
|
||||
__host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
|
||||
Number<SubNCol>)
|
||||
{
|
||||
return ConstantMatrixDescriptor<SubNRow, SubNCol, RowStride_>{};
|
||||
}
|
||||
|
||||
@@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor
|
||||
{
|
||||
static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
|
||||
|
||||
static constexpr index_t nDim = std::tuple_size<mOriginalDimMergeSeqs>::value;
|
||||
static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
|
||||
static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
|
||||
static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
|
||||
|
||||
__host__ __device__ constexpr ConstantMergedTensorDescriptor()
|
||||
{
|
||||
@@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor
|
||||
// TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
|
||||
// OriginalTensorDesc::nDim number of dimensions
|
||||
|
||||
// TODO: check there is no duplication in OriginalDimMergeSeqs
|
||||
|
||||
// TODO: check OriginalDimMergeSeqs contains all original dimensions
|
||||
|
||||
// TODO: check there is no duplication in OriginalDimMergeSeqs
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
|
||||
|
||||
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
|
||||
__host__ __device__ static constexpr index_t GetNumOfOriginalDimension()
|
||||
{
|
||||
return nOriginalDim;
|
||||
}
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
|
||||
{
|
||||
return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
|
||||
return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
|
||||
}
|
||||
|
||||
template <index_t IDim>
|
||||
__host__ __device__ static constexpr index_t GetLength(Number<IDim>)
|
||||
{
|
||||
constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
|
||||
constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);
|
||||
|
||||
return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
|
||||
}
|
||||
@@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor
|
||||
static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
|
||||
"wrong! stride of a merged dimension is undefined");
|
||||
|
||||
constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
|
||||
constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();
|
||||
|
||||
return OriginalTensorDesc::GetStride(Number<idim_original>{});
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr auto GetLengths()
|
||||
{
|
||||
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
|
||||
return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSize()
|
||||
@@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor
|
||||
constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
|
||||
|
||||
// get partial original-multi-id corresponding to this merged dimension
|
||||
constexpr auto original_multi_id_partial =
|
||||
const auto original_multi_id_partial =
|
||||
OriginalTensorDesc::Extract(original_dims_partial)
|
||||
.GetMultiIndexFrom1dIndex(multi_id[idim]);
|
||||
|
||||
// make sure compiler unroll this loop and propagate all the constants
|
||||
for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
|
||||
{
|
||||
index_t idim_original = original_dims_partial[i];
|
||||
static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
|
||||
constexpr auto I = decltype(I_){};
|
||||
constexpr index_t idim_original = original_dims_partial.Get(I);
|
||||
|
||||
original_multi_id[idim_original] = original_multi_id_partial[i]
|
||||
}
|
||||
original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
|
||||
});
|
||||
});
|
||||
|
||||
return original_multi_id;
|
||||
@@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor
|
||||
{
|
||||
const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
|
||||
|
||||
return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
|
||||
return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
|
||||
}
|
||||
|
||||
template <index_t... Is>
|
||||
template <class... Is>
|
||||
__host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
|
||||
{
|
||||
return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
|
||||
@@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor
|
||||
|
||||
__host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
|
||||
{
|
||||
constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
|
||||
constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());
|
||||
|
||||
return dummy_desc.GetMultiIndexFrom1dIndex(id);
|
||||
}
|
||||
};
|
||||
|
||||
template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
|
||||
constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
|
||||
__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc,
|
||||
OriginalDimMergeSeqs...)
|
||||
{
|
||||
return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
|
||||
}
|
||||
|
||||
@@ -2,20 +2,20 @@
|
||||
#include "common.hip.hpp"
|
||||
|
||||
template <class Lengths>
|
||||
__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
|
||||
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
|
||||
{
|
||||
return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
|
||||
.PushBack(Number<1>{});
|
||||
}
|
||||
|
||||
template <class Lengths, index_t Align>
|
||||
__host__ __device__ constexpr auto
|
||||
calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
|
||||
__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths,
|
||||
Number<Align>)
|
||||
{
|
||||
constexpr index_t L_back_align =
|
||||
Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
|
||||
|
||||
return calculate_packed_tensor_strides(
|
||||
return calculate_tensor_strides_default_rank_packed(
|
||||
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
|
||||
}
|
||||
|
||||
@@ -66,6 +66,12 @@ struct ConstantTensorDescriptor
|
||||
return MemoryRanks{}.Get(Number<I>{});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr index_t GetElementSize()
|
||||
{
|
||||
return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
|
||||
@@ -146,7 +152,7 @@ struct ConstantTensorDescriptor
|
||||
{
|
||||
Array<index_t, nDim> multi_id;
|
||||
|
||||
constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
|
||||
constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths());
|
||||
|
||||
// calculate index in each of the dimensions in the order of their dimension (not rank)
|
||||
static_for<0, nDim - 1, 1>{}([&](auto IDim) {
|
||||
@@ -181,6 +187,12 @@ struct ConstantTensorDescriptor
|
||||
return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
|
||||
}
|
||||
|
||||
template <index_t... IDims>
|
||||
__host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
|
||||
{
|
||||
return Extract(Number<IDims>{}...);
|
||||
}
|
||||
|
||||
template <index_t IDim, index_t SliceLen>
|
||||
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
|
||||
{
|
||||
@@ -271,9 +283,11 @@ struct ConstantTensorDescriptor
|
||||
FirstUnfoldDim <= LastUnfoldDim,
|
||||
"wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
|
||||
|
||||
#if 0 // cannot compile: compiler complain about constexpr
|
||||
// dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be
|
||||
// packed in memory, otherwise, unfolding is invalid
|
||||
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
|
||||
static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
|
||||
constexpr auto IDim = decltype(IDim_){};
|
||||
constexpr auto IDim_p1 = IDim + Number<1>{};
|
||||
|
||||
// check stride
|
||||
@@ -285,11 +299,12 @@ struct ConstantTensorDescriptor
|
||||
static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
|
||||
"wrong! dimensions to be unfolded need to be packed");
|
||||
|
||||
// checkt ranks
|
||||
// check ranks
|
||||
static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
|
||||
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
|
||||
"continuous ranks");
|
||||
});
|
||||
#endif
|
||||
|
||||
// left and right
|
||||
constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
|
||||
@@ -308,9 +323,9 @@ struct ConstantTensorDescriptor
|
||||
|
||||
// decrease the ranks that are larger than the rank of LastUnfoldDim
|
||||
constexpr auto tmp_ranks =
|
||||
transform_sequences(GetMemoryRanks(),
|
||||
f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
|
||||
LastUnfoldDim - FirstUnfoldDim + 1>{});
|
||||
transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
|
||||
LastUnfoldDim - FirstUnfoldDim + 1>{},
|
||||
GetMemoryRanks());
|
||||
|
||||
// new lengths, strides and ranks
|
||||
constexpr auto new_lengths = GetLengths()
|
||||
@@ -354,26 +369,26 @@ struct ConstantTensorDescriptor
|
||||
};
|
||||
|
||||
template <class Lengths>
|
||||
__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
|
||||
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths)
|
||||
{
|
||||
using Strides = decltype(calculate_packed_tensor_strides(Lengths{}));
|
||||
using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{}));
|
||||
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
|
||||
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
template <class Lengths, class Strides>
|
||||
__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
|
||||
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides)
|
||||
{
|
||||
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
|
||||
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
template <class Lengths, index_t Align>
|
||||
__host__ __device__ constexpr auto
|
||||
make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
|
||||
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths,
|
||||
Number<Align>)
|
||||
{
|
||||
using Strides =
|
||||
decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
|
||||
decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{}));
|
||||
using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
|
||||
return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "constant_integral.hip.hpp"
|
||||
#include "integral_constant.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
|
||||
template <index_t... Is>
|
||||
@@ -21,12 +21,6 @@ struct Sequence
|
||||
return mData[I];
|
||||
}
|
||||
|
||||
__host__ __device__ index_t operator[](index_t i) const
|
||||
{
|
||||
const index_t mData[mSize + 1] = {Is..., 0};
|
||||
return mData[i];
|
||||
}
|
||||
|
||||
template <index_t... IRs>
|
||||
__host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
|
||||
{
|
||||
@@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan<Sequence<I>, Reduce>
|
||||
using SeqType = Sequence<I>;
|
||||
};
|
||||
|
||||
template <class Reduce>
|
||||
struct sequence_reverse_inclusive_scan<Sequence<>, Reduce>
|
||||
{
|
||||
using SeqType = Sequence<>;
|
||||
};
|
||||
|
||||
template <class, class>
|
||||
struct sequence_extract;
|
||||
|
||||
|
||||
@@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3
|
||||
index_t mSrcMyThreadOffset;
|
||||
index_t mDstMyThreadOffset;
|
||||
|
||||
__device__ Blockwise2dTensorCopy3()
|
||||
__device__ Blockwise2dTensorCopy3(Array<index_t, 2> src_block_data_multi_id_begin,
|
||||
Array<index_t, 2> dst_block_data_multi_id_begin)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3
|
||||
const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
|
||||
const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
|
||||
|
||||
mSrcMyThreadOffset =
|
||||
SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
|
||||
mDstMyThreadOffset =
|
||||
DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
|
||||
mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
|
||||
src_block_data_multi_id_begin +
|
||||
Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
|
||||
|
||||
mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
|
||||
dst_block_data_multi_id_begin +
|
||||
Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
|
||||
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
|
||||
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
|
||||
constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths());
|
||||
constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths());
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0)
|
||||
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
|
||||
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});
|
||||
|
||||
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
|
||||
constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);
|
||||
|
||||
constexpr auto ref_desc =
|
||||
make_packed_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
|
||||
make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{});
|
||||
|
||||
constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;
|
||||
|
||||
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{});
|
||||
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{});
|
||||
|
||||
constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
|
||||
constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
|
||||
@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
|
||||
}
|
||||
}
|
||||
|
||||
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{});
|
||||
constexpr auto thread_cluster_desc =
|
||||
make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});
|
||||
const auto thread_multi_id =
|
||||
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
|
||||
|
||||
@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
|
||||
constexpr index_t nloop_d2 = L2 / thread_per_d2;
|
||||
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
|
||||
|
||||
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
|
||||
|
||||
#pragma unroll
|
||||
@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
|
||||
constexpr index_t nloop_d2 = L2 / thread_per_d2;
|
||||
constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);
|
||||
|
||||
constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});
|
||||
|
||||
#pragma unroll
|
||||
|
||||
@@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
|
||||
"wrong! Cannot evenly divide work among\n");
|
||||
|
||||
static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
|
||||
static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
|
||||
"wrong! ThreadMatrixC lengths is wrong");
|
||||
|
||||
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
@@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
|
||||
}
|
||||
|
||||
__device__ static auto GetThreadMatrixCLengths()
|
||||
__device__ static constexpr auto GetThreadMatrixCLengths()
|
||||
{
|
||||
constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
|
||||
constexpr index_t N = BlockMatrixB::NCol();
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
#pragma once
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
|
||||
// slice a merged tensor, reorder and copy it into a normal tensor
|
||||
// src: a merged tensor,
|
||||
// dst: a normal tensor
|
||||
// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor
|
||||
template <index_t BlockSize,
|
||||
class Float,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SliceLengths,
|
||||
class SubLengths,
|
||||
class ClusterLengths,
|
||||
class DataClusterLengths,
|
||||
class ThreadClusterArrangeOrder,
|
||||
class SrcAccessOrder,
|
||||
class DstAccessOrder>
|
||||
class DstAccessOrder,
|
||||
index_t SrcDataPerRead,
|
||||
index_t DstDataPerRead>
|
||||
struct BlockwiseTensorSliceCopy_generic_v1
|
||||
{
|
||||
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
|
||||
@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
|
||||
index_t mSrcMyThreadOffset;
|
||||
index_t mDstMyThreadOffset;
|
||||
|
||||
__device__ BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_multi_offset,
|
||||
Array<index_t, nDim> dst_block_multi_offset)
|
||||
__device__
|
||||
BlockwiseTensorSliceCopy_generic_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
|
||||
Array<index_t, nDim> dst_block_data_multi_id_begin)
|
||||
{
|
||||
// check NDim consistent
|
||||
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong");
|
||||
|
||||
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(
|
||||
ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
|
||||
// thread cluster
|
||||
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
|
||||
|
||||
// BlockSize
|
||||
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
|
||||
|
||||
// divide work
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
static_assert(SliceLengths{}.Get(IDim) % SubLenghs{}.Get(IDim) == 0,
|
||||
constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto IDim_) {
|
||||
constexpr auto IDim = decltype(IDim_){};
|
||||
|
||||
static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,
|
||||
"wrong! cannot evenly divide sliced tensor into sub-tensor");
|
||||
|
||||
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
|
||||
"wrong! cannot evenly divide sliced tensor into cluster");
|
||||
});
|
||||
|
||||
constexpr auto thread_work_desc =
|
||||
make_packed_ConstantTensorDescriptor(SliceLengths{} / SliceSubLengths{});
|
||||
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.Get(IDim) == 0,
|
||||
"wrong! cannot evenly divide work to cluster");
|
||||
});
|
||||
// for now, only support SubLengths.Get() == 1 on a merged dimension that is merge from
|
||||
// multiple dimensions
|
||||
static_for<0, nDim, 1>{}([&](auto IDim_) {
|
||||
constexpr auto IDim = decltype(IDim_){};
|
||||
|
||||
// only support SubLengths.Get() == 1 on merged dimension, for now
|
||||
static_for<0, nDim, 1>{}([&](auto IDim) {
|
||||
static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) ||
|
||||
DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) {
|
||||
static_assert(fwd(SubLengths{}).Get(IDim) == 1,
|
||||
"wrong! Sub-Lengths on merged dimension should be 1");
|
||||
});
|
||||
static_assert(SubLengths::Get(IDim) == 1 ||
|
||||
(!SrcDesc::ContainMultipleOriginalDimensions(IDim) &&
|
||||
!DstDesc::ContainMultipleOriginalDimensions(IDim)),
|
||||
"wrong! only surpport Sub-Length == 1 on a merged dimension");
|
||||
});
|
||||
|
||||
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
|
||||
@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
|
||||
const auto data_cluster_multi_id =
|
||||
reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
|
||||
|
||||
const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{};
|
||||
const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
|
||||
|
||||
mSrcMythreadOffset =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset);
|
||||
mSrcMythreadOffset =
|
||||
DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset);
|
||||
mSrcMyThreadOffset = SrcDesc::GetOffsetFromMultiIndex(src_block_data_multi_id_begin +
|
||||
thread_data_multi_id_begin);
|
||||
|
||||
mSrcMyThreadOffset = DstDesc::GetOffsetFromMultiIndex(dst_block_data_multi_id_begin +
|
||||
thread_data_multi_id_begin);
|
||||
}
|
||||
|
||||
__device__ static constexpr index_t GetRegisterClipboardSize()
|
||||
{
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths);
|
||||
make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths);
|
||||
|
||||
return thread_tensor_desc.GetElementSpaceSize();
|
||||
return thread_tensor_desc.GetElementSpace();
|
||||
}
|
||||
|
||||
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
|
||||
@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
|
||||
{
|
||||
constexpr auto thread_sub_tensor_lengths = SubLengths{};
|
||||
|
||||
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
|
||||
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
|
||||
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
thread_sub_tensor_lengths * repeat_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
|
||||
|
||||
constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
|
||||
const auto src_thread_data_multi_id_begin =
|
||||
repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?
|
||||
|
||||
constexpr auto clipboard_data_multi_offset =
|
||||
repeat_multi_id * thread_sub_tensor_lengths;
|
||||
const auto clipboard_data_multi_id_begin =
|
||||
repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?
|
||||
|
||||
constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id);
|
||||
constexpr index_t clipboard_offset =
|
||||
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id);
|
||||
const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
|
||||
src_thread_data_multi_id_begin); // cannot not constexpr, why?
|
||||
|
||||
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
|
||||
clipboard_data_multi_id_begin); // cannot not constexpr, why?
|
||||
|
||||
threadwise_tensor_slice_copy_generic(SrcDesc{},
|
||||
p_src + src_offset + mSrcMyThreadOffset,
|
||||
thread_tensor_desc,
|
||||
zero_array<index_t, nDim>{},
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
zero_array<index_t, nDim>{},
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_sub_tensor_lengths,
|
||||
SrcAccessOrder{});
|
||||
});
|
||||
@@ -122,41 +130,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
|
||||
{
|
||||
constexpr auto thread_sub_tensor_lengths = SubLengths{};
|
||||
|
||||
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
|
||||
constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
|
||||
constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
thread_sub_tensor_lengths * repeat_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
|
||||
|
||||
constexpr auto clipboard_data_multi_offset =
|
||||
repeat_multi_id * thread_sub_tensor_lengths;
|
||||
const auto clipboard_data_multi_id_begin =
|
||||
repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?
|
||||
|
||||
constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
|
||||
const auto dst_data_multi_id_begin =
|
||||
repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?
|
||||
|
||||
constexpr index_t clipboard_offset =
|
||||
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
|
||||
const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
|
||||
clipboard_data_multi_id_begin); // cannot not constexpr, why?
|
||||
|
||||
constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
|
||||
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
|
||||
dst_data_multi_id_begin); // cannot not constexpr, why?
|
||||
|
||||
threadwise_tensor_slice_copy_generic(thread_tensor_desc,
|
||||
p_clipboard + clipboard_offset,
|
||||
zero_array<index_t, nDim>{},
|
||||
make_zero_array<index_t, nDim>(),
|
||||
DstDesc{},
|
||||
p_dst + dst_offset + mDstMyThreadOffset,
|
||||
zero_array<index_t, nDim>{},
|
||||
make_zero_array<index_t, nDim>(),
|
||||
thread_sub_tensor_lengths,
|
||||
DstAccessOrder{});
|
||||
});
|
||||
}
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
{
|
||||
Float p_clipboard[GetRegisterClipboardSize()];
|
||||
Float p_clipboard[GetRegisterClipboardSize()];
|
||||
|
||||
RunLoadRegisterClipboard(p_src, p_clipboard);
|
||||
RunStoreRegisterClipboard(p_clipboard, p_dst);
|
||||
RunLoadRegisterClipboard(p_src, p_clipboard);
|
||||
RunStoreRegisterClipboard(p_clipboard, p_dst);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
|
||||
|
||||
constexpr auto thread_cluster_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
|
||||
make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);
|
||||
|
||||
// sanity check: data type
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
|
||||
@@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
|
||||
|
||||
return thread_tensor_desc.GetElementSpace();
|
||||
}
|
||||
@@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
@@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
|
||||
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
|
||||
|
||||
constexpr auto thread_tensor_desc =
|
||||
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
|
||||
make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
|
||||
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
|
||||
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "vector_type.hip.hpp"
|
||||
#include "constant_integral.hip.hpp"
|
||||
#include "integral_constant.hip.hpp"
|
||||
#include "Sequence.hip.hpp"
|
||||
#include "Array.hip.hpp"
|
||||
#include "functional.hip.hpp"
|
||||
@@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
|
||||
template <class T1, class T2>
|
||||
struct is_same
|
||||
{
|
||||
static const bool value = false;
|
||||
static constexpr bool value = false;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct is_same<T, T>
|
||||
{
|
||||
static const bool value = true;
|
||||
static constexpr bool value = true;
|
||||
};
|
||||
|
||||
template <class X, class Y>
|
||||
__host__ __device__ constexpr bool is_same_type(X, Y)
|
||||
{
|
||||
return is_same<X, Y>::value;
|
||||
}
|
||||
|
||||
namespace mod_conv { // namespace mod_conv
|
||||
template <class T, T s>
|
||||
struct scales
|
||||
|
||||
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
|
||||
constexpr auto HO = HI + 1 - Y;
|
||||
constexpr auto WO = WI + 1 - X;
|
||||
|
||||
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
|
||||
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
|
||||
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
|
||||
constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
|
||||
constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
|
||||
|
||||
return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
|
||||
return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
|
||||
}
|
||||
|
||||
template <class InDesc, class WeiDesc, class OutDesc>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#pragma once
|
||||
#include "constant_integral.hip.hpp"
|
||||
#include "integral_constant.hip.hpp"
|
||||
|
||||
struct forwarder
|
||||
{
|
||||
|
||||
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
|
||||
constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);
|
||||
|
||||
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id =
|
||||
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockCopyDataPerRead_N>{});
|
||||
|
||||
@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
|
||||
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
|
||||
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
|
||||
|
||||
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id =
|
||||
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
|
||||
// global tensor view
|
||||
constexpr auto wei_c_k_global_desc =
|
||||
make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
|
||||
make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
|
||||
|
||||
// LDS tensor view
|
||||
// be careful of alignment
|
||||
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockReorderDataPerWrite_N>{});
|
||||
|
||||
@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
|
||||
constexpr index_t K1 = KPerBlock / KPerThread;
|
||||
|
||||
#if 0
|
||||
constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
|
||||
|
||||
constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
|
||||
#else
|
||||
constexpr auto out_10d_global_desc =
|
||||
|
||||
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
|
||||
constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
|
||||
|
||||
constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id =
|
||||
@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
|
||||
Number<InBlockReorderDataPerWrite_N>{});
|
||||
|
||||
@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not meet");
|
||||
|
||||
constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
|
||||
// tensor view of threadwise output in register
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
|
||||
constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
|
||||
|
||||
// blockwise copy
|
||||
@@ -152,7 +152,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
decltype(wei_c_k_global_desc),
|
||||
decltype(wei_c_k_block_desc),
|
||||
decltype(wei_c_k_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead_K>{};
|
||||
WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0});
|
||||
|
||||
// a series of blockwise batched GEMM
|
||||
// C_matrix += transpose(A_matrix) * B_matrix
|
||||
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
|
||||
|
||||
// choose GEMM implementation here
|
||||
const auto run_blockwise_batch_gemm = [&](auto... Xs) {
|
||||
#if 0
|
||||
#if 1
|
||||
return blockwise_batch_gemm.Run(Xs...);
|
||||
#elif 0
|
||||
return blockwise_batch_gemm.Run_asm(Xs...);
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
#pragma once
|
||||
#include "common.hip.hpp"
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMergedTensorDescriptor.hip.hpp"
|
||||
#include "ConstantMatrixDescriptor.hip.hpp"
|
||||
#include "blockwise_merged_tensor_slice_op.hip.hpp"
|
||||
#include "blockwise_gemm.hip.hpp"
|
||||
#include "threadwise_tensor_slice_op.hip.hpp"
|
||||
|
||||
// define B = merge(N, Ho, Wo)
|
||||
template <index_t GridSize,
|
||||
@@ -24,7 +27,12 @@ template <index_t GridSize,
|
||||
index_t GemmNLevel1Cluster,
|
||||
index_t GemmKPerThreadLoop,
|
||||
index_t GemmDataPerReadA,
|
||||
index_t GemmDataPerReadB>
|
||||
index_t GemmDataPerReadB,
|
||||
class InBlockCopySubLengths_N1_N2_C_B,
|
||||
class InBlockCopyClusterLengths_N1_N2_C_B,
|
||||
index_t InBlockCopySrcDataPerRead_B,
|
||||
index_t InBlockCopyDstDataPerWrite_N2,
|
||||
index_t WeiBlockCopyDataPerAccess_K>
|
||||
struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
{
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
@@ -34,12 +42,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
// this is a mess
|
||||
// TODO: more elegent way of specifying (or calculating) performance variables
|
||||
static_assert(N2 == GemmNPerThreadSubC, "wrong!");
|
||||
static_assert(KPerBlock ==
|
||||
N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster,
|
||||
static_assert((N1 * N2 * BPerBlock) %
|
||||
(GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) ==
|
||||
0,
|
||||
"wrong!");
|
||||
static_assert(
|
||||
KPerBlock % (N1 * GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0,
|
||||
"wrong!");
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -73,15 +79,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
constexpr index_t B = N0 * Ho * Wo;
|
||||
|
||||
// divide block work by [K, B]
|
||||
static_assert(K % KPerBlock == 0 && B % BPerBlock == 0,
|
||||
C % CPerBlock == 0,
|
||||
static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && C % CPerBlock == 0,
|
||||
"wrong! cannot divide work evenly among block");
|
||||
|
||||
constexpr index_t KBlockWork = K / KPerBlock;
|
||||
constexpr index_t BBlockWork = B / BPerBlock;
|
||||
|
||||
constexpr auto block_work_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KBlockWork, BBlockWork>{});
|
||||
make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});
|
||||
|
||||
const auto block_work_multi_id =
|
||||
block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
|
||||
@@ -95,16 +100,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{});
|
||||
|
||||
// merged tensor descriptor in device memory [N1, N2, C, B], src of blockwise copy
|
||||
constexpr auto in_n1_n2_c_b_global_merged_desc =
|
||||
in_n0_n1_n2_c_h_w_global_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
|
||||
constexpr auto in_n1_n2_c_b_global_merged_desc = make_ConstantMergedTensorDescriptor(
|
||||
in_n0_n1_n2_c_h_w_global_mem_desc.ReorderGivenNew2Old(Sequence<1, 2, 3, 0, 4, 5>{})
|
||||
.Slice(I4, Number<Ho>{})
|
||||
.Slice(I5, Number<Wo>{})
|
||||
.Merge(I3, I5);
|
||||
.Slice(I5, Number<Wo>{}),
|
||||
Sequence<0>{},
|
||||
Sequence<1>{},
|
||||
Sequence<2>{},
|
||||
Sequence<3, 4, 5>{});
|
||||
|
||||
// memory layout descriptor in LDS [C, N1, B, N2]
|
||||
// be careful of LDS alignment
|
||||
constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
|
||||
Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
|
||||
constexpr auto in_c_n1_b_n2_block_mem_desc =
|
||||
make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
|
||||
|
||||
// tensor descriptor in LDS [N1, N2, C, B], dst of blockwise copy
|
||||
constexpr auto in_n1_n2_c_b_block_desc =
|
||||
@@ -112,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
|
||||
// this check is ad-hoc
|
||||
// TODO: need to properly implement tensor descriptor with alignment
|
||||
static_assert(in_c_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
static_assert(in_c_n1_b_n2_block_mem_desc.GetStride(I1) % GemmDataPerReadB == 0,
|
||||
"GemmDataPerReadB alignment requirement is not satisfied");
|
||||
|
||||
// input blockwise copy
|
||||
@@ -129,7 +138,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
Sequence<2, 0, 1, 3>, // thread_arrange_order [C, N1, N2, B]
|
||||
Sequence<0, 1, 2, 3>, // src_access_order [N1, N2, C, B]
|
||||
Sequence<2, 0, 3, 1>, // dst_access_order [C, N1, B, N2]
|
||||
>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});
|
||||
InBlockCopySrcDataPerRead_B,
|
||||
InBlockCopyDstDataPerWrite_N2>({0, 0, 0, b_block_data_on_global}, {0, 0, 0, 0});
|
||||
|
||||
// weight tensor
|
||||
// tensor descriptor in device memory, src of blockwise copy
|
||||
@@ -137,9 +147,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
|
||||
// tensor descriptor in LDS, dst of blockwise copy
|
||||
// be careful of LDS alignment
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
|
||||
constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
|
||||
Sequence<CPerBlock, KPerBlock>{},
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
|
||||
Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});
|
||||
|
||||
// operator for blockwise copy of weight into LDS
|
||||
// slicing a tensor
|
||||
@@ -150,7 +160,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
decltype(wei_c_k_global_desc),
|
||||
decltype(wei_c_k_block_desc),
|
||||
decltype(wei_c_k_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead_K>({0, k_block_data_on_global}, {0, 0});
|
||||
WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global},
|
||||
{0, 0});
|
||||
|
||||
// GEMM definition
|
||||
// c_mtx += transpose(a_mtx) * b_mtx
|
||||
@@ -167,7 +178,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
Number<in_c_n1_b_n2_block_mem_desc.GetStride(I0)>{});
|
||||
|
||||
// sanity check
|
||||
static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster),
|
||||
static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) ==
|
||||
0,
|
||||
"wrong!");
|
||||
|
||||
constexpr index_t GemmMRepeat =
|
||||
@@ -194,8 +206,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
GemmDataPerReadB>{};
|
||||
|
||||
// LDS allocation for input and weight: be careful of alignment
|
||||
constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
|
||||
WeiBlockCopyDataPerRead_K,
|
||||
constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2,
|
||||
WeiBlockCopyDataPerAccess_K,
|
||||
GemmDataPerReadA,
|
||||
GemmDataPerReadB);
|
||||
|
||||
@@ -211,7 +223,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()];
|
||||
|
||||
// zero out threadwise output
|
||||
threadwise_matrix_set_zero(out_k0_k1_k2_n1_n0_h_w_n2_thread_desc, p_out_thread);
|
||||
threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
|
||||
|
||||
// do work
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
@@ -229,15 +241,15 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
c_block_data_on_global = 0;
|
||||
c_block_data_on_global < C;
|
||||
c_block_data_on_global += CPerBlock,
|
||||
p_in_block_ont_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
p_in_block_on_global += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
|
||||
p_wei_block_on_global += CPerBlock * wei_c_y_x_k_global_desc.GetStride(I0))
|
||||
{
|
||||
blockwise_in_copy.run(p_in_block_on_global, p_in_block);
|
||||
blockwise_wei_copy.run(p_wei_block_on_global, p_wei_block);
|
||||
blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
|
||||
blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
blockwise_gemm.run(p_wei_block, p_in_block, p_out_thread);
|
||||
blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -253,19 +265,26 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
// define tensor descriptor for threadwise copy
|
||||
// output tensor (also, memory layout) descriptor in register, src of threadwise
|
||||
// copy
|
||||
constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor(
|
||||
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});
|
||||
constexpr auto out_k0_k1_k2_n1_b_n2_thread_mem_desc =
|
||||
make_ConstantTensorDescriptor_default_rank_packed(
|
||||
Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, N2>{});
|
||||
|
||||
// output memory layout descriptor in device memory
|
||||
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc =
|
||||
out_n_k_h_w_global.Fold(I1, Number<K1>{}, Number<K2>{})
|
||||
out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}, Number<K2>{})
|
||||
.Fold(I0, Number<N1>{}, Number<N2>{});
|
||||
|
||||
// output merged tensor descriptor in device memory, dst of threadwise copy
|
||||
constexpr auto out_k0_k1_k2_n1_b_n2_global_merged_desc =
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
|
||||
.ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
|
||||
.Merge(I4, I6);
|
||||
make_ConstantMergedTensorDescriptor(
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.ReorderGivenNew2Old(
|
||||
Sequence<3, 4, 5, 1, 0, 6, 7, 2>{}),
|
||||
Sequence<0>{},
|
||||
Sequence<1>{},
|
||||
Sequence<2>{},
|
||||
Sequence<3>{},
|
||||
Sequence<4, 5, 6>{},
|
||||
Sequence<7>{});
|
||||
|
||||
// calculate origin of thread output tensor on global memory
|
||||
// blockwise GEMM c matrix starting index
|
||||
@@ -273,18 +292,30 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
|
||||
// origin of thread tensor on global
|
||||
const index_t k_thread_data_on_global k_block_data_on_global +
|
||||
c_thread_mtx_on_block.row;
|
||||
const index_t k_thread_data_on_global =
|
||||
k_block_data_on_global + c_thread_mtx_on_block.row;
|
||||
|
||||
const index_t b_thread_data_on_global =
|
||||
b_block_data_on_global + c_thread_mtx_on_block.col;
|
||||
|
||||
// output merged global tensor descriptor, for calculating origin of thread tensor
|
||||
// in global memory
|
||||
// output merged global tensor descriptor, for calculating origin of thread tensor
|
||||
// in global memory
|
||||
#if 0 // unfold a merged tensor is not implemented yet
|
||||
constexpr auto out_k_n1_b_n2_global_merged_desc =
|
||||
out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I1, I2);
|
||||
out_k0_k1_k2_n1_b_n2_global_merged_desc.Unfold(I0, I2);
|
||||
#else
|
||||
constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
|
||||
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc
|
||||
.ReorderGivenNew2Old(Sequence<3, 4, 5, 1, 0, 6, 7, 2>{})
|
||||
.Unfold(I0, I2),
|
||||
Sequence<0>{},
|
||||
Sequence<1>{},
|
||||
Sequence<2, 3, 4>{},
|
||||
Sequence<5>{});
|
||||
#endif
|
||||
|
||||
// origin of thread tensor in global memory
|
||||
const index_t p_out_thread_on_global =
|
||||
Float* p_out_thread_on_global =
|
||||
p_out_global +
|
||||
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
|
||||
k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor
|
||||
@@ -303,8 +334,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
|
||||
0,
|
||||
b_thread_data_on_global,
|
||||
0}, // starting point of slice w.r.t. origin of dst
|
||||
out_k0_k1_k2_n1_b_n2_thread_desc.GetLengths(), // slice lengths
|
||||
Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access
|
||||
out_k0_k1_k2_n1_b_n2_thread_mem_desc.GetLengths(), // slice lengths
|
||||
Sequence<2, 3, 4, 0, 5, 1>{} // order of dimension access
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,5 +8,11 @@ struct integral_constant
|
||||
__host__ __device__ constexpr T Get() const { return value; }
|
||||
};
|
||||
|
||||
template <class T, index_t X, index_t Y>
|
||||
__host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
|
||||
{
|
||||
return integral_constant<T, X + Y>{};
|
||||
}
|
||||
|
||||
template <index_t N>
|
||||
using Number = integral_constant<index_t, N>;
|
||||
@@ -10,7 +10,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
|
||||
for(index_t j = 0; j < Matrix::NCol(); ++j)
|
||||
{
|
||||
const index_t id = Matrix::GetOffsetFromMultiIndex(i, j);
|
||||
p_thread[id] = 0;
|
||||
p_thread[id] = Float(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{});
|
||||
constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
@@ -194,16 +194,19 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
|
||||
}
|
||||
|
||||
template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
|
||||
__device__ void
|
||||
threadwise_tensor_slice_copy_generic(SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_offset,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_offset,
|
||||
SliceLengths,
|
||||
DimAccessOrder)
|
||||
__device__ void threadwise_tensor_slice_copy_generic(
|
||||
SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
|
||||
SliceLengths,
|
||||
DimAccessOrder)
|
||||
{
|
||||
static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(),
|
||||
"wrong! # of dimensions not the same");
|
||||
|
||||
constexpr auto src_desc = SrcDesc{};
|
||||
constexpr auto dst_desc = DstDesc{};
|
||||
|
||||
@@ -215,9 +218,10 @@ threadwise_tensor_slice_copy_generic(SrcDesc,
|
||||
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
|
||||
|
||||
const index_t dst_index =
|
||||
dst_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id);
|
||||
dst_desc.GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
|
||||
|
||||
const index_t src_index =
|
||||
src_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id);
|
||||
src_desc.GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
});
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
#include "config.h"
|
||||
#include "constant_integral.hip.hpp"
|
||||
#include "integral_constant.hip.hpp"
|
||||
|
||||
template <class T, index_t N>
|
||||
struct vector_type
|
||||
|
||||
Reference in New Issue
Block a user