mirror of https://github.com/ROCm/composable_kernel.git

commit: clean up
@@ -5,9 +5,9 @@
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_generic_tensor_slice_copy_deprecated.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"

namespace ck {

@@ -265,8 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

// LDS double buffer: preload data into LDS
{
blockwise_in_copy.Run(p_in_global, p_in_block_double);
blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
p_in_block_double);
blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
p_wei_block_double);
}

// LDS double buffer: main body
@@ -288,8 +290,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float* p_wei_block_next =
even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -297,23 +299,25 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
__syncthreads();

// LDS double buffer: load next data from device mem
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);

// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
}
}
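
The loop above is the classic LDS double-buffering pipeline: while the GEMM consumes the tile already resident in one half of LDS, the next tile is fetched from global memory into registers and then stored into the other half. A minimal standalone sketch of that ping-pong structure, assuming hypothetical load_tile/gemm_tile/store_tile helpers in place of the blockwise copy and GEMM objects of this file:

// Hedged sketch; load_tile, gemm_tile, store_tile are hypothetical helpers.
template <typename Float> __device__ void load_tile(const Float*, Float*, int);
template <typename Float> __device__ void gemm_tile(const Float*, Float*);
template <typename Float> __device__ void store_tile(const Float*, Float*);

template <typename Float, int TileSize>
__device__ void double_buffered_main_loop(const Float* p_global,
                                          Float* p_lds_double, // 2 * TileSize elements
                                          Float* p_acc,
                                          int num_tiles)
{
    Float reg_buf[8]; // staging registers; size is illustrative only

    for(int i = 0; i < num_tiles - 1; ++i)
    {
        Float* p_now  = p_lds_double + (i % 2) * TileSize;       // half being consumed
        Float* p_next = p_lds_double + ((i + 1) % 2) * TileSize; // half being filled

        __syncthreads();
        load_tile(p_global, reg_buf, i + 1); // global -> registers (next tile)
        gemm_tile(p_now, p_acc);             // GEMM on the current LDS tile
        store_tile(reg_buf, p_next);         // registers -> the other LDS half
    }
}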

// LDS double buffer: tail
{
// even iteration
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -321,17 +325,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
__syncthreads();

// LDS double buffer: load next data from device mem
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);

// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
p_wei_block_double + wei_block_space);
blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
p_wei_block_double + wei_block_space);

// odd iteration
__syncthreads();
@@ -390,7 +396,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
0,
b_thread_data_on_global,
0})
.template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
#if 0
.Run
#else // tweaking
.template Run_optimized_address_calculation<Float,
address_space_t::generic,
address_space_t::global>
#endif
(p_out_thread, p_out_global);
}
}
};

@@ -5,9 +5,9 @@
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_generic_tensor_slice_copy_deprecated.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"

namespace ck {

@@ -251,8 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer

// LDS double buffer: preload data into LDS
{
blockwise_in_copy.Run(p_in_global, p_in_block_double);
blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
p_in_block_double);
blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
p_wei_block_double);
}

// LDS double buffer: main body
@@ -274,51 +276,54 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
Float* p_wei_block_next =
even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

__syncthreads();

// LDS double buffer: load next data from device mem
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
p_wei_register_buffer);
blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);

// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
}
}

// LDS double buffer: tail
{
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

// even iteration
blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

__syncthreads();

// LDS double buffer: load next data from device mem
blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);

// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
p_wei_block_double + wei_block_space);
blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
p_wei_block_double + wei_block_space);

// odd iteration
__syncthreads();
@@ -385,7 +390,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer

for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
{
threadwise_out_copy.Run(p_out_thread, p_out_global);
threadwise_out_copy
#if 1
.Run
#else // tweaking
.template Run_optimized_address_calculation<Float,
address_space_t::generic,
address_space_t::global>
#endif
(p_out_thread, p_out_global);

threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);

@@ -1,340 +1,210 @@
#ifndef CK_TENSOR_COORDINATE_HPP
#define CK_TENSOR_COORDINATE_HPP
#ifndef CK_TENSOR_COORDINATE_V2_HPP
#define CK_TENSOR_COORDINATE_V2_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

// TensorDesc is ConstantTensorDescriptor
template <class TensorDesc>
struct NormalTensorCoordinate
template <typename TensorDesc>
struct TensorCoordinate;

template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
using type = NormalTensorCoordinate;
using tensor_desc_type = TensorDesc;

using type = NativeTensorCoordinate;
using tensor_desc_type = NativeTensorDesc;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
using Index = MultiIndex<nDim>;

__host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
: mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
__host__ __device__ constexpr NativeTensorCoordinate(Index idx)
: mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
{
}

template <class... Xs>
__host__ __device__ constexpr NormalTensorCoordinate(Xs... xs)
: NormalTensorCoordinate(Array<index_t, nDim>{xs...})
template <typename... Xs>
__host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
: NativeTensorCoordinate(Index{xs...})
{
}

template <index_t... Xs>
__host__ __device__ constexpr NormalTensorCoordinate(Sequence<Xs...>)
: NormalTensorCoordinate(Array<index_t, nDim>{Xs...})
__host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
: NativeTensorCoordinate(Index{Xs...})
{
}

__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
__host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
__host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }

__host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

__host__ __device__ constexpr type operator+=(const Index& idx_diff)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
// mIndex is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndex += idx_diff;

mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);

return *this;
}

template <class T>
__host__ __device__ type operator-=(T step_sizes)
__host__ __device__ constexpr type operator-=(const Index& idx_diff)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
// mIndex is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndex -= idx_diff;

mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);

return *this;
}

template <class T>
__host__ __device__ constexpr type operator+(T step_sizes) const
__host__ __device__ constexpr type operator+(const Index& idx_diff) const
{
type coord = *this;
coord += step_sizes;
coord += idx_diff;
return coord;
}

template <class T>
__host__ __device__ constexpr type operator-(T step_sizes) const
__host__ __device__ constexpr type operator-(const Index& idx_diff) const
{
type coord = *this;
coord -= step_sizes;
coord -= idx_diff;
return coord;
}

// Reposition the point of origin, and return the compensated offset.
// This is a hack to reduce index calculation during looping over
// a tensor whose origin is this TensorCoordinate. It does so by spitting
// out the run-time offset to the pointer (to the tensor data) held by this
// TensorCoordinate, so the caller can add the offset into the run-time pointer of
// the data; then only 1 run-time variable (the updated pointer) is needed, instead
// of 2 run-time variables (old pointer and this offset).
// TODO: after introducing the concept of "run-time tensor view", which contains the
// run-time pointer to the data, always keep track of the pointer, instead of both the
// offset and the pointer. This also brings the additional benefit that we don't need to
// worry that the offset might underflow (offset is an unsigned integer) when updating it.
__host__ __device__ constexpr index_t RepositionOrigin()
{
index_t offset_diff = mOffset;
mOffset = 0;
return offset_diff;
}
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

private:
// mIndex may be saved and updated; however, the value of some (or all) of its entries may
// never be used. The compiler should be able to remove these entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
Index mIndex;
index_t mOffset;
};
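
The point of caching mOffset next to mIndex is that stepping a slice window costs one incremental offset update instead of a full index-to-offset recomputation. A hedged host-side analogue of that design, assuming a toy row-major 2-D descriptor (Desc2D and Coord2D are illustrative names, not ck types):

#include <array>
#include <cstdint>

using index_t = std::int32_t;

// Toy 2-D row-major "descriptor" standing in for NativeTensorDescriptor.
struct Desc2D
{
    static constexpr index_t stride0 = 16, stride1 = 1;
    static index_t CalculateOffset(std::array<index_t, 2> idx)
    {
        return idx[0] * stride0 + idx[1] * stride1;
    }
    static index_t CalculateOffsetDiff(std::array<index_t, 2> d)
    {
        return d[0] * stride0 + d[1] * stride1; // layout is linear, so a diff maps directly
    }
};

struct Coord2D
{
    std::array<index_t, 2> mIndex;
    index_t mOffset;

    explicit Coord2D(std::array<index_t, 2> idx)
        : mIndex(idx), mOffset(Desc2D::CalculateOffset(idx)) {}

    Coord2D& operator+=(std::array<index_t, 2> d)
    {
        mIndex[0] += d[0]; mIndex[1] += d[1];
        mOffset += Desc2D::CalculateOffsetDiff(d); // cheap incremental update
        return *this;
    }
};

int main()
{
    Coord2D c({2, 3}); // offset 2*16 + 3 = 35
    c += {0, 4};       // offset becomes 39 without re-deriving it from scratch
    return c.mOffset == 39 ? 0 : 1;
}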

// TensorDesc is ConstantMergedTensorDescriptor
template <class TensorDesc>
struct MergedTensorCoordinate
template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
using type = MergedTensorCoordinate;
using tensor_desc_type = TensorDesc;

using tensor_desc_type = TransformedTensorDesc;
using LowerCoord =
typename TensorCoordinate<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
using UpperCoord = TransformedTensorCoordinate;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
static constexpr index_t nOriginalDim =
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
using UpperIndex = MultiIndex<nDim>;

__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto idim) {
constexpr auto partial_original_dims =
tensor_desc_type::GetContainedOriginalDimensions(idim);

constexpr auto partial_original_desc =
tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mOriginalIndex, partial_original_dims));
});

// complete offset
mOffset =
accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}

template <class... Xs>
__host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
: MergedTensorCoordinate(Array<index_t, nDim>{xs...})
__host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
: mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
{
}

__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

template <class IDim, class T, bool PositiveDirection>
__host__ __device__ void
MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
template <typename... Xs>
__host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
: TransformedTensorCoordinate(UpperIndex{xs...})
{
constexpr auto idim = idim_;

// if step_size is known at compile time
static_if<is_static<T>::value>{}(
[&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });

// update original index
static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
constexpr auto partial_original_dims =
tensor_desc_type::GetContainedOriginalDimensions(idim);

constexpr index_t ndim_partial_original = partial_original_dims.GetSize();

constexpr auto partial_original_desc =
tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

const auto partial_original_step_sizes =
partial_original_desc.GetMultiIndexFrom1dIndex(step_size);

// update partial original multi-id
auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);

static_if<PositiveDirection>{}([&](auto) {
partial_original_id += partial_original_step_sizes;

bool carry = false;

// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
constexpr index_t i = ndim_partial_original - 1 - IReverse;

if(carry)
{
++partial_original_id(i);
}

carry = false;

if(partial_original_id[i] >= partial_original_desc.GetLength(i))
{
partial_original_id(i) -= partial_original_desc.GetLength(i);
carry = true;
}
});

// highest dimension
if(carry)
{
++partial_original_id(0);
}
}).Else([&](auto) {
// shift up multi-id to avoid unsigned integer underflow during intermediate
// calculations. After the shift, should have new_multi_id[...] >= 1
partial_original_id +=
partial_original_desc.GetLengths() - partial_original_step_sizes;

bool borrow = false;

// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
constexpr index_t i = ndim_partial_original - 1 - IReverse;

if(borrow)
{
--partial_original_id(i);
}

borrow = false;

if(partial_original_id[i] < partial_original_desc.GetLength(i))
{
partial_original_id(i) += partial_original_desc.GetLength(i);
borrow = true;
}
});

// highest dimension
if(borrow)
{
--partial_original_id(0);
}

// shift back down multi-id
// here, should have new_multi_id[...] >= GetLengths()
partial_original_id = partial_original_id - partial_original_desc.GetLengths();
});

// update "mOriginalIndex"
static_for<0, ndim_partial_original, 1>{}([&](auto I) {
constexpr auto idim_original = partial_original_dims[I];

mOriginalIndex(idim_original) = partial_original_id[I];
});

// calculate new partial offset on this merged dimension
const index_t old_partial_offset = mPartialOffsets[idim];

mPartialOffsets(idim) =
partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);

// update "mOffset", do "+" before "-" to avoid underflow
mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
}).Else([&](auto fwd) {
static_if<PositiveDirection>{}([&](auto) {
mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
}).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
});
}

// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
template <index_t... Xs>
__host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
: TransformedTensorCoordinate(UpperIndex{Xs...})
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
}

static_for<0, nDim, 1>{}([&](auto idim) {
// compiler should remove dead code path, because step_sizes is known at
// compile time
if(step_sizes[idim] != 0)
{
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
}
});
__host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

__host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }

__host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

__host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

__host__ __device__ constexpr const index_t& GetOffset() const
{
return GetLowerCoordinate().GetOffset();
}

__host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
{
// For transformation of a multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function; the transformation function itself decides whether to use them.
mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

// mIndexUp is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndexUp += idx_up_diff;

return *this;
}

template <class T>
__host__ __device__ type operator-=(T step_sizes)
__host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

static_for<0, nDim, 1>{}([&](auto idim) {
// compiler should remove dead code path, because step_sizes is known at
// compile time
if(step_sizes[idim] != 0)
{
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
}
});
// mIndexUp is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndexUp -= idx_up_diff;

return *this;
}

template <class T>
__host__ __device__ constexpr type operator+(T step_sizes) const
__host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
{
type coord = *this;
coord += step_sizes;
return coord;
UpperCoord coord_up = *this;
coord_up += idx_up_diff;
return coord_up;
}

template <class T>
__host__ __device__ constexpr type operator-(T step_sizes) const
__host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
{
type coord = *this;
coord -= step_sizes;
return coord;
UpperCoord coord_up = *this;
coord_up -= idx_up_diff;
return coord_up;
}

__host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
// this function should be inexpensive, because there is no upper-to-lower index transformation
__host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
{
return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
mCoordLow.IsUpperIndexMappedToValidOffset();
}

private:
// Allocate register memory for all merged dimensions and normal dimensions.
// However, only those merged dimensions whose index will be involved in arithmetic
// after the construction of this TensorCoordinate (e.g., when the user moves a slicing
// window on the merged dimension) will use this register memory.
// Let's hope the compiler will optimize away the register memory allocated for normal
// dimensions, and for merged dimensions that are never involved in index
// arithmetic after construction of the TensorCoordinate.
// TODO: refactor TensorCoordinate after introducing the concept of "dimensions",
// and simplify the implementation of ConstantMergedTensorDescriptor, so we don't need to
// count on the compiler to optimize away that register memory for us
Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets;

// complete offset
index_t mOffset;
// mIndexUp may be calculated and updated; however, the value of some (or all) of its entries
// may never be used. The compiler should be able to remove these entries, as well as their
// calculation, as dead code.
// TODO: make sure the compiler indeed removes this dead code
UpperIndex mIndexUp;
LowerCoord mCoordLow;
};
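
TransformedTensorCoordinate composes: each layer stores its upper index and delegates offset bookkeeping to the coordinate of the next-lower descriptor, so a stack of transforms (e.g., a merge on top of a native layout) is updated in one pass, terminating at a native coordinate that owns the offset. A hedged sketch of that chaining idea, reusing the toy Desc2D/Coord2D types from the earlier sketch (hypothetical names, not ck code):

// Upper layer: one merged 1-d index over the toy 16-wide 2-d layout above.
struct MergedCoord
{
    index_t mIndexUp;  // upper (merged) 1-d index
    Coord2D mCoordLow; // lower 2-d coordinate owns the real offset

    static std::array<index_t, 2> lower_index(index_t up)
    {
        return {up / 16, up % 16}; // merge transform: 1-d -> 2-d, width 16
    }

    explicit MergedCoord(index_t up) : mIndexUp(up), mCoordLow(lower_index(up)) {}

    MergedCoord& operator+=(index_t up_diff)
    {
        // Map the upper-index step to a lower-index step, then let the
        // lower coordinate update its cached offset incrementally.
        std::array<index_t, 2> lo_new = lower_index(mIndexUp + up_diff);
        std::array<index_t, 2> lo_old = lower_index(mIndexUp);
        mCoordLow += {lo_new[0] - lo_old[0], lo_new[1] - lo_old[1]};
        mIndexUp += up_diff;
        return *this;
    }

    index_t GetOffset() const { return mCoordLow.mOffset; } // delegates downward
};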

template <class TensorDesc>
template <typename TensorDesc>
struct TensorCoordinate
{
private:
template <class... Ts>
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
}

template <class... Ts>
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
}

public:
@@ -0,0 +1,346 @@
#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {

// TensorDesc is ConstantTensorDescriptor
template <class TensorDesc>
struct NormalTensorCoordinate_deprecated
{
using type = NormalTensorCoordinate_deprecated;
using tensor_desc_type = TensorDesc;

static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

__host__
__device__ constexpr NormalTensorCoordinate_deprecated(Array<index_t, nDim> tensor_index)
: mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
{
}

template <class... Xs>
__host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs)
: NormalTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
{
}

template <index_t... Xs>
__host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence<Xs...>)
: NormalTensorCoordinate_deprecated(Array<index_t, nDim>{Xs...})
{
}

__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

return *this;
}

template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

return *this;
}

template <class T>
__host__ __device__ constexpr type operator+(T step_sizes) const
{
type coord = *this;
coord += step_sizes;
return coord;
}

template <class T>
__host__ __device__ constexpr type operator-(T step_sizes) const
{
type coord = *this;
coord -= step_sizes;
return coord;
}

// Reposition the point of origin, and return the compensated offset.
// This is a hack to reduce index calculation during looping over
// a tensor whose origin is this TensorCoordinate. It does so by spitting
// out the run-time offset to the pointer (to the tensor data) held by this
// TensorCoordinate, so the caller can add the offset into the run-time pointer of
// the data; then only 1 run-time variable (the updated pointer) is needed, instead
// of 2 run-time variables (old pointer and this offset).
// TODO: after introducing the concept of "run-time tensor view", which contains the
// run-time pointer to the data, always keep track of the pointer, instead of both the
// offset and the pointer. This also brings the additional benefit that we don't need to
// worry that the offset might underflow (offset is an unsigned integer) when updating it.
__host__ __device__ constexpr index_t RepositionOrigin()
{
index_t offset_diff = mOffset;
mOffset = 0;
return offset_diff;
}

private:
index_t mOffset;
};

// TensorDesc is ConstantMergedTensorDescriptor
template <class TensorDesc>
struct MergedTensorCoordinate
{
using type = MergedTensorCoordinate;
using tensor_desc_type = TensorDesc;

static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
static constexpr index_t nOriginalDim =
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto idim) {
constexpr auto partial_original_dims =
tensor_desc_type::GetContainedOriginalDimensions(idim);

constexpr auto partial_original_desc =
tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mOriginalIndex, partial_original_dims));
});

// complete offset
mOffset =
accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}

template <class... Xs>
__host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
: MergedTensorCoordinate(Array<index_t, nDim>{xs...})
{
}

__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

template <class IDim, class T, bool PositiveDirection>
__host__ __device__ void
MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
{
constexpr auto idim = idim_;

// if step_size is known at compile time
static_if<is_static<T>::value>{}(
[&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });

// update original index
static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
constexpr auto partial_original_dims =
tensor_desc_type::GetContainedOriginalDimensions(idim);

constexpr index_t ndim_partial_original = partial_original_dims.GetSize();

constexpr auto partial_original_desc =
tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

const auto partial_original_step_sizes =
partial_original_desc.GetMultiIndexFrom1dIndex(step_size);

// update partial original multi-id
auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);

static_if<PositiveDirection>{}([&](auto) {
partial_original_id += partial_original_step_sizes;

bool carry = false;

// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
constexpr index_t i = ndim_partial_original - 1 - IReverse;

if(carry)
{
++partial_original_id(i);
}

carry = false;

if(partial_original_id[i] >= partial_original_desc.GetLength(i))
{
partial_original_id(i) -= partial_original_desc.GetLength(i);
carry = true;
}
});

// highest dimension
if(carry)
{
++partial_original_id(0);
}
}).Else([&](auto) {
// shift up multi-id to avoid unsigned integer underflow during intermediate
// calculations. After the shift, should have new_multi_id[...] >= 1
partial_original_id +=
partial_original_desc.GetLengths() - partial_original_step_sizes;

bool borrow = false;

// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
constexpr index_t i = ndim_partial_original - 1 - IReverse;

if(borrow)
{
--partial_original_id(i);
}

borrow = false;

if(partial_original_id[i] < partial_original_desc.GetLength(i))
{
partial_original_id(i) += partial_original_desc.GetLength(i);
borrow = true;
}
});

// highest dimension
if(borrow)
{
--partial_original_id(0);
}

// shift back down multi-id
// here, should have new_multi_id[...] >= GetLengths()
partial_original_id = partial_original_id - partial_original_desc.GetLengths();
});

// update "mOriginalIndex"
static_for<0, ndim_partial_original, 1>{}([&](auto I) {
constexpr auto idim_original = partial_original_dims[I];

mOriginalIndex(idim_original) = partial_original_id[I];
});

// calculate new partial offset on this merged dimension
const index_t old_partial_offset = mPartialOffsets[idim];

mPartialOffsets(idim) =
partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);

// update "mOffset", do "+" before "-" to avoid underflow
mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
}).Else([&](auto fwd) {
static_if<PositiveDirection>{}([&](auto) {
mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
}).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
});
}

// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

static_for<0, nDim, 1>{}([&](auto idim) {
// compiler should remove dead code path, because step_sizes is known at
// compile time
if(step_sizes[idim] != 0)
{
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
}
});

return *this;
}

template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

static_for<0, nDim, 1>{}([&](auto idim) {
// compiler should remove dead code path, because step_sizes is known at
// compile time
if(step_sizes[idim] != 0)
{
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
}
});

return *this;
}

template <class T>
__host__ __device__ constexpr type operator+(T step_sizes) const
{
type coord = *this;
coord += step_sizes;
return coord;
}

template <class T>
__host__ __device__ constexpr type operator-(T step_sizes) const
{
type coord = *this;
coord -= step_sizes;
return coord;
}

__host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

private:
// Allocate register memory for all merged dimensions and normal dimensions.
// However, only those merged dimensions whose index will be involved in arithmetic
// after the construction of this TensorCoordinate (e.g., when the user moves a slicing
// window on the merged dimension) will use this register memory.
// Let's hope the compiler will optimize away the register memory allocated for normal
// dimensions, and for merged dimensions that are never involved in index
// arithmetic after construction of the TensorCoordinate.
// TODO: refactor TensorCoordinate after introducing the concept of "dimensions",
// and simplify the implementation of ConstantMergedTensorDescriptor, so we don't need to
// count on the compiler to optimize away that register memory for us
Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets;

// complete offset
index_t mOffset;
};
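
MoveOnDimension steps a merged dimension like a multi-digit counter: add the per-original-dimension step sizes, then sweep from the fastest-varying dimension upward, carrying (or borrowing) whenever an entry passes its length. A small standalone example of the carry sweep, with made-up lengths:

#include <array>
#include <cstdio>

int main()
{
    // lengths of the original dimensions folded into one merged dimension
    std::array<int, 3> len{4, 3, 5};
    std::array<int, 3> id{1, 2, 4};   // current multi-index
    std::array<int, 3> step{0, 0, 3}; // move +3 along the merged dimension

    for(int i = 0; i < 3; ++i) { id[i] += step[i]; }

    // carry sweep from the lowest (fastest) dimension upward
    bool carry = false;
    for(int i = 2; i >= 1; --i)
    {
        if(carry) { ++id[i]; }
        carry = false;
        if(id[i] >= len[i]) { id[i] -= len[i]; carry = true; }
    }
    if(carry) { ++id[0]; } // highest dimension: no length check, as in the code above

    std::printf("%d %d %d\n", id[0], id[1], id[2]); // prints: 2 0 2
}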

template <class TensorDesc>
struct TensorCoordinate_deprecated
{
private:
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
{
return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
}

template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
{
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
}

public:
using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
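
TensorCoordinate_deprecated is a metafunction: it picks NormalTensorCoordinate_deprecated or MergedTensorCoordinate based on the descriptor kind and exposes the choice as ::type. A hedged usage sketch, assuming some ck::ConstantTensorDescriptor instantiation named Desc exists (Desc and the step values are illustrative):

// Hypothetical call site; the Array/Sequence step usage follows the
// "T is Array or Sequence" comment above, not a verified API guarantee.
using Coord = typename ck::TensorCoordinate_deprecated<Desc>::type;

__device__ void example()
{
    Coord coord(0, 0);                            // start at the tensor origin
    coord += ck::Sequence<0, 1>{};                // step along the innermost dimension
    const ck::index_t offset = coord.GetOffset(); // linearized offset for pointer math
    (void)offset;
}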
@@ -9,7 +9,7 @@ template <typename TensorDesc>
__host__ __device__ constexpr auto
make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
{
return typename TensorCoordinate_v2<TensorDesc>::type(idx);
return typename TensorCoordinate<TensorDesc>::type(idx);
}

} // namespace ck
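
After this rename the factory keeps its _v2 name but returns the new TensorCoordinate family. A hedged sketch of a call site, where desc is assumed to be some 2-D NativeTensorDescriptor instance and the MultiIndex construction syntax is assumed rather than verified:

// Hypothetical usage of the factory above.
const auto coord = ck::make_tensor_coordinate_v2(desc, ck::MultiIndex<2>{0, 0});
const auto moved = coord + ck::MultiIndex<2>{1, 0}; // step one row
const ck::index_t offset = moved.GetOffset();       // offset comes from the lower coordinate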

@@ -1,215 +0,0 @@
#ifndef CK_TENSOR_COORDINATE_V2_HPP
#define CK_TENSOR_COORDINATE_V2_HPP

#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

template <typename TensorDesc>
struct TensorCoordinate_v2;

template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
using type = NativeTensorCoordinate;
using tensor_desc_type = NativeTensorDesc;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
using Index = MultiIndex<nDim>;

__host__ __device__ constexpr NativeTensorCoordinate(Index idx)
: mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
{
}

template <typename... Xs>
__host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
: NativeTensorCoordinate(Index{xs...})
{
}

template <index_t... Xs>
__host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
: NativeTensorCoordinate(Index{Xs...})
{
}

__host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

__host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }

__host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

__host__ __device__ constexpr type operator+=(const Index& idx_diff)
{
// mIndex is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndex += idx_diff;

mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);

return *this;
}

__host__ __device__ constexpr type operator-=(const Index& idx_diff)
{
// mIndex is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndex -= idx_diff;

mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);

return *this;
}

__host__ __device__ constexpr type operator+(const Index& idx_diff) const
{
type coord = *this;
coord += idx_diff;
return coord;
}

__host__ __device__ constexpr type operator-(const Index& idx_diff) const
{
type coord = *this;
coord -= idx_diff;
return coord;
}

__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

private:
// mIndex may be saved and updated, however, the value of some (or all) of its entries may
// never be used. Compiler should be able to remove these entries as well as its calculation
// as dead code.
// TODO: make sure compiler indeed remove these dead code
Index mIndex;
index_t mOffset;
};

template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
using tensor_desc_type = TransformedTensorDesc;
using LowerCoord =
typename TensorCoordinate_v2<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
using UpperCoord = TransformedTensorCoordinate;
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
using UpperIndex = MultiIndex<nDim>;

__host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
: mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
{
}

template <typename... Xs>
__host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
: TransformedTensorCoordinate(UpperIndex{xs...})
{
}

template <index_t... Xs>
__host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
: TransformedTensorCoordinate(UpperIndex{Xs...})
{
}

__host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

__host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; }

__host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

__host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

__host__ __device__ constexpr const index_t& GetOffset() const
{
return GetLowerCoordinate().GetOffset();
}

__host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
{
// For transformation of multi-index difference, not all transformation functions need to
// know the old lower-index or the old upper-index. We pass both of them to the
// transformation function. The transformation function itself decides to use them or not.
mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

// mIndexUp is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndexUp += idx_up_diff;

return *this;
}

__host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
{
mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

// mIndexUp is updated here, but some (or all) of its entries may never be used
// compiler should remove those entries as dead code
mIndexUp -= idx_up_diff;

return *this;
}

__host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up += idx_up_diff;
return coord_up;
}

__host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
{
UpperCoord coord_up = *this;
coord_up -= idx_up_diff;
return coord_up;
}

// this function should be inexpensive, because there is no upper-to-lower index transformation
__host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
{
return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
mCoordLow.IsUpperIndexMappedToValidOffset();
}

private:
// mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
// may never be used. Compiler should be able to remove these entries as well as its
// calculation as dead code.
// TODO: make sure compiler indeed remove these dead code
UpperIndex mIndexUp;
LowerCoord mCoordLow;
};

template <typename TensorDesc>
struct TensorCoordinate_v2
{
private:
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
}

template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
}

public:
using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
@@ -4,7 +4,7 @@
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate.hpp"
#include "tensor_coordinate_deprecated.hpp"

namespace ck {

@@ -14,7 +14,7 @@ struct NormalTensorView
{
using type = NormalTensorView;
using tensor_desc_type = TensorDesc;
using coordinate_type = typename NormalTensorCoordinate<TensorDesc>::type;
using coordinate_type = typename NormalTensorCoordinate_deprecated<TensorDesc>::type;
using data_type = TData;

static constexpr auto nDim = TensorDesc::GetNumOfDimension();

@@ -5,7 +5,7 @@
#include "dimension.hpp"
#include "dimension_transform.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_coordinate_v2.hpp"
#include "tensor_coordinate.hpp"

namespace ck {

@@ -13,7 +13,7 @@ template <class TensorDescriptor>
struct TensorVisit
{
using Index = typename TensorDescriptor::Index;
using Coordinate = typename TensorCoordinate_v2<TensorDescriptor>::type;
using Coordinate = typename TensorCoordinate<TensorDescriptor>::type;

__host__ __device__ static void Run_v1(Index idx_begin)
{
@@ -4,680 +4,11 @@
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate_v2.hpp"
#include "tensor_coordinate.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif

namespace ck {

#if 0

// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread reads and writes a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1
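
A standalone sketch of the divide-work arithmetic that the constructor below performs, with made-up lengths (a 128x8 slice, 4x1 per-thread sub-tiles, a 32x8 thread cluster); names mirror the template parameters but none of this is ck code:

#include <array>
#include <cassert>

int main()
{
    constexpr int nDim = 2;
    constexpr std::array<int, nDim> slice_lengths{128, 8};  // SliceLengths
    constexpr std::array<int, nDim> sub_lengths{4, 1};      // SubLengths (per thread)
    constexpr std::array<int, nDim> cluster_lengths{32, 8}; // ThreadClusterLengths

    for(int d = 0; d < nDim; ++d)
    {
        const int data_per_cluster = sub_lengths[d] * cluster_lengths[d];
        assert(slice_lengths[d] % data_per_cluster == 0); // slice must divide evenly
        const int repeat = slice_lengths[d] / data_per_cluster;
        (void)repeat; // here repeat == 1 in both dimensions; on a merged
                      // dimension repeat must be 1 (see static_assert below)
    }
}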
template <index_t BlockSize,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();

// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;

// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets", "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside the constructor, and are
// updated if the slicing window is moved. However, they will not be used if you always move
// the slicing window along a non-merged dimension. In that case, the compiler should be
// able to remove these calculations.
// TODO: make sure the compiler would actually remove them in that case

// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;

// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;

__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");

// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");

// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};

static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});

constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;

// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};

// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");

// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths should be 1 on merged dimension");
});

// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};

// on a merged dimension that contains multiple original dimensions,
// the length of the last original dimension needs to be evenly divisible by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");

// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths should be 1 on merged dimension");
});
});

// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

const auto thread_data_id_begin = data_cluster_id * SubLengths{};

// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);

mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);

// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);

constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});

static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);

constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);

mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});

// complete offset
mThreadSrcOffset = accumulate_on_array(
|
||||
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
|
||||
|
||||
mThreadDstOffset = accumulate_on_array(
|
||||
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
|
||||
}
|
||||
|
||||
__device__ static constexpr auto GetRegisterBufferDescriptor()
|
||||
{
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
|
||||
|
||||
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
|
||||
}
|
||||
|
||||
__device__ static constexpr index_t GetRegisterBufferSize()
|
||||
{
|
||||
return GetRegisterBufferDescriptor().GetElementSpace();
|
||||
}
|
||||
|
||||
template <typename TData>
|
||||
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
|
||||
TData* __restrict__ p_buffer) const
|
||||
{
|
||||
constexpr auto thread_sub_tensor_lengths = SubLengths{};
|
||||
|
||||
constexpr auto data_per_cluster_per_dims =
|
||||
thread_sub_tensor_lengths * ThreadClusterLengths{};
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
|
||||
|
||||
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
|
||||
|
||||
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
|
||||
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
|
||||
|
||||
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
|
||||
|
||||
constexpr index_t src_offset =
|
||||
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
|
||||
|
||||
constexpr index_t buffer_offset =
|
||||
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
|
||||
#else
|
||||
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
|
||||
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
|
||||
|
||||
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
|
||||
|
||||
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
|
||||
|
||||
const index_t buffer_offset =
|
||||
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
|
||||
#endif
|
||||
|
||||
// By position the origin of the per-thread window at the point, where multi-index
|
||||
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
|
||||
// is assuming each thread is copy a noraml (not merged) tensor.
|
||||
// To satisfy this assumption, the user need to make sure that, on a merged dimension
|
||||
// that constains multiple original dimensions, the length of the last original
|
||||
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
|
||||
// the merged dimension need to be 1. These sanity checks are performed in constructor
|
||||
// of BlockwiseGenericTensorSliceCopy_v1
|
||||
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
|
||||
decltype(thread_buffer_desc),
|
||||
SubLengths,
|
||||
SrcDimAccessOrder,
|
||||
SrcVectorAccessDim,
|
||||
SrcDataPerAccess,
|
||||
1>(make_zero_array<index_t, nDim>(),
|
||||
make_zero_array<index_t, nDim>())
|
||||
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename TData>
|
||||
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
|
||||
TData* __restrict__ p_dst) const
|
||||
{
|
||||
constexpr auto thread_sub_tensor_lengths = SubLengths{};
|
||||
|
||||
constexpr auto data_per_cluster_per_dims =
|
||||
thread_sub_tensor_lengths * ThreadClusterLengths{};
|
||||
|
||||
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
|
||||
|
||||
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
|
||||
|
||||
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
|
||||
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
|
||||
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
|
||||
|
||||
constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
|
||||
|
||||
constexpr index_t buffer_offset =
|
||||
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
|
||||
|
||||
constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
|
||||
#else
|
||||
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
|
||||
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
|
||||
|
||||
const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;
|
||||
|
||||
const index_t buffer_offset =
|
||||
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
|
||||
|
||||
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
|
||||
#endif
|
||||
|
||||
// By position the origin of the per-thread window at the point, where multi-index
|
||||
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
|
||||
// is assuming each thread is copy a noraml (not merged) tensor.
|
||||
// To satisfy this assumption, the user need to make sure that, on a merged dimension
|
||||
// that constains multiple original dimensions, the length of the last original
|
||||
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
|
||||
// the merged dimension need to be 1. These sanity checks are performed in constructor
|
||||
// of BlockwiseGenericTensorSliceCopy_v1
|
||||
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
|
||||
DstDesc,
|
||||
SubLengths,
|
||||
DstDimAccessOrder,
|
||||
DstVectorAccessDim,
|
||||
1,
|
||||
DstDataPerAccess>(
|
||||
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
|
||||
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename TData>
|
||||
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
|
||||
{
|
||||
TData p_buffer[GetRegisterBufferSize()];
|
||||
|
||||
RunLoadRegisterBuffer(p_src, p_buffer);
|
||||
RunStoreRegisterBuffer(p_buffer, p_dst);
|
||||
}
|
||||
|
||||
// When moving the slicing windows along a merged dimension, if the strides of the
|
||||
// contained (by the merged dimension) original dimensions are not in descending order,
|
||||
// then there is no guarantee that the new offset will be larger than the old offset
|
||||
// for movement in positive direction (vice versue for movement in negative direction).
|
||||
// As a result, there is the possiblity that the offset calculation may result in
|
||||
// unsigned integer underflow (due to "-" operation). However, this hazard should not
|
||||
// happen, as long as the users make sure the slicing window would not be moved out of
|
||||
// the boundary of the tensor being sliced. This functions doesn't do runtime sanity
|
||||
// check on out-of-bound slicing window, for performance reason
|
||||
template <index_t IDim_, index_t StepSize, bool PositiveDirection>
|
||||
__device__ void MoveSlicingWindowOnSourceTensor(
|
||||
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
|
||||
{
|
||||
constexpr auto IDim = Number<IDim_>{};
|
||||
|
||||
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
|
||||
// logic for a merged dimension, also works for non-merged dimension, but its logic may
|
||||
// be unncessarily complicated for compiler to remove calculations that are useless for
|
||||
// a non-merged dimension
|
||||
|
||||
// extract partial original dimensions
|
||||
constexpr auto src_partial_original_dims =
|
||||
SrcDesc::GetContainedOriginalDimensions(IDim);
|
||||
|
||||
constexpr auto src_partial_original_desc =
|
||||
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
|
||||
|
||||
// calculate new partial original multi-id
|
||||
auto old_src_partial_original_id =
|
||||
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);
|
||||
|
||||
auto new_src_partial_original_id =
|
||||
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
|
||||
old_src_partial_original_id, StepSize, direction);
|
||||
|
||||
// update "mThreadSrcOriginalMultiId"
|
||||
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
|
||||
constexpr auto IDimOriginal = src_partial_original_dims[I];
|
||||
|
||||
mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
|
||||
});
|
||||
|
||||
// calculate new partial offset on this merged dimension
|
||||
const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];
|
||||
|
||||
const index_t new_src_partial_offset =
|
||||
src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);
|
||||
|
||||
// update "mThreadSrcPartialOffsets"
|
||||
mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;
|
||||
|
||||
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
|
||||
mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
|
||||
}).Else([&](auto) {
|
||||
// Logic for non-merged dimension. If you are never going to move the slicing window on
|
||||
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
|
||||
// which are being calculated here, will never be used later. In this case, compiler
|
||||
// should be able to remove these calculations.
|
||||
// TODO: make sure compiler would actually remove them in this case.
|
||||
|
||||
// It is the user's responsiblity to make sure the slicing window will not be moved out
|
||||
// of the boundary of the tensor being sliced. Otherwise, there might be hazard like
|
||||
// unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
|
||||
|
||||
constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();
|
||||
|
||||
static_if<PositiveDirection>{}([&](auto fwd) {
|
||||
mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);
|
||||
|
||||
mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;
|
||||
|
||||
mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
|
||||
}).Else([&](auto fwd) {
|
||||
mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
|
||||
|
||||
mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;
|
||||
|
||||
mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
|
||||
});
|
||||
});
|
||||
}
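
    // Worked example of the underflow hazard above (illustrative numbers, not from the
    // original source): suppose a merged dimension contains two original dimensions with
    // lengths {2, 4} and strides {1, 8}, i.e. the strides are not descending. Moving the
    // window by +1 from partial multi-id {0, 3} wraps it to {1, 0}: the old partial offset
    // is 0*1 + 3*8 = 24, while the new one is 1*1 + 0*8 = 1. Computing "offset - old + new"
    // with an unsigned index_t could underflow before the "+" is applied, which is why the
    // update above adds the new partial offset before subtracting the old one.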

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        static_for<0, nDim, 1>{}([&](auto idim) {
            if(step_sizes[idim] != 0)
            {
                MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
            }
        });
    }
};
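
// A minimal usage sketch of BlockwiseGenericTensorSliceCopy_v1 (hypothetical shapes,
// descriptors and pointer names, not part of the original source; kept under #if 0 as an
// illustration only): a 256-thread block copies a 128x8 slice, each thread owning a 4x1
// sub-tensor, so the 32x8 thread cluster covers the whole slice in one pass
// (repeat_lengths = {1, 1}).
#if 0
using ExampleDesc = decltype(make_ConstantTensorDescriptor_packed(Sequence<128, 8>{}));

using ExampleCopy = BlockwiseGenericTensorSliceCopy_v1<256,              // BlockSize
                                                       ExampleDesc,      // SrcDesc
                                                       ExampleDesc,      // DstDesc
                                                       Sequence<128, 8>, // SliceLengths
                                                       Sequence<4, 1>,   // SubLengths
                                                       Sequence<32, 8>,  // ThreadClusterLengths
                                                       Sequence<0, 1>,   // ThreadClusterArrangeOrder
                                                       Sequence<0, 1>,   // SrcDimAccessOrder
                                                       Sequence<0, 1>,   // DstDimAccessOrder
                                                       0,                // SrcVectorAccessDim
                                                       0,                // DstVectorAccessDim
                                                       1,                // SrcDataPerAccess
                                                       1>;               // DstDataPerAccess

__device__ void example_block_copy(const float* p_src, float* p_dst)
{
    // both slice windows start at the origin of their tensors
    ExampleCopy block_copy(make_zero_array<index_t, 2>(), make_zero_array<index_t, 2>());
    block_copy.Run(p_src, p_dst);
}
#endif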

// This version uses TensorCoordinate
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin,
                                                            const Index& dst_block_slice_origin)
    {
        static_assert(
            nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                nDim == ThreadClusterLengths::GetSize() &&
                nDim == ThreadClusterArrangeOrder::GetSize() &&
                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
            "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());

        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ static constexpr index_t GetRegisterBufferSize()
    {
        return RegisterBufferDesc::GetElementSpace();
    }

    template <typename TData>
    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
    {
#if 0
        mThreadwiseLoad.Run(p_src, p_buffer);
#else
        // hardcoded: global to register
        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
#endif
    }

    template <typename TData>
    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
    {
#if 0
        mThreadwiseStore.Run(p_buffer, p_dst);
#else
        // hardcoded: register to LDS
        mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
#endif
    }

    template <typename TData>
    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        TData p_buffer[GetRegisterBufferSize()];

        RunLoadRegisterBuffer(p_src, p_buffer);
        RunStoreRegisterBuffer(p_buffer, p_dst);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                                                 RegisterBufferDesc,
                                                                 SubLengths,
                                                                 SrcDimAccessOrder,
                                                                 SrcDimAccessOrder,
                                                                 SrcVectorAccessDim,
                                                                 SrcVectorAccessDim,
                                                                 SrcDataPerAccess,
                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
                                                                  DstDesc,
                                                                  SubLengths,
                                                                  DstDimAccessOrder,
                                                                  DstDimAccessOrder,
                                                                  DstVectorAccessDim,
                                                                  DstVectorAccessDim,
                                                                  1,
                                                                  DstDataPerAccess>;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};
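
// Note on the v2 design: unlike v1, which tracks per-thread partial offsets and original
// multi-ids itself, v2 delegates all index bookkeeping to the two embedded threadwise copy
// objects, each holding its own slice origin; moving the block's slicing window then
// reduces to forwarding the step sizes to the threadwise load/store.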

// This version uses TensorView and TensorCoordinate
template <index_t BlockSize,
          typename SrcTensor,
          typename DstTensor,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v3
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
    using data_type = remove_cv_t<typename SrcTensor::data_type>;

    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block,
                                                            SrcCoordinate src_block_slice_origin,
                                                            DstTensor dst_block,
                                                            DstCoordinate dst_block_slice_origin)
        : mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)}
    {
        static_assert(
            nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() &&
                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                nDim == ThreadClusterLengths::GetSize() &&
                nDim == ThreadClusterArrangeOrder::GetSize() &&
                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
            "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
                              remove_cv_t<typename DstTensor::data_type>>{},
                      "wrong! type conversion not supported yet");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad = ThreadwiseLoad(src_block,
                                         src_block_slice_origin + thread_data_id_begin,
                                         mThreadBuffer,
                                         make_zero_array<index_t, nDim>());

        mThreadwiseStore = ThreadwiseStore(mThreadBuffer,
                                           make_zero_array<index_t, nDim>(),
                                           dst_block,
                                           dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); }

    __device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); }

    __device__ void Run()
    {
        mThreadwiseLoad.Run();
        mThreadwiseStore.Run();
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
    using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
                                                                 ThreadBufferTensor,
                                                                 SubLengths,
                                                                 SrcDimAccessOrder,
                                                                 SrcDimAccessOrder,
                                                                 SrcVectorAccessDim,
                                                                 SrcVectorAccessDim,
                                                                 SrcDataPerAccess,
                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
                                                                  DstTensor,
                                                                  SubLengths,
                                                                  DstDimAccessOrder,
                                                                  DstDimAccessOrder,
                                                                  DstVectorAccessDim,
                                                                  DstVectorAccessDim,
                                                                  1,
                                                                  DstDataPerAccess>;

    data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];

    ThreadBufferTensor mThreadBuffer;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};
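
// Note on the v3 design: the per-thread staging buffer is owned by the copy object itself
// (mpBuffer, wrapped in a TensorView), and both block tensors are bound at construction,
// which is why RunLoadRegisterBuffer/RunStoreRegisterBuffer take no pointer arguments.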

#endif

template <index_t BlockSize,
          typename BlockSrcDesc,
          typename BlockDstDesc,

@@ -0,0 +1,692 @@

#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
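
// When this flag is 1, the v1 load/store below iterate over the repeats with static_ford,
// so the per-repeat multi-indices and the derived src/dst/buffer offsets are all
// compile-time constants; when it is 0, they fall back to ford, with the offsets computed
// at runtime.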

namespace ck {

// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in the constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    static constexpr index_t nOriginalDimSrc =
        SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
    static constexpr index_t nOriginalDimDst =
        DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();

    // per-thread offset
    index_t mThreadSrcOffset;
    index_t mThreadDstOffset;

    // "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets", "mThreadDstOriginalMultiId",
    // "mThreadDstPartialOffsets" are always calculated inside the constructor, and are
    // updated when the slicing window is moved. However, they will not be used if you always
    // move the slicing window along a non-merged dimension. In that case, the compiler should
    // be able to remove these calculations.
    // TODO: make sure the compiler actually removes them in that case

    // partial offset in each (merged) dimension
    Array<index_t, nDim> mThreadSrcPartialOffsets;
    Array<index_t, nDim> mThreadDstPartialOffsets;

    // multi-id of original tensor
    Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
    Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;

    __device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
                                                  Array<index_t, nDim> dst_block_data_id_begin)
    {
        // check nDim consistency
        static_assert(
            nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                nDim == ThreadClusterLengths::GetSize() &&
                nDim == ThreadClusterArrangeOrder::GetSize() &&
                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
            "wrong");

        // check that thread arrange order and read/write access orders are valid
        static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
                          is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong!");

        // thread cluster
        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        // BlockSize
        static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

        // divide work
        constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};

        static_for<0, nDim, 1>{}([&](auto IDim) {
            static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
                          "wrong! cannot evenly divide sliced tensor into cluster");
        });

        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;

        // additional checks for merged dimensions
        static_for<0, nDim, 1>{}([&](auto IDim_) {
            // src
            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
                constexpr auto IDim = decltype(IDim_){};

                // On a merged dimension that contains multiple original dimensions,
                // the length of the last original dimension needs to be evenly divisible
                // by its sub-length, so each thread is effectively reading a normal
                // (not merged) tensor
                constexpr auto idim_last_original_src =
                    SrcDesc::GetContainedOriginalDimensions(IDim).Back();
                static_assert(
                    SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
                            SubLengths::Get(IDim) ==
                        0,
                    "wrong!");

                // merged dimension should have repeat_lengths = 1
                static_assert(repeat_lengths[IDim] == 1,
                              "wrong! repeat_lengths should be 1 on merged dimension");
            });

            // dst
            static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
                constexpr auto IDim = decltype(IDim_){};

                // On a merged dimension that contains multiple original dimensions,
                // the length of the last original dimension needs to be evenly divisible
                // by its sub-length, so each thread is effectively writing a normal
                // (not merged) tensor
                constexpr auto idim_last_original_dst =
                    DstDesc::GetContainedOriginalDimensions(IDim).Back();
                static_assert(
                    DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
                            SubLengths::Get(IDim) ==
                        0,
                    "wrong!");

                // merged dimension should have repeat_lengths = 1
                static_assert(repeat_lengths[IDim] == 1,
                              "wrong! repeat_lengths should be 1 on merged dimension");
            });
        });

        // calculate mThreadSrcOffset, mThreadDstOffset
        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        // original multi-id
        mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
            src_block_data_id_begin + thread_data_id_begin);

        mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
            dst_block_data_id_begin + thread_data_id_begin);

        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto src_partial_original_dims =
                SrcDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto src_partial_original_desc =
                SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

            mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
        });

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto dst_partial_original_dims =
                DstDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto dst_partial_original_desc =
                DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);

            mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
        });

        // complete offset
        mThreadSrcOffset = accumulate_on_array(
            mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));

        mThreadDstOffset = accumulate_on_array(
            mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    __device__ static constexpr auto GetRegisterBufferDescriptor()
    {
        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
    }

    __device__ static constexpr index_t GetRegisterBufferSize()
    {
        return GetRegisterBufferDescriptor().GetElementSpace();
    }

    template <typename TData>
    __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
                                          TData* __restrict__ p_buffer) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

        constexpr auto data_per_cluster_per_dims =
            thread_sub_tensor_lengths * ThreadClusterLengths{};

        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;

            constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            constexpr index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);

            constexpr index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;

            const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);

            const index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif

            // Position the origin of the per-thread window at the point where the multi-index
            // of SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice
            // copy assumes each thread is copying a normal (not merged) tensor.
            // To satisfy this assumption, the user needs to make sure that, on a merged
            // dimension that contains multiple original dimensions, the length of the last
            // original dimension is evenly divisible by its sub-length. Also, the
            // repeat-length on the merged dimension needs to be 1. These sanity checks are
            // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1
            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
                                                  decltype(thread_buffer_desc),
                                                  SubLengths,
                                                  SrcDimAccessOrder,
                                                  SrcVectorAccessDim,
                                                  SrcDataPerAccess,
                                                  1>(make_zero_array<index_t, nDim>(),
                                                     make_zero_array<index_t, nDim>())
                .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
        });
    }

    template <typename TData>
    __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
                                           TData* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

        constexpr auto data_per_cluster_per_dims =
            thread_sub_tensor_lengths * ThreadClusterLengths{};

        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;

            constexpr index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);

            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
            const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;

            const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims;

            const index_t buffer_offset =
                thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);

            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif

            // Position the origin of the per-thread window at the point where the multi-index
            // of DstDesc (which might be a merged tensor) is all-zero. This threadwise slice
            // copy assumes each thread is copying a normal (not merged) tensor.
            // To satisfy this assumption, the user needs to make sure that, on a merged
            // dimension that contains multiple original dimensions, the length of the last
            // original dimension is evenly divisible by its sub-length. Also, the
            // repeat-length on the merged dimension needs to be 1. These sanity checks are
            // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1
            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
                                                  DstDesc,
                                                  SubLengths,
                                                  DstDimAccessOrder,
                                                  DstVectorAccessDim,
                                                  1,
                                                  DstDataPerAccess>(
                make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
                .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
        });
    }

    template <typename TData>
    __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
    {
        TData p_buffer[GetRegisterBufferSize()];

        RunLoadRegisterBuffer(p_src, p_buffer);
        RunStoreRegisterBuffer(p_buffer, p_dst);
    }

    // When moving the slicing window along a merged dimension, if the strides of the
    // original dimensions contained by the merged dimension are not in descending order,
    // then there is no guarantee that the new offset will be larger than the old offset
    // for movement in the positive direction (and vice versa for movement in the negative
    // direction). As a result, there is the possibility that the offset calculation
    // results in unsigned integer underflow (due to the "-" operation). However, this
    // hazard should not happen, as long as the user makes sure the slicing window is not
    // moved out of the boundary of the tensor being sliced. This function doesn't do a
    // runtime sanity check on out-of-bound slicing windows, for performance reasons.
    template <index_t IDim_, index_t StepSize, bool PositiveDirection>
    __device__ void MoveSlicingWindowOnSourceTensor(
        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
    {
        constexpr auto IDim = Number<IDim_>{};

        static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
            // Logic for a merged dimension. It also works for a non-merged dimension, but
            // may then be unnecessarily complicated for the compiler to remove calculations
            // that are useless for a non-merged dimension.

            // extract partial original dimensions
            constexpr auto src_partial_original_dims =
                SrcDesc::GetContainedOriginalDimensions(IDim);

            constexpr auto src_partial_original_desc =
                SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

            // calculate new partial original multi-id
            auto old_src_partial_original_id =
                extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims);

            auto new_src_partial_original_id =
                src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
                    old_src_partial_original_id, StepSize, direction);

            // update "mThreadSrcOriginalMultiId"
            static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) {
                constexpr auto IDimOriginal = src_partial_original_dims[I];

                mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim];

            const index_t new_src_partial_offset =
                src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id);

            // update "mThreadSrcPartialOffsets"
            mThreadSrcPartialOffsets(IDim) = new_src_partial_offset;

            // update "mThreadSrcOffset"; do "+" before "-" to avoid underflow
            mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
        }).Else([&](auto) {
            // Logic for a non-merged dimension. If you are never going to move the slicing
            // window on a merged dimension, then "mThreadSrcOriginalMultiId" and
            // "mThreadSrcPartialOffsets", which are being calculated here, will never be
            // used later. In this case, the compiler should be able to remove these
            // calculations.
            // TODO: make sure the compiler actually removes them in this case.

            // It is the user's responsibility to make sure the slicing window is not moved
            // out of the boundary of the tensor being sliced. Otherwise, there may be
            // hazards like unsigned integer underflow. There is NO runtime sanity check to
            // prevent the hazard.

            constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front();

            static_if<PositiveDirection>{}([&](auto fwd) {
                mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);

                mThreadSrcOriginalMultiId(IDimOriginal) += StepSize;

                mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
            }).Else([&](auto fwd) {
                mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);

                mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize;

                mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
            });
        });
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        static_for<0, nDim, 1>{}([&](auto idim) {
            if(step_sizes[idim] != 0)
            {
                MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction);
            }
        });
    }
};
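
// Worked example of the merged-dimension constraints above (illustrative numbers, not from
// the original source): if a merged dimension E folds original dimensions with lengths
// {8, 3, 3} (say C, Y, X), the sub-length assigned to E must evenly divide the LAST
// original length (here 3), and the slice on E must be covered in a single pass
// (repeat length 1); so a sub-length of 1 or 3 on E is valid, but 2 is not.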

// This version uses TensorCoordinate
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
// The memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
          typename SrcDesc,
          typename DstDesc,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v2
{
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin,
                                                            const Index& dst_block_slice_origin)
    {
        static_assert(
            nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                nDim == ThreadClusterLengths::GetSize() &&
                nDim == ThreadClusterArrangeOrder::GetSize() &&
                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
            "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());

        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ static constexpr index_t GetThreadBufferSize()
    {
        return ThreadBufferDesc::GetElementSpace();
    }

    template <typename TData,
              address_space_t BlockSrcAddressSpace     = address_space_t::generic,
              address_space_t ThreadBufferAddressSpace = address_space_t::generic>
    __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
    {
#if 0
        mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
#else // tweaking
        mThreadwiseLoad.template Run_optimized_address_calculation<TData,
                                                                   BlockSrcAddressSpace,
                                                                   ThreadBufferAddressSpace>(
            p_block_src, p_thread_buffer);
#endif
    }

    template <typename TData,
              address_space_t ThreadBufferAddressSpace = address_space_t::generic,
              address_space_t BlockDstAddressSpace     = address_space_t::generic>
    __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
    {
#if 0
        mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
#else // tweaking
        mThreadwiseStore.template Run_optimized_address_calculation<TData,
                                                                    ThreadBufferAddressSpace,
                                                                    BlockDstAddressSpace>(
            p_thread_buffer, p_block_dst);
#endif
    }

    template <typename TData,
              address_space_t BlockSrcAddressSpace = address_space_t::generic,
              address_space_t BlockDstAddressSpace = address_space_t::generic>
    __device__ void Run(const TData* p_block_src, TData* p_block_dst) const
    {
        TData p_thread_buffer[GetThreadBufferSize()];

        RunLoadThreadBuffer<TData, BlockSrcAddressSpace, address_space_t::generic>(
            p_block_src, p_thread_buffer);
        RunStoreThreadBuffer<TData, address_space_t::generic, BlockDstAddressSpace>(
            p_thread_buffer, p_block_dst);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
                                                                 ThreadBufferDesc,
                                                                 SubLengths,
                                                                 SrcDimAccessOrder,
                                                                 SrcDimAccessOrder,
                                                                 SrcVectorAccessDim,
                                                                 SrcVectorAccessDim,
                                                                 SrcDataPerAccess,
                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1<ThreadBufferDesc,
                                                                  DstDesc,
                                                                  SubLengths,
                                                                  DstDimAccessOrder,
                                                                  DstDimAccessOrder,
                                                                  DstVectorAccessDim,
                                                                  DstVectorAccessDim,
                                                                  1,
                                                                  DstDataPerAccess>;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};
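
// A minimal usage sketch (hypothetical names, not part of the original source; kept under
// #if 0 as an illustration only): copy a block slice from global memory into a block
// destination through the per-thread buffer. Passing address_space_t::global for the
// source lets the experimental optimized address calculation specialize the loads; the
// destination address space is left at its generic default here.
#if 0
__device__ void example_global_to_block(const float* p_global_src, float* p_block_dst)
{
    // some_blockwise_copy is assumed to be an instance of BlockwiseGenericTensorSliceCopy_v2
    some_blockwise_copy.template Run<float, address_space_t::global>(p_global_src, p_block_dst);
}
#endif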

// This version uses TensorView and TensorCoordinate_deprecated
template <index_t BlockSize,
          typename SrcTensor,
          typename DstTensor,
          typename SliceLengths,
          typename SubLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v3
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();
    using data_type = remove_cv_t<typename SrcTensor::data_type>;

    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v3(SrcTensor src_block,
                                                            SrcCoordinate src_block_slice_origin,
                                                            DstTensor dst_block,
                                                            DstCoordinate dst_block_slice_origin)
        : mThreadBuffer{make_TensorView(ThreadBufferDesc{}, mpBuffer)}
    {
        static_assert(
            nDim == SrcTensor::GetNumOfDimension() && nDim == DstTensor::GetNumOfDimension() &&
                nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                nDim == ThreadClusterLengths::GetSize() &&
                nDim == ThreadClusterArrangeOrder::GetSize() &&
                nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
            "wrong! nDim not consistent");

        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");

        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
                              remove_cv_t<typename DstTensor::data_type>>{},
                      "wrong! type conversion not supported yet");

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

        const auto data_cluster_id =
            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

        const auto thread_data_id_begin = data_cluster_id * SubLengths{};

        mThreadwiseLoad = ThreadwiseLoad(src_block,
                                         src_block_slice_origin + thread_data_id_begin,
                                         mThreadBuffer,
                                         make_zero_array<index_t, nDim>());

        mThreadwiseStore = ThreadwiseStore(mThreadBuffer,
                                           make_zero_array<index_t, nDim>(),
                                           dst_block,
                                           dst_block_slice_origin + thread_data_id_begin);
    }

    __device__ void RunLoadRegisterBuffer() { mThreadwiseLoad.Run(); }

    __device__ void RunStoreRegisterBuffer() const { mThreadwiseStore.Run(); }

    __device__ void Run()
    {
        mThreadwiseLoad.Run();
        mThreadwiseStore.Run();
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
    }

    template <typename T, bool PositiveDirection>
    __device__ void
    MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
    {
        mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
    }

    private:
    using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
    using ThreadBufferTensor = NormalTensorView<ThreadBufferDesc, data_type>;

    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1<SrcTensor,
                                                                 ThreadBufferTensor,
                                                                 SubLengths,
                                                                 SrcDimAccessOrder,
                                                                 SrcDimAccessOrder,
                                                                 SrcVectorAccessDim,
                                                                 SrcVectorAccessDim,
                                                                 SrcDataPerAccess,
                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1<ThreadBufferTensor,
                                                                  DstTensor,
                                                                  SubLengths,
                                                                  DstDimAccessOrder,
                                                                  DstDimAccessOrder,
                                                                  DstVectorAccessDim,
                                                                  DstVectorAccessDim,
                                                                  1,
                                                                  DstDataPerAccess>;

    data_type mpBuffer[ThreadBufferDesc::GetElementSpace()];

    ThreadBufferTensor mThreadBuffer;

    ThreadwiseLoad mThreadwiseLoad;
    ThreadwiseStore mThreadwiseStore;
};

} // namespace ck

#endif
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -3,7 +3,7 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"

template <class T,

@@ -3,7 +3,7 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

using namespace ck;

@@ -164,7 +164,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_conv =
#if 1
#if 0
        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
#else
        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer

@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 0
#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 128;

@@ -14,11 +14,11 @@
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"

struct GeneratorTensor_1

@@ -103,7 +103,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
    // 1x1 filter, 8x8 image
    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
    constexpr index_t N = 64;

@@ -295,7 +295,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
#elif 0
#elif 1
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
    constexpr index_t N = 128;

@@ -341,7 +341,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
#elif 1
#elif 0
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N = 128;
    constexpr index_t C = 128;