mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 04:31:25 +00:00
Compile for gfx908 and gfx90a (#130)
* adding compilation for multiple targets * fix build * clean * update Jekinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
This commit is contained in:
@@ -56,7 +56,7 @@ template <typename SrcData,
|
||||
typename DimAccessOrder,
|
||||
index_t DstVectorDim,
|
||||
index_t DstScalarPerVector,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
index_t DstScalarStrideInVector,
|
||||
bool DstResetCoordinateAfterRun,
|
||||
typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
|
||||
@@ -407,7 +407,7 @@ struct ThreadwiseTensorSliceTransfer_v2
|
||||
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
|
||||
// 4. Use thread buffer
|
||||
template <typename SliceLengths,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
@@ -464,8 +464,8 @@ struct ThreadwiseTensorSliceTransfer_v3
|
||||
__device__ void
|
||||
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
|
||||
{
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -621,8 +621,8 @@ struct ThreadwiseTensorSliceTransfer_v3
|
||||
__device__ void
|
||||
RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
|
||||
{
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -979,7 +979,7 @@ struct ThreadwiseTensorSliceTransfer_v3
|
||||
|
||||
static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
|
||||
|
||||
SrcCoord src_coord_;
|
||||
DstCoord dst_coord_;
|
||||
|
||||
@@ -1,523 +0,0 @@
|
||||
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP
|
||||
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R4_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
|
||||
// and sometimes useless instructions:
|
||||
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
|
||||
// instead
|
||||
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
|
||||
// tensor coordinate instead
|
||||
// 3. Don't use a pointer to VGPR buffer, use vector instead
|
||||
|
||||
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
|
||||
// TODO: fix this
|
||||
// Assume:
|
||||
// 1. src:
|
||||
// 1. SrcDesc is known at compile-time
|
||||
// 2. SrcBuffer is StaticBuffer
|
||||
// 3. SrcSliceOrginIdx is known at compile-time
|
||||
// 2. dst:
|
||||
// 1. DstDesc is not known at compile-time
|
||||
// 2. DstBuffer is DynamicBuffer
|
||||
// 3. DstSliceOrginIdx is not known at compile time
|
||||
template <typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
typename DstDesc,
|
||||
typename Dst0Desc, // this is really one of sources, but it has same shape as DstDesc
|
||||
typename Dst1Desc, // this is really one of sources, but it has same shape as DstDesc
|
||||
typename DstElementwiseOperation,
|
||||
typename SliceLengths,
|
||||
typename DimAccessOrder,
|
||||
index_t DstVectorDim,
|
||||
index_t DstScalarPerVector,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
index_t DstScalarStrideInVector,
|
||||
bool DstResetCoordinateAfterRun,
|
||||
typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
|
||||
struct ThreadwiseTensorSliceTransfer_v1r4
|
||||
{
|
||||
static constexpr index_t nDim = SliceLengths::Size();
|
||||
|
||||
using Index = MultiIndex<nDim>;
|
||||
|
||||
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
|
||||
using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
|
||||
using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{}));
|
||||
|
||||
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
|
||||
using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
|
||||
using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{}));
|
||||
|
||||
__device__ constexpr ThreadwiseTensorSliceTransfer_v1r4(
|
||||
const DstDesc& dst_desc,
|
||||
const Dst0Desc& dst0_desc,
|
||||
const Dst1Desc& dst1_desc,
|
||||
const Index& dst_slice_origin_idx,
|
||||
const DstElementwiseOperation& dst_element_op)
|
||||
: dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
|
||||
dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)),
|
||||
dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin_idx)),
|
||||
dst_element_op_{dst_element_op}
|
||||
{
|
||||
static_assert(SrcDesc::IsKnownAtCompileTime(),
|
||||
"wrong! SrcDesc need to known at compile-time");
|
||||
}
|
||||
|
||||
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
|
||||
{
|
||||
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
|
||||
}
|
||||
|
||||
template <typename SrcSliceOriginIdx,
|
||||
typename SrcBuffer,
|
||||
typename DstBuffer,
|
||||
typename Dst0Buffer,
|
||||
typename Dst1Buffer,
|
||||
typename DstStepHacks,
|
||||
typename Dst0StepHacks,
|
||||
typename Dst1StepHacks>
|
||||
__device__ void Run(const SrcDesc&,
|
||||
const SrcSliceOriginIdx&,
|
||||
const SrcBuffer& src_buf,
|
||||
const DstDesc& dst_desc,
|
||||
DstBuffer& dst_buf,
|
||||
const DstStepHacks& dst_step_hacks,
|
||||
const Dst0Desc& dst0_desc,
|
||||
const Dst0Buffer& dst0_buf,
|
||||
const Dst0StepHacks& dst0_step_hacks,
|
||||
const Dst1Desc& dst1_desc,
|
||||
const Dst1Buffer& dst1_buf,
|
||||
const Dst1StepHacks& dst1_step_hacks)
|
||||
{
|
||||
static_assert(SrcDesc::IsKnownAtCompileTime(),
|
||||
"wrong! SrcDesc need to known at compile-time");
|
||||
|
||||
static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value,
|
||||
"wrong! SrcSliceOrigin need to known at compile-time");
|
||||
|
||||
static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
|
||||
|
||||
// SrcDesc and src_slice_origin_idx are known at compile-time
|
||||
constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
|
||||
constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
// scalar per access on each dim
|
||||
// TODO: don't use lambda_scalar_per_access
|
||||
constexpr auto dst_scalar_per_access = generate_sequence(
|
||||
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
|
||||
|
||||
constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
|
||||
|
||||
constexpr auto dim_access_order = DimAccessOrder{};
|
||||
|
||||
constexpr auto ordered_access_lengths =
|
||||
container_reorder_given_new2old(access_lengths, dim_access_order);
|
||||
|
||||
// make forward steps: dst
|
||||
const auto dst_forward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index forward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make forward steps: dst0
|
||||
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
|
||||
// DstScalarPerVector
|
||||
// TODO: fix this
|
||||
const auto dst0_forward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index forward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst0_desc, forward_step_idx, dst0_step_hacks[I0][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make forward steps: dst1
|
||||
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
|
||||
// DstScalarPerVector
|
||||
// TODO: fix this
|
||||
const auto dst1_forward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index forward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst1_desc, forward_step_idx, dst1_step_hacks[I0][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make backward steps: dst
|
||||
const auto dst_backward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index backward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make backward steps: dst0
|
||||
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
|
||||
// DstScalarPerVector
|
||||
// TODO: fix this
|
||||
const auto dst0_backward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index backward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst0_desc, backward_step_idx, dst0_step_hacks[I1][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make backward steps: dst1
|
||||
// WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same
|
||||
// DstScalarPerVector
|
||||
// TODO: fix this
|
||||
const auto dst1_backward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index backward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst1_desc, backward_step_idx, dst1_step_hacks[I1][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// loop over tensor and copy
|
||||
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
|
||||
// judge move forward or move backward
|
||||
constexpr auto forward_sweep = [&]() {
|
||||
StaticallyIndexedArray<bool, nDim> forward_sweep_;
|
||||
|
||||
forward_sweep_(I0) = true;
|
||||
|
||||
static_for<1, nDim, 1>{}([&](auto i) {
|
||||
index_t tmp = ordered_access_idx[I0];
|
||||
|
||||
static_for<1, i, 1>{}([&](auto j) {
|
||||
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
|
||||
});
|
||||
|
||||
forward_sweep_(i) = tmp % 2 == 0;
|
||||
});
|
||||
|
||||
return forward_sweep_;
|
||||
}();
|
||||
|
||||
// calculate dst data index
|
||||
constexpr auto dst_data_idx = [&]() {
|
||||
Index ordered_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
ordered_idx(i) = forward_sweep[i]
|
||||
? ordered_access_idx[i]
|
||||
: ordered_access_lengths[i] - 1 - ordered_access_idx[i];
|
||||
});
|
||||
|
||||
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
|
||||
dst_scalar_per_access;
|
||||
}();
|
||||
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
|
||||
|
||||
using dst_vector_t =
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
|
||||
|
||||
// load dst0 and dst1 and apply elementwise operation
|
||||
{
|
||||
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
|
||||
// TODO: fix this
|
||||
static_assert(DstScalarPerVector == 1, "wrong!");
|
||||
|
||||
// copy data from src_buf into dst_vector_src_data
|
||||
constexpr index_t src_offset =
|
||||
src_desc.CalculateOffset(src_slice_origin_idx + dst_data_idx);
|
||||
|
||||
const SrcData src_v = src_buf[Number<src_offset>{}];
|
||||
|
||||
// load dst0 and dst1
|
||||
const bool is_dst0_valid =
|
||||
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst0_desc,
|
||||
dst0_coord_);
|
||||
const bool is_dst1_valid =
|
||||
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst1_desc,
|
||||
dst1_coord_);
|
||||
|
||||
const DstData dst0_v =
|
||||
dst0_buf.template Get<DstData>(dst0_coord_.GetOffset(), is_dst0_valid);
|
||||
const DstData dst1_v =
|
||||
dst1_buf.template Get<DstData>(dst1_coord_.GetOffset(), is_dst1_valid);
|
||||
|
||||
#if !CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE
|
||||
// apply element-wise operation in SrcData type
|
||||
const SrcData dst_v = dst_element_op_(
|
||||
src_v, type_convert<SrcData>(dst0_v), type_convert<SrcData>(dst1_v));
|
||||
|
||||
// apply type convert
|
||||
dst_vector.template AsType<DstData>()(Number<0>{}) = type_convert<DstData>(dst_v);
|
||||
#else
|
||||
// apply element-wise operation in DstData type
|
||||
DstData dst_v;
|
||||
|
||||
dst_element_op_(dst_v, src_v, dst0_v, dst1_v);
|
||||
|
||||
dst_vector.template AsType<DstData>()(Number<0>{}) = dst_v;
|
||||
#endif
|
||||
}
|
||||
|
||||
const bool is_dst_valid =
|
||||
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
|
||||
|
||||
// copy data from dst_vector into dst_buf
|
||||
if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
|
||||
{
|
||||
dst_buf.template Set<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
|
||||
{
|
||||
dst_buf.template AtomicAdd<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
|
||||
{
|
||||
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
|
||||
tmp.template AsType<dst_vector_t>()(Number<0>{}) =
|
||||
dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
|
||||
|
||||
static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
|
||||
dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
|
||||
});
|
||||
|
||||
dst_buf.template Set<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
|
||||
constexpr auto move_on_dim = [&]() constexpr
|
||||
{
|
||||
StaticallyIndexedArray<bool, nDim> move_on_dim_;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
|
||||
|
||||
static_for<i + 1, nDim, 1>{}([&](auto j) {
|
||||
move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
|
||||
});
|
||||
});
|
||||
|
||||
return move_on_dim_;
|
||||
}
|
||||
();
|
||||
|
||||
// move
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
if constexpr(move_on_dim[i])
|
||||
{
|
||||
if constexpr(forward_sweep[i])
|
||||
{
|
||||
move_tensor_coordinate(
|
||||
dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
|
||||
|
||||
// dst0
|
||||
move_tensor_coordinate(
|
||||
dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]);
|
||||
|
||||
// dst1
|
||||
move_tensor_coordinate(
|
||||
dst1_desc, dst1_coord_, dst1_forward_steps[dim_access_order[i]]);
|
||||
}
|
||||
else
|
||||
{
|
||||
move_tensor_coordinate(
|
||||
dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
|
||||
|
||||
// dst0
|
||||
move_tensor_coordinate(
|
||||
dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]);
|
||||
|
||||
// dst1
|
||||
move_tensor_coordinate(
|
||||
dst1_desc, dst1_coord_, dst1_backward_steps[dim_access_order[i]]);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// move dst coordinate back to slice origin (or not)
|
||||
if constexpr(DstResetCoordinateAfterRun)
|
||||
{
|
||||
const auto dst_reset_step =
|
||||
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
|
||||
|
||||
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename SrcSliceOriginIdx,
|
||||
typename SrcBuffer,
|
||||
typename DstBuffer,
|
||||
typename Dst0Buffer,
|
||||
typename Dst1Buffer>
|
||||
__device__ void Run(const SrcDesc&,
|
||||
const SrcSliceOriginIdx&,
|
||||
const SrcBuffer& src_buf,
|
||||
const DstDesc& dst_desc,
|
||||
DstBuffer& dst_buf,
|
||||
const Dst0Desc& dst0_desc,
|
||||
const Dst0Buffer& dst0_buf,
|
||||
const Dst1Desc& dst1_desc,
|
||||
const Dst1Buffer& dst1_buf)
|
||||
{
|
||||
auto f_step_hacks = [&](auto desc) {
|
||||
constexpr index_t ntransform = decltype(desc)::GetNumOfTransform();
|
||||
|
||||
constexpr auto zeros = typename uniform_sequence_gen<ntransform, 0>::type{};
|
||||
|
||||
constexpr auto step_hacks =
|
||||
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
|
||||
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
|
||||
|
||||
return step_hacks;
|
||||
};
|
||||
|
||||
Run(SrcDesc{},
|
||||
SrcSliceOriginIdx{},
|
||||
src_buf,
|
||||
dst_desc,
|
||||
dst_buf,
|
||||
f_step_hacks(dst_desc),
|
||||
dst0_desc,
|
||||
dst0_buf,
|
||||
f_step_hacks(dst0_desc),
|
||||
dst1_desc,
|
||||
dst1_buf,
|
||||
f_step_hacks(dst1_desc));
|
||||
}
|
||||
|
||||
__device__ static constexpr auto GetDstCoordinateResetStep()
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
|
||||
// scalar per access on each dim
|
||||
// TODO: don't use lambda_scalar_per_access
|
||||
constexpr auto dst_scalar_per_access = generate_sequence(
|
||||
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
|
||||
|
||||
constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
|
||||
|
||||
constexpr auto dim_access_order = DimAccessOrder{};
|
||||
|
||||
constexpr auto ordered_access_lengths =
|
||||
container_reorder_given_new2old(access_lengths, dim_access_order);
|
||||
|
||||
// judge move forward or move backward during the last iteration
|
||||
constexpr auto forward_sweep = [&]() {
|
||||
StaticallyIndexedArray<bool, nDim> forward_sweep_;
|
||||
|
||||
forward_sweep_(I0) = true;
|
||||
|
||||
static_for<1, nDim, 1>{}([&](auto i) {
|
||||
index_t tmp = ordered_access_lengths[I0] - 1;
|
||||
|
||||
static_for<1, i, 1>{}([&](auto j) {
|
||||
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
|
||||
});
|
||||
|
||||
forward_sweep_(i) = tmp % 2 == 0;
|
||||
});
|
||||
|
||||
return forward_sweep_;
|
||||
}();
|
||||
|
||||
// calculate dst data index after last iteration in Run(), if it has not being reset by
|
||||
// RunWrite()
|
||||
constexpr auto dst_data_idx = [&]() {
|
||||
Index ordered_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
|
||||
});
|
||||
|
||||
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
|
||||
dst_scalar_per_access;
|
||||
}();
|
||||
|
||||
//
|
||||
constexpr auto reset_dst_data_step = [&]() {
|
||||
Index reset_dst_data_step_;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
|
||||
|
||||
return reset_dst_data_step_;
|
||||
}();
|
||||
|
||||
return reset_dst_data_step;
|
||||
}
|
||||
|
||||
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
|
||||
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
|
||||
const Index& dst_slice_origin_step_idx)
|
||||
{
|
||||
// if dst coord was not reset by Run(), then need to adjust the step here
|
||||
const auto adjusted_step_idx =
|
||||
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
|
||||
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
|
||||
|
||||
// is it OK to construct a new step every time?
|
||||
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
|
||||
|
||||
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
|
||||
}
|
||||
|
||||
private:
|
||||
DstCoord dst_coord_;
|
||||
Dst0Coord dst0_coord_;
|
||||
Dst1Coord dst1_coord_;
|
||||
const DstElementwiseOperation dst_element_op_;
|
||||
}; // namespace ck
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -1,453 +0,0 @@
|
||||
#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP
|
||||
#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V1R5_HPP
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
|
||||
// and sometimes useless instructions:
|
||||
// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
|
||||
// instead
|
||||
// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
|
||||
// tensor coordinate instead
|
||||
// 3. Don't use a pointer to VGPR buffer, use vector instead
|
||||
|
||||
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
|
||||
// TODO: fix this
|
||||
// Assume:
|
||||
// 1. src:
|
||||
// 1. SrcDesc is known at compile-time
|
||||
// 2. SrcBuffer is StaticBuffer
|
||||
// 3. SrcSliceOrginIdx is known at compile-time
|
||||
// 2. dst:
|
||||
// 1. DstDesc is not known at compile-time
|
||||
// 2. DstBuffer is DynamicBuffer
|
||||
// 3. DstSliceOrginIdx is not known at compile time
|
||||
template <typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
typename DstDesc,
|
||||
typename Dst0Desc, // this is really one of sources, but it has same shape as DstDesc
|
||||
typename DstElementwiseOperation,
|
||||
typename SliceLengths,
|
||||
typename DimAccessOrder,
|
||||
index_t DstVectorDim,
|
||||
index_t DstScalarPerVector,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
index_t DstScalarStrideInVector,
|
||||
bool DstResetCoordinateAfterRun,
|
||||
typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
|
||||
struct ThreadwiseTensorSliceTransfer_v1r5
|
||||
{
|
||||
static constexpr index_t nDim = SliceLengths::Size();
|
||||
|
||||
using Index = MultiIndex<nDim>;
|
||||
|
||||
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
|
||||
using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{}));
|
||||
|
||||
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
|
||||
using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{}));
|
||||
|
||||
__device__ constexpr ThreadwiseTensorSliceTransfer_v1r5(
|
||||
const DstDesc& dst_desc,
|
||||
const Dst0Desc& dst0_desc,
|
||||
const Index& dst_slice_origin_idx,
|
||||
const DstElementwiseOperation& dst_element_op)
|
||||
: dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)),
|
||||
dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin_idx)),
|
||||
dst_element_op_{dst_element_op}
|
||||
{
|
||||
static_assert(SrcDesc::IsKnownAtCompileTime(),
|
||||
"wrong! SrcDesc need to known at compile-time");
|
||||
}
|
||||
|
||||
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
|
||||
{
|
||||
dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
|
||||
}
|
||||
|
||||
template <typename SrcSliceOriginIdx,
|
||||
typename SrcBuffer,
|
||||
typename DstBuffer,
|
||||
typename Dst0Buffer,
|
||||
typename DstStepHacks,
|
||||
typename Dst0StepHacks>
|
||||
__device__ void Run(const SrcDesc&,
|
||||
const SrcSliceOriginIdx&,
|
||||
const SrcBuffer& src_buf,
|
||||
const DstDesc& dst_desc,
|
||||
DstBuffer& dst_buf,
|
||||
const DstStepHacks& dst_step_hacks,
|
||||
const Dst0Desc& dst0_desc,
|
||||
const Dst0Buffer& dst0_buf,
|
||||
const Dst0StepHacks& dst0_step_hacks)
|
||||
{
|
||||
static_assert(SrcDesc::IsKnownAtCompileTime(),
|
||||
"wrong! SrcDesc need to known at compile-time");
|
||||
|
||||
static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value,
|
||||
"wrong! SrcSliceOrigin need to known at compile-time");
|
||||
|
||||
static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
|
||||
|
||||
// SrcDesc and src_slice_origin_idx are known at compile-time
|
||||
constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
|
||||
constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
// scalar per access on each dim
|
||||
// TODO: don't use lambda_scalar_per_access
|
||||
constexpr auto dst_scalar_per_access = generate_sequence(
|
||||
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
|
||||
|
||||
constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
|
||||
|
||||
constexpr auto dim_access_order = DimAccessOrder{};
|
||||
|
||||
constexpr auto ordered_access_lengths =
|
||||
container_reorder_given_new2old(access_lengths, dim_access_order);
|
||||
|
||||
// make forward steps: dst
|
||||
const auto dst_forward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index forward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make forward steps: dst0
|
||||
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
|
||||
// TODO: fix this
|
||||
const auto dst0_forward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index forward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst0_desc, forward_step_idx, dst0_step_hacks[I0][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make backward steps: dst
|
||||
const auto dst_backward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index backward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// make backward steps: dst0
|
||||
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
|
||||
// TODO: fix this
|
||||
const auto dst0_backward_steps = generate_tuple(
|
||||
[&](auto i) {
|
||||
Index backward_step_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto j) {
|
||||
backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
|
||||
});
|
||||
|
||||
return make_tensor_coordinate_step(
|
||||
dst0_desc, backward_step_idx, dst0_step_hacks[I1][i]);
|
||||
},
|
||||
Number<nDim>{});
|
||||
|
||||
// loop over tensor and copy
|
||||
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
|
||||
// judge move forward or move backward
|
||||
constexpr auto forward_sweep = [&]() {
|
||||
StaticallyIndexedArray<bool, nDim> forward_sweep_;
|
||||
|
||||
forward_sweep_(I0) = true;
|
||||
|
||||
static_for<1, nDim, 1>{}([&](auto i) {
|
||||
index_t tmp = ordered_access_idx[I0];
|
||||
|
||||
static_for<1, i, 1>{}([&](auto j) {
|
||||
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
|
||||
});
|
||||
|
||||
forward_sweep_(i) = tmp % 2 == 0;
|
||||
});
|
||||
|
||||
return forward_sweep_;
|
||||
}();
|
||||
|
||||
// calculate dst data index
|
||||
constexpr auto dst_data_idx = [&]() {
|
||||
Index ordered_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
ordered_idx(i) = forward_sweep[i]
|
||||
? ordered_access_idx[i]
|
||||
: ordered_access_lengths[i] - 1 - ordered_access_idx[i];
|
||||
});
|
||||
|
||||
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
|
||||
dst_scalar_per_access;
|
||||
}();
|
||||
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
|
||||
|
||||
using dst_vector_t =
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
|
||||
|
||||
// load dst0 and apply elementwise operation
|
||||
{
|
||||
// WARNING!!!!!!: this logic is only correct if DstScalarPerVector=1
|
||||
// TODO: fix this
|
||||
static_assert(DstScalarPerVector == 1, "wrong!");
|
||||
|
||||
// copy data from src_buf into dst_vector_src_data
|
||||
constexpr index_t src_offset =
|
||||
src_desc.CalculateOffset(src_slice_origin_idx + dst_data_idx);
|
||||
|
||||
const SrcData src_v = src_buf[Number<src_offset>{}];
|
||||
|
||||
// load dst0
|
||||
const bool is_dst0_valid =
|
||||
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst0_desc,
|
||||
dst0_coord_);
|
||||
const DstData dst0_v =
|
||||
dst0_buf.template Get<DstData>(dst0_coord_.GetOffset(), is_dst0_valid);
|
||||
|
||||
#if !CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE
|
||||
// apply element-wise operation in SrcData type
|
||||
const SrcData dst_v = dst_element_op_(src_v, type_convert<SrcData>(dst0_v));
|
||||
|
||||
// apply type convert
|
||||
dst_vector.template AsType<DstData>()(Number<0>{}) = type_convert<DstData>(dst_v);
|
||||
#else
|
||||
// apply element-wise operation in DstData type
|
||||
const DstData dst_v = dst_element_op_(src_v, dst0_v);
|
||||
|
||||
dst_vector.template AsType<DstData>()(Number<0>{}) = dst_v;
|
||||
#endif
|
||||
}
|
||||
|
||||
const bool is_dst_valid =
|
||||
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
|
||||
|
||||
// copy data from dst_vector into dst_buf
|
||||
if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
|
||||
{
|
||||
dst_buf.template Set<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
|
||||
{
|
||||
dst_buf.template AtomicAdd<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
|
||||
{
|
||||
|
||||
typename vector_type_maker<DstData, DstScalarPerVector>::type tmp;
|
||||
tmp.template AsType<dst_vector_t>()(Number<0>{}) =
|
||||
dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
|
||||
|
||||
static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
|
||||
dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
|
||||
});
|
||||
|
||||
dst_buf.template Set<dst_vector_t>(
|
||||
dst_coord_.GetOffset(),
|
||||
is_dst_valid,
|
||||
dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
|
||||
}
|
||||
|
||||
constexpr auto move_on_dim = [&]() constexpr
|
||||
{
|
||||
StaticallyIndexedArray<bool, nDim> move_on_dim_;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
|
||||
|
||||
static_for<i + 1, nDim, 1>{}([&](auto j) {
|
||||
move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
|
||||
});
|
||||
});
|
||||
|
||||
return move_on_dim_;
|
||||
}
|
||||
();
|
||||
|
||||
// move
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
if constexpr(move_on_dim[i])
|
||||
{
|
||||
if constexpr(forward_sweep[i])
|
||||
{
|
||||
move_tensor_coordinate(
|
||||
dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
|
||||
|
||||
// dst0
|
||||
move_tensor_coordinate(
|
||||
dst0_desc, dst0_coord_, dst0_forward_steps[dim_access_order[i]]);
|
||||
}
|
||||
else
|
||||
{
|
||||
move_tensor_coordinate(
|
||||
dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
|
||||
|
||||
// dst0
|
||||
move_tensor_coordinate(
|
||||
dst0_desc, dst0_coord_, dst0_backward_steps[dim_access_order[i]]);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// move dst coordinate back to slice origin (or not)
|
||||
if constexpr(DstResetCoordinateAfterRun)
|
||||
{
|
||||
const auto dst_reset_step =
|
||||
make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep());
|
||||
|
||||
move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename SrcSliceOriginIdx,
|
||||
typename SrcBuffer,
|
||||
typename DstBuffer,
|
||||
typename Dst0Buffer>
|
||||
__device__ void Run(const SrcDesc&,
|
||||
const SrcSliceOriginIdx&,
|
||||
const SrcBuffer& src_buf,
|
||||
const DstDesc& dst_desc,
|
||||
DstBuffer& dst_buf,
|
||||
const Dst0Desc& dst0_desc,
|
||||
const Dst0Buffer& dst0_buf)
|
||||
{
|
||||
auto f_step_hacks = [&](auto desc) {
|
||||
constexpr index_t ntransform = decltype(desc)::GetNumOfTransform();
|
||||
|
||||
constexpr auto zeros = typename uniform_sequence_gen<ntransform, 0>::type{};
|
||||
|
||||
constexpr auto step_hacks =
|
||||
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
|
||||
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
|
||||
|
||||
return step_hacks;
|
||||
};
|
||||
|
||||
Run(SrcDesc{},
|
||||
SrcSliceOriginIdx{},
|
||||
src_buf,
|
||||
dst_desc,
|
||||
dst_buf,
|
||||
f_step_hacks(dst_desc),
|
||||
dst0_desc,
|
||||
dst0_buf,
|
||||
f_step_hacks(dst0_desc));
|
||||
}
|
||||
|
||||
__device__ static constexpr auto GetDstCoordinateResetStep()
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
|
||||
// scalar per access on each dim
|
||||
// TODO: don't use lambda_scalar_per_access
|
||||
constexpr auto dst_scalar_per_access = generate_sequence(
|
||||
detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
|
||||
|
||||
constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access;
|
||||
|
||||
constexpr auto dim_access_order = DimAccessOrder{};
|
||||
|
||||
constexpr auto ordered_access_lengths =
|
||||
container_reorder_given_new2old(access_lengths, dim_access_order);
|
||||
|
||||
// judge move forward or move backward during the last iteration
|
||||
constexpr auto forward_sweep = [&]() {
|
||||
StaticallyIndexedArray<bool, nDim> forward_sweep_;
|
||||
|
||||
forward_sweep_(I0) = true;
|
||||
|
||||
static_for<1, nDim, 1>{}([&](auto i) {
|
||||
index_t tmp = ordered_access_lengths[I0] - 1;
|
||||
|
||||
static_for<1, i, 1>{}([&](auto j) {
|
||||
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
|
||||
});
|
||||
|
||||
forward_sweep_(i) = tmp % 2 == 0;
|
||||
});
|
||||
|
||||
return forward_sweep_;
|
||||
}();
|
||||
|
||||
// calculate dst data index after last iteration in Run(), if it has not being reset by
|
||||
// RunWrite()
|
||||
constexpr auto dst_data_idx = [&]() {
|
||||
Index ordered_idx;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) {
|
||||
ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
|
||||
});
|
||||
|
||||
return container_reorder_given_old2new(ordered_idx, dim_access_order) *
|
||||
dst_scalar_per_access;
|
||||
}();
|
||||
|
||||
//
|
||||
constexpr auto reset_dst_data_step = [&]() {
|
||||
Index reset_dst_data_step_;
|
||||
|
||||
static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
|
||||
|
||||
return reset_dst_data_step_;
|
||||
}();
|
||||
|
||||
return reset_dst_data_step;
|
||||
}
|
||||
|
||||
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
|
||||
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
|
||||
const Index& dst_slice_origin_step_idx)
|
||||
{
|
||||
// if dst coord was not reset by Run(), then need to adjust the step here
|
||||
const auto adjusted_step_idx =
|
||||
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
|
||||
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
|
||||
|
||||
// is it OK to construct a new step every time?
|
||||
const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
|
||||
|
||||
move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
|
||||
}
|
||||
|
||||
private:
|
||||
DstCoord dst_coord_;
|
||||
Dst0Coord dst0_coord_;
|
||||
const DstElementwiseOperation dst_element_op_;
|
||||
}; // namespace ck
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst
|
||||
template <typename SliceLengths,
|
||||
typename SrcElementwiseOperation,
|
||||
typename DstElementwiseOperation,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
@@ -110,8 +110,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
|
||||
const SrcBuffer& src_buf,
|
||||
Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
|
||||
{
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -271,7 +271,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1
|
||||
static_ford<SliceLengths>{}([&](auto idx) {
|
||||
// convert from SrcData to DstData here
|
||||
dst_thread_scratch_(idx) =
|
||||
type_convert<DstData>(src_thread_scratch_tuple[thread_scratch_id][idx]);
|
||||
type_convert<DstData>(src_thread_scratch_tuple_[thread_scratch_id][idx]);
|
||||
});
|
||||
#else
|
||||
// sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
|
||||
@@ -361,8 +361,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1
|
||||
// TODO move this elsewhere
|
||||
TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id);
|
||||
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -763,13 +763,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
|
||||
static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
|
||||
static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
|
||||
|
||||
using SrcThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
|
||||
using SrcThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
|
||||
SrcData,
|
||||
SrcScalarPerVector,
|
||||
decltype(src_thread_scratch_desc_),
|
||||
true>;
|
||||
|
||||
using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
|
||||
using DstThreadScratch = StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
|
||||
DstData,
|
||||
DstScalarPerVector,
|
||||
decltype(dst_thread_scratch_desc_),
|
||||
|
||||
@@ -48,7 +48,7 @@ struct lambda_scalar_per_access_for_src_and_dst
|
||||
template <typename SliceLengths,
|
||||
typename SrcElementwiseOperation,
|
||||
typename DstElementwiseOperation,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
@@ -120,8 +120,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3
|
||||
template <typename SrcBuffer>
|
||||
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
|
||||
{
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -369,8 +369,8 @@ struct ThreadwiseTensorSliceTransfer_v3r3
|
||||
// TODO move this elsewhere
|
||||
TransferDataFromSrcThreadScratchToDstThreadScratch();
|
||||
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -859,14 +859,14 @@ struct ThreadwiseTensorSliceTransfer_v3r3
|
||||
static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){};
|
||||
static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){};
|
||||
|
||||
StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
|
||||
StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
|
||||
SrcData,
|
||||
SrcScalarPerVector,
|
||||
decltype(src_thread_scratch_desc_),
|
||||
true>
|
||||
src_thread_scratch_;
|
||||
|
||||
StaticTensorTupleOfVectorBuffer<AddressSpaceEnum_t::Vgpr,
|
||||
StaticTensorTupleOfVectorBuffer<AddressSpaceEnum::Vgpr,
|
||||
DstData,
|
||||
DstScalarPerVector,
|
||||
decltype(dst_thread_scratch_desc_),
|
||||
|
||||
@@ -13,7 +13,7 @@ namespace ck {
|
||||
// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
|
||||
// 4. Use thread buffer
|
||||
template <typename SliceLengths,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
typename SrcData,
|
||||
typename DstData,
|
||||
typename SrcDesc,
|
||||
@@ -76,8 +76,8 @@ struct ThreadwiseTensorSliceTransfer_v5r1
|
||||
__device__ void
|
||||
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
|
||||
{
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -244,8 +244,8 @@ struct ThreadwiseTensorSliceTransfer_v5r1
|
||||
__device__ void
|
||||
RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
|
||||
{
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds,
|
||||
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
|
||||
DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
|
||||
"wrong!");
|
||||
|
||||
static_assert(
|
||||
@@ -602,7 +602,7 @@ struct ThreadwiseTensorSliceTransfer_v5r1
|
||||
|
||||
static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
|
||||
|
||||
StaticBuffer<AddressSpaceEnum_t::Vgpr, SrcData, buffer_size_, true> buffer_;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
|
||||
|
||||
SrcCoord src_coord_;
|
||||
DstCoord dst_coord_;
|
||||
|
||||
@@ -29,7 +29,7 @@ template <typename SrcData,
|
||||
typename DimAccessOrder,
|
||||
index_t VectorDim,
|
||||
index_t ScalarPerVector,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
bool SrcResetCoordinateAfterRun,
|
||||
bool DstResetCoordinateAfterRun>
|
||||
struct ThreadwiseTensorSliceTransfer_v6r1
|
||||
|
||||
@@ -31,7 +31,7 @@ template <typename Src0Data,
|
||||
typename DimAccessOrder,
|
||||
index_t VectorDim,
|
||||
index_t ScalarPerVector,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
bool Src0ResetCoordinateAfterRun,
|
||||
bool Src1ResetCoordinateAfterRun,
|
||||
bool DstResetCoordinateAfterRun>
|
||||
|
||||
@@ -33,7 +33,7 @@ template <typename Src0Data,
|
||||
typename DimAccessOrder,
|
||||
index_t VectorDim,
|
||||
index_t ScalarPerVector,
|
||||
InMemoryDataOperationEnum_t DstInMemOp,
|
||||
InMemoryDataOperationEnum DstInMemOp,
|
||||
bool Src0ResetCoordinateAfterRun,
|
||||
bool Src1ResetCoordinateAfterRun,
|
||||
bool Src2ResetCoordinateAfterRun,
|
||||
|
||||
Reference in New Issue
Block a user