Files
composable_kernel/include/ck/utility/static_buffer.hpp
Jianfeng Yan 40b59a63cc Navi21 gemm (#197)
* start adding navi21 GEMM

* navi_gemm_km_kn_mn_fp32 compiles and passes one test.

* rename variables and functions in gridwise_gemm_dlops_v1r3

* add other 3 layouts; format instance

* adding more tuning parameters

add tuning parameters for other 3 layouts

* add gemm_dlops_f16

* tmp

* add dependence of DeviceGemm::IsSupportedArg() on arch

* minor changes

* minor changes

* minor changes

* minor changes

* minor changes

* minor changes

* minor changes

* push gemm_dlops into profiler

* minor changes

* if using xdl or dlops is moved into profiler_gemm_impl

* minor changes

* minor changes

* remove is_xdl from profile_gemm_impl

* make IsSupportedArg dependent on arch for other device_gemm

* minor changes

* minor changes

* fix a bug in f_generate_tensor_value

* add 64x64x64 for gemm_dlops_int8

* add 64x64x64 for gemm_dlops_int8

* comment out 3 layouts in gemm_dlops_int8; add 32x32x32 for gemm_dlops_int8; init A values to 1

* fix

* start fixing tuning parameters

* minor

* minor changes

* minor changes

* minor changes

* fixing

* adding example

* adding example

* adding example

* add gemm fp32 example

* clean up

* use 128x128x16 as MNK tile in navi21 gemm example

* bug fix

* fix test

* use new block c tile

* clean

* fix build

Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: shaojiewang <wsjmessi@163.com>
2022-05-24 12:19:27 -05:00

174 lines
5.4 KiB
C++

#ifndef CK_STATIC_BUFFER_HPP
#define CK_STATIC_BUFFER_HPP
#include "statically_indexed_array.hpp"
namespace ck {
// static buffer for scalar
//
// A StaticallyIndexedArray<T, N> tagged with an address space. Elements are
// addressed exclusively with Number<I> indices; the accessors below simply
// forward to the base-class accessors of the same name.
template <AddressSpaceEnum AddressSpace,
          typename T,
          index_t N,
          bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed
struct StaticBuffer : public StaticallyIndexedArray<T, N>
{
    using type = T;
    using base = StaticallyIndexedArray<T, N>;

    // Value-initializes the underlying array.
    __host__ __device__ constexpr StaticBuffer() : base{} {}

    // Compile-time queries describing this buffer kind.
    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }

    __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }

    // Element I, read-only.
    template <index_t I>
    __host__ __device__ constexpr const T& operator[](Number<I> i) const
    {
        return this->base::operator[](i);
    }

    // Element I, writable.
    template <index_t I>
    __host__ __device__ constexpr T& operator()(Number<I> i)
    {
        return this->base::operator()(i);
    }

    // Assigns T{0} to each of the N elements, one index at a time.
    __host__ __device__ void Clear()
    {
        static_for<0, N, 1>{}([&](auto idx) { (*this)(idx) = T{0}; });
    }
};
// static buffer for vector
//
// Stores NumOfVector elements of vector_type<S, ScalarPerVector> in a
// StaticallyIndexedArray and exposes them through scalar-offset indexing.
// Every index taken by the accessors below is an offset counted in S and is
// split, at compile time, into
//   vector index         = i / ScalarPerVector
//   offset inside vector = i % ScalarPerVector
// The struct is only enabled when is_scalar_type<S>::value is true.
template <AddressSpaceEnum AddressSpace,
          typename S,
          index_t NumOfVector,
          index_t ScalarPerVector,
          bool InvalidElementUseNumericalZeroValue, // TODO remove this bool, no longer needed,
          typename enable_if<is_scalar_type<S>::value, bool>::type = false>
struct StaticBufferTupleOfVector
    : public StaticallyIndexedArray<vector_type<S, ScalarPerVector>, NumOfVector>
{
    using V    = typename vector_type<S, ScalarPerVector>::type;
    using base = StaticallyIndexedArray<vector_type<S, ScalarPerVector>, NumOfVector>;

    // Number<> forms of the template sizes, used for compile-time index arithmetic.
    static constexpr auto s_per_v   = Number<ScalarPerVector>{};
    static constexpr auto num_of_v_ = Number<NumOfVector>{};

    // Value-initializes the underlying array of vectors.
    __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {}

    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }

    __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }

    // Get S
    // i is offset of S
    template <index_t I>
    __host__ __device__ constexpr const S& operator[](Number<I> i) const
    {
        constexpr auto i_v = i / s_per_v; // which vector
        constexpr auto i_s = i % s_per_v; // which scalar inside that vector

        return base::operator[](i_v).template AsType<S>()[i_s];
    }

    // Set S
    // i is offset of S
    template <index_t I>
    __host__ __device__ constexpr S& operator()(Number<I> i)
    {
        constexpr auto i_v = i / s_per_v; // which vector
        constexpr auto i_s = i % s_per_v; // which scalar inside that vector

        return base::operator()(i_v).template AsType<S>()(i_s);
    }

    // Get X
    // i is offset of S, not X. i should be aligned to X
    // X must have the same scalar type as S (has_same_scalar_type<S, X>).
    // The result is returned by value.
    template <typename X,
              index_t I,
              typename enable_if<has_same_scalar_type<S, X>::value, bool>::type = false>
    __host__ __device__ constexpr auto GetAsType(Number<I> i) const
    {
        // size of X, taken from scalar_type<...>::vector_size
        constexpr auto s_per_x = Number<scalar_type<remove_cvref_t<X>>::vector_size>{};

        static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X");
        static_assert(i % s_per_x == 0, "wrong! i must be aligned to X");

        constexpr auto i_v = i / s_per_v;             // which vector
        constexpr auto i_x = (i % s_per_v) / s_per_x; // which X inside that vector

        return base::operator[](i_v).template AsType<X>()[i_x];
    }

    // Set X
    // i is offset of S, not X. i should be aligned to X
    // X must have the same scalar type as S (has_same_scalar_type<S, X>).
    template <typename X,
              index_t I,
              typename enable_if<has_same_scalar_type<S, X>::value, bool>::type = false>
    __host__ __device__ constexpr void SetAsType(Number<I> i, X x)
    {
        // size of X, taken from scalar_type<...>::vector_size
        constexpr auto s_per_x = Number<scalar_type<remove_cvref_t<X>>::vector_size>{};

        static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X");
        static_assert(i % s_per_x == 0, "wrong! i must be aligned to X");

        constexpr auto i_v = i / s_per_v;             // which vector
        constexpr auto i_x = (i % s_per_v) / s_per_x; // which X inside that vector

        base::operator()(i_v).template AsType<X>()(i_x) = x;
    }

    // Get read access to vector_type V
    // i is offset of S, not V. i should be aligned to V
    template <index_t I>
    __host__ __device__ constexpr const auto& GetVectorTypeReference(Number<I> i) const
    {
        static_assert(i % s_per_v == 0, "wrong! i must be aligned to V");

        constexpr auto i_v = i / s_per_v;

        return base::operator[](i_v);
    }

    // Get write access to vector_type V
    // i is offset of S, not V. i should be aligned to V
    template <index_t I>
    __host__ __device__ constexpr auto& GetVectorTypeReference(Number<I> i)
    {
        static_assert(i % s_per_v == 0, "wrong! i must be aligned to V");

        constexpr auto i_v = i / s_per_v;

        return base::operator()(i_v);
    }

    // Writes S{0} to every scalar slot, one scalar offset at a time, through
    // SetAsType (X is deduced as S from the S{0} argument).
    __host__ __device__ void Clear()
    {
        constexpr index_t NumScalars = NumOfVector * ScalarPerVector;

        static_for<0, NumScalars, 1>{}([&](auto i) { SetAsType(i, S{0}); });
    }
};
// Creates a default-constructed StaticBuffer holding N elements of T in
// AddressSpace. The Number<N> argument is used only to deduce N; the buffer's
// InvalidElementUseNumericalZeroValue parameter is fixed to true.
template <AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
    using buffer_type = StaticBuffer<AddressSpace, T, N, true>;

    return buffer_type{};
}
// LongNumber overload: same as the Number<N> version, but N is deduced as a
// long_index_t. N is then passed as StaticBuffer's index_t size parameter, so
// the value is converted to index_t at that point.
template <AddressSpaceEnum AddressSpace, typename T, long_index_t N>
__host__ __device__ constexpr auto make_static_buffer(LongNumber<N>)
{
    using buffer_type = StaticBuffer<AddressSpace, T, N, true>;

    return buffer_type{};
}
} // namespace ck
#endif