Update to gemm_reduce and batched_gemm_reduce (#213)

* [Experimental] Change to gemm+reduce and batched-gemm+reduce

* Use threadwise-reduce function to improve the gridwise_gemm_reduce_xdl_cshuffle kernel

* Tiny fix in device_batched_gemm_xdl.hpp

* clang-format library/src/utility/conv_fwd_util.cpp
This commit is contained in:
Qianfeng
2022-04-30 00:35:25 +08:00
committed by GitHub
parent 97d8c5045e
commit c77ae65d40
18 changed files with 350 additions and 375 deletions

View File

@@ -5,20 +5,6 @@ namespace ck {
namespace tensor_operation {
namespace element_wise {
struct ReduceSum
{
__host__ __device__ static constexpr float GetReduceZeroValue() { return float(0); }
__host__ __device__ void Reduce(float& acc, float v) const { acc += v; }
};
struct ReduceSquareSum
{
__host__ __device__ static constexpr float GetReduceZeroValue() { return float(0); }
__host__ __device__ void Reduce(float& acc, float v) const { acc += v * v; }
};
} // namespace element_wise
} // namespace tensor_operation
} // namespace ck