Multi-kernel CGEMM (#230)

* Reference CGEMM + test stub

* Format.

* Incomplete simple implementation

* Library instances

* Sketch of tests

* Test fixes.

* Example added

* Cosmetics

* Add elementwise operation kernel and example

* Add comment

* Add template argument of dim. Prepare to support multiple dimensions

* Rename example

* Support 1 dimension

* Add static assert

* Add comment

* Second auxiliary buffer added

* Extract pad

* Remove redundant argument

* Support any dimension for elementwise operation

* Remove line

* Let it be a multiple of the number of CUs

* Move thread per block to the parameter of constructor

* Consuming binary ops to do A+B / A-B

* Fix + cosmetics + bf16 test commented out temporarily

* Format

* Enabling bf16 test

* Revert "Enabling bf16 test"

This reverts commit f497e2ba44.

* Fix + test reenabled

* fix build

* Revert "fix build"

This reverts commit d73102384b.

* post PR #235 merge fix

* amend

* Single workspace for cgemm + helper

* Perf calc fix

* Review remarks: static_cast

* Review remarks: binary ops templated

* Cleaning

* Removal of instances and their tests

* Review remarks from aosew addressed

* Review remark: unnecessary attribute

* Post-merge fixes

* Restrict 4gemm to PassThrough + bug fix

* Review remarks

* update licence

* change cgemm example to fp16

Co-authored-by: rocking <chunylai@amd.com>
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: Anthony Chang <ac.chang@outlook.com>
This commit is contained in:
myamlak
2022-05-31 17:20:55 +02:00
committed by GitHub
parent 85fc91c321
commit 7b1e2c379e
12 changed files with 1756 additions and 21 deletions

View File

@@ -1,3 +1,28 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#pragma once
#include "data_type.hpp"
@@ -5,14 +30,22 @@ namespace ck {
namespace tensor_operation {
namespace binary_element_wise {
struct Add
// Primary template of the element-wise addition functor.
// Y is the destination type; X1 and X2 are the two operand types.
// Declared only: each supported type combination provides its own explicit
// specialization, so an unsupported combination fails at compile time.
template <class Y, class X1, class X2> struct Add;
// Element-wise addition for double operands: dst = src1 + src2.
template <>
struct Add<double, double, double>
{
    // Stores the sum of src1 and src2 into dst.
    __host__ __device__ constexpr void
    operator()(double& dst, const double& src1, const double& src2) const
    {
        const auto sum = src1 + src2;
        dst            = sum;
    }
};
template <>
struct Add<float, float, float>
{
__host__ __device__ constexpr void
operator()(float& dst, const float& src1, const float& src2) const
{
@@ -20,6 +53,75 @@ struct Add
}
};
// Element-wise addition for half_t operands: dst = src1 + src2.
template <>
struct Add<half_t, half_t, half_t>
{
    // Stores the sum of src1 and src2 into dst, using half_t's own operator+.
    __host__ __device__ constexpr void
    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
    {
        const auto sum = src1 + src2;
        dst            = sum;
    }
};
// Element-wise addition for bhalf_t operands.
// The operands are not added directly: each is first converted to float, the
// sum is formed in float, and the result is converted back to bhalf_t.
template <>
struct Add<bhalf_t, bhalf_t, bhalf_t>
{
    // Stores bhalf_t(float(src1) + float(src2)) into dst.
    __host__ __device__ constexpr void
    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
    {
        const float lhs = ck::type_convert<float>(src1);
        const float rhs = ck::type_convert<float>(src2);

        dst = ck::type_convert<bhalf_t>(lhs + rhs);
    }
};
// Primary template of the element-wise subtraction functor.
// Y is the destination type; X1 and X2 are the minuend and subtrahend types.
// Declared only: each supported type combination provides its own explicit
// specialization, so an unsupported combination fails at compile time.
template <typename Y, typename X1, typename X2>
struct Substract;

// Correctly spelled alias for Substract. The original (misspelled) name is
// kept because existing callers and the explicit specializations depend on it.
// Alias templates cannot be specialized, so new specializations must still be
// written against Substract.
template <typename Y, typename X1, typename X2>
using Subtract = Substract<Y, X1, X2>;
// Element-wise subtraction for double operands: dst = src1 - src2.
template <>
struct Substract<double, double, double>
{
    // Stores the difference src1 - src2 into dst.
    __host__ __device__ constexpr void
    operator()(double& dst, const double& src1, const double& src2) const
    {
        const auto diff = src1 - src2;
        dst             = diff;
    }
};
// Element-wise subtraction for float operands: dst = src1 - src2.
template <>
struct Substract<float, float, float>
{
    // Stores the difference src1 - src2 into dst.
    __host__ __device__ constexpr void
    operator()(float& dst, const float& src1, const float& src2) const
    {
        const auto diff = src1 - src2;
        dst             = diff;
    }
};
// Element-wise subtraction for half_t operands: dst = src1 - src2.
template <>
struct Substract<half_t, half_t, half_t>
{
    // Stores the difference src1 - src2 into dst, using half_t's own operator-.
    __host__ __device__ constexpr void
    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
    {
        const auto diff = src1 - src2;
        dst             = diff;
    }
};
// Element-wise subtraction for bhalf_t operands.
// The operands are not subtracted directly: each is first converted to float,
// the difference is formed in float, and the result is converted back to
// bhalf_t.
template <>
struct Substract<bhalf_t, bhalf_t, bhalf_t>
{
    // Stores bhalf_t(float(src1) - float(src2)) into dst.
    __host__ __device__ constexpr void
    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
    {
        const float lhs = ck::type_convert<float>(src1);
        const float rhs = ck::type_convert<float>(src2);

        dst = ck::type_convert<bhalf_t>(lhs - rhs);
    }
};
} // namespace binary_element_wise
} // namespace tensor_operation
} // namespace ck