Standalone sweep once softmax kernel w/ ckProfiler (#295)

* use 'sweep once' softmax kernel where applicable

* threadwise copy's dst buffer can specify invalid element value

* add int8 in/out float compute softmax support

Give a bit of leeway for the integer absolute tolerance, since a single data point across all test cases shows an off-by-one error.

* format

* softmax inherits DeviceNormalization

* softmax profiler stub

* tighten up reference softmax interface

* example prints tensor dimension

* add fp32 to softmax profiler

* rename header

* hook with ckProfiler

* format

* resolve merge conflict

* resolve merge conflicts

* update normalization profiler help string

* resolve conflict

* typo

* remove residual

* softmax profiler: address feedback

* test for mixed precision input/output

* fully qualify ck::math::isnan

* add comment for device normalization interface

* revise wording

* constness for alpha/beta scaler pointer

[ROCm/composable_kernel commit: 93c99f3d87]
This commit is contained in:
Anthony Chang
2022-07-01 01:08:50 +08:00
committed by GitHub
parent ca34ce4450
commit d41b1a7c2c
24 changed files with 809 additions and 106 deletions

View File

@@ -222,6 +222,12 @@ struct Tensor
Tensor(const Tensor& other) : mDesc(other.mDesc), mData(other.mData) {}
// Copy-assignment operator: copies mDesc and mData from `other` into this
// object and returns a reference to this object so assignments can be chained.
//
// The explicit `return *this;` is required: the declared return type is
// Tensor&, and flowing off the end of a value-returning function is undefined
// behavior.
Tensor& operator=(const Tensor& other)
{
    mDesc = other.mDesc;
    mData = other.mData;
    return *this;
}
template <typename F>
void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
{

View File

@@ -26,12 +26,11 @@ struct ReferenceSoftmax : public device::BaseOperator
Tensor<OutDataType>& out,
AccDataType alpha,
AccDataType beta,
const index_t rank,
const std::vector<index_t> sm_reduce_dims)
: in_(in), out_(out), alpha_(alpha), beta_(beta), sm_reduce_dims_(sm_reduce_dims)
{
// std::cout << "debug: scalar dims: ";
for(int i = 0; i < rank; i++)
for(size_t i = 0; i < in.mDesc.GetNumOfDimension(); i++)
{
if(std::find(sm_reduce_dims.begin(), sm_reduce_dims.end(), i) ==
sm_reduce_dims.end())
@@ -47,7 +46,6 @@ struct ReferenceSoftmax : public device::BaseOperator
Tensor<OutDataType>& out_;
AccDataType alpha_;
AccDataType beta_;
index_t rank_;
std::vector<index_t> sm_reduce_dims_;
std::vector<index_t> sm_scalar_dims_; // dim after internal max/sum reduction
};
@@ -136,10 +134,9 @@ struct ReferenceSoftmax : public device::BaseOperator
Tensor<OutDataType>& out,
AccDataType alpha,
AccDataType beta,
const index_t rank,
const std::vector<index_t> sm_reduce_dims)
{
return Argument{in, out, alpha, beta, rank, sm_reduce_dims};
return Argument{in, out, alpha, beta, sm_reduce_dims};
}
// Factory helper: returns a new Invoker constructed with no arguments.
static auto MakeInvoker() { return Invoker{}; }

View File

@@ -4,6 +4,7 @@
#pragma once
#include <vector>
#include "ck/utility/functional2.hpp"
namespace ck {
namespace tensor_operation {

View File

@@ -159,7 +159,7 @@ check_err(const std::vector<T>& out,
const std::vector<T>& ref,
const std::string& msg = "Error: Incorrect results!",
double = 0,
double = 0)
double atol = 0)
{
if(out.size() != ref.size())
{
@@ -179,7 +179,7 @@ check_err(const std::vector<T>& out,
int64_t r = ref[i];
err = std::abs(o - r);
if(err > 0)
if(err > atol)
{
max_err = err > max_err ? err : max_err;
err_count++;