* Implement multiple-reduction in one kernel (kernels, device ops, examples)
* Add generic elementwise kernel and device interface
* Add generator for normal-distributed data initialization
* Add host reference implementations of batchnorm-forward and batchnorm-infer
* Add examples implementing batchnorm-forward and batchnorm-infer using the generic kernels
* Remove unneeded includes in the batchnorm example
* Rename generic_elementwise to elementwise in kernel and device classes/functions
* Change gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise
* Change example 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise
* Change device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise
* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp
* Remove and rename files
* Update the gemm_layernorm client example to the generic elementwise device op API
* Update to the latest headers directory and the HostTensorDescriptor interface renaming
* Merge two static member functions in device_elementwise.hpp
* Remove unary_elementwise_1d kernel and device
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

#include "ck/ck.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"

#include "batchnorm_common.hpp"

template <typename InOutDataType,
          typename AccDataType,
          ck::index_t Rank,
          ck::index_t NumBatchNormReduceDim,
          bool fastest_dim_is_reduced = false>
int bnorm_infer(bool time_kernel,
                const std::array<int, NumBatchNormReduceDim> reduceDims,
                const std::array<ck::index_t, Rank> xyLengths,
                const std::array<ck::index_t, Rank> xStrides,
                const std::array<ck::index_t, Rank> yStrides,
                const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
                const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
                const void* p_x,
                const void* p_scale,
                const void* p_bias,
                double epsilon,
                const void* p_estimatedMean,
                const void* p_estimatedVariance,
                void* p_y)
{
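    // bnScaleBiasMeanVarLengths is only consumed by the assert below, so cast it
    // to void to avoid an unused-parameter warning in NDEBUG builds.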
    (void)bnScaleBiasMeanVarLengths;

    static_assert(NumBatchNormReduceDim < Rank,
                  "Invalid number of reduced dimensions for batchnorm!");

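    // Inference-mode batchnorm computes, per element,
    //     y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
    // using the running mean/variance estimates; NormalizeInInfer (from
    // batchnorm_common.hpp) is the functor that carries this computation into
    // the 5-input, 1-output elementwise kernel configured below.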
    using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
                                                                                      // variance,
                                                                                      // scale,
                                                                                      // bias
        ck::Tuple<InOutDataType>,    // y
        NormalizeInInfer,
        Rank,
        2,                           // MPerThread
        ck::Sequence<1, 1, 1, 1, 1>, // scalarPerVector: x, mean, variance, scale, bias
        ck::Sequence<1>>;            // scalarPerVector: y

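    // scale/bias/mean/variance only carry the invariant (non-reduced) dimensions.
    // Expand their strides to Rank entries: each given stride goes to its
    // invariant position, and reduced dimensions keep stride 0 so the elementwise
    // kernel broadcasts these tensors along them.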
    auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
    std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};

    int i = 0;
    for(auto dim : invariantDims)
    {
        assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);

        aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
        i++;
    }

    int32_t reduceLength = 1;

    for(auto dim : reduceDims)
        reduceLength *= xyLengths[dim];

    int32_t invariantLength = 1;

    for(auto dim : invariantDims)
        invariantLength *= xyLengths[dim];

    std::size_t total_length = static_cast<std::size_t>(invariantLength) * reduceLength;

    float avg_time       = 0.0f;
    std::size_t num_bytes = 0;

    auto dev_normalize = DeviceNormalizeInstance{};

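    // The input order below must match the input ck::Tuple of
    // DeviceNormalizeInstance: (x, mean, variance, scale, bias) -> (y).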
    auto argument_ptr1 = dev_normalize.MakeArgumentPointer(
        xyLengths,
        {xStrides,
         aligned_scaleBiasMeanVarStrides,
         aligned_scaleBiasMeanVarStrides,
         aligned_scaleBiasMeanVarStrides,
         aligned_scaleBiasMeanVarStrides},
        {yStrides},
        {p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias},
        {p_y},
        NormalizeInInfer{epsilon});

    if(!dev_normalize.IsSupportedArgument(argument_ptr1.get()))
    {
        std::cout << "The runtime parameters are not supported by the device instance, exiting!"
                  << std::endl;

        return (-1);
    }

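    // With StreamConfig{nullptr, time_kernel}, the invoker launches on the
    // default stream and, when time_kernel is set, returns the measured kernel
    // time in ms (otherwise 0).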
    auto invoker_ptr1 = dev_normalize.MakeInvokerPointer();

    avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});

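    // Estimated memory traffic: every output element reads x plus the four
    // broadcast AccDataType tensors (mean, variance, scale, bias) and writes y once.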
    num_bytes += (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
                  total_length * sizeof(InOutDataType));

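    // avg_time is in ms, so num_bytes / 1.E6 / avg_time comes out in GB/s.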
    if(time_kernel)
    {
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
    }

    return (0);
}
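
// A minimal usage sketch (illustration only, not part of the example itself):
// batchnorm inference over a hypothetical NHWC fp16 tensor, reducing over dims
// {0, 1, 2} (N, H, W) with scale/bias/mean/variance living on the channel dim.
// The p_* device buffers are assumed to be allocated and initialized elsewhere.
//
//     std::array<ck::index_t, 4> xyLengths{128, 16, 16, 256};
//     std::array<ck::index_t, 4> xyStrides{16 * 16 * 256, 16 * 256, 256, 1};
//     std::array<ck::index_t, 1> scaleBiasMeanVarLengths{256};
//     std::array<ck::index_t, 1> scaleBiasMeanVarStrides{1};
//     std::array<int, 3> reduceDims{0, 1, 2};
//
//     bnorm_infer<ck::half_t, float, 4, 3>(true,
//                                          reduceDims,
//                                          xyLengths,
//                                          xyStrides, // xStrides
//                                          xyStrides, // yStrides
//                                          scaleBiasMeanVarLengths,
//                                          scaleBiasMeanVarStrides,
//                                          p_x,
//                                          p_scale,
//                                          p_bias,
//                                          1e-5, // epsilon
//                                          p_estimatedMean,
//                                          p_estimatedVariance,
//                                          p_y);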