diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp index 153cd92ff0..9339501206 100644 --- a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp +++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp @@ -860,5 +860,116 @@ inline void naive_conv_fwd(const TIn* p_in, stream); } +// Batch normalization + clamp kernel (to be run after convolution for bias_bnorm tests) +template +__global__ void naive_batchnorm_clamp_infer_kernel(DataType* __restrict__ p_out, + const DataType* __restrict__ p_in, + const DataType* __restrict__ p_mean, + const DataType* __restrict__ p_variance, + const DataType* __restrict__ p_scale, + const DataType* __restrict__ p_shift, + const index_t* __restrict__ param_strides, + const index_t* __restrict__ tensor_strides, + const index_t* __restrict__ lengths, + int num_dims, + long_index_t total_elements, + float epsilon, + float floor, + float ceil) +{ + const long_index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const long_index_t num_threads = blockDim.x * gridDim.x; + + for(long_index_t idx = tid; idx < total_elements; idx += num_threads) + { + // Extract dimensions from linear index + long_index_t remaining = idx; + long_index_t param_idx = 0; + long_index_t tensor_idx = 0; + + // Extract coordinates and compute indices + for(int dim = num_dims - 1; dim >= 0; --dim) + { + index_t coord = remaining % lengths[dim]; + remaining /= lengths[dim]; + param_idx += coord * param_strides[dim]; + tensor_idx += coord * tensor_strides[dim]; + } + + // Batch normalization + clamp + const float x = type_convert(p_in[tensor_idx]); + const float inv_variance = + 1.0f / std::sqrt(epsilon + type_convert(p_variance[param_idx])); + const float norm_x = (x - type_convert(p_mean[param_idx])) * inv_variance; + float y = type_convert(p_scale[param_idx]) * norm_x + + type_convert(p_shift[param_idx]); + + // Clamp + y = y > floor ? (y < ceil ? y : ceil) : floor; + + p_out[tensor_idx] = type_convert(y); + } +} + +// Wrapper for batch normalization + clamp on GPU +template +void naive_batchnorm_clamp_infer_gpu(DataType* p_out, + const DataType* p_in, + const DataType* p_mean, + const DataType* p_variance, + const DataType* p_scale, + const DataType* p_shift, + const std::vector& tensor_lengths, + const std::vector& param_strides, + const std::vector& tensor_strides, + long_index_t total_elements, + float epsilon, + float floor, + float ceil, + hipStream_t stream = nullptr) +{ + // Copy strides and lengths to device + SimpleDeviceMem param_strides_buf(param_strides.size() * sizeof(index_t)); + SimpleDeviceMem tensor_strides_buf(tensor_strides.size() * sizeof(index_t)); + SimpleDeviceMem lengths_buf(tensor_lengths.size() * sizeof(index_t)); + + index_t* d_param_strides = static_cast(param_strides_buf.GetDeviceBuffer()); + index_t* d_tensor_strides = static_cast(tensor_strides_buf.GetDeviceBuffer()); + index_t* d_lengths = static_cast(lengths_buf.GetDeviceBuffer()); + + HIP_CHECK_ERROR(hipMemcpy(d_param_strides, + param_strides.data(), + param_strides.size() * sizeof(index_t), + hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy(d_tensor_strides, + tensor_strides.data(), + tensor_strides.size() * sizeof(index_t), + hipMemcpyHostToDevice)); + HIP_CHECK_ERROR(hipMemcpy(d_lengths, + tensor_lengths.data(), + tensor_lengths.size() * sizeof(index_t), + hipMemcpyHostToDevice)); + + constexpr int block_size = 256; + const int grid_size = (total_elements + block_size - 1) / block_size; + + naive_batchnorm_clamp_infer_kernel<<>>(p_out, + p_in, + p_mean, + p_variance, + p_scale, + p_shift, + d_param_strides, + d_tensor_strides, + d_lengths, + tensor_lengths.size(), + total_elements, + epsilon, + floor, + ceil); + + HIP_CHECK_ERROR(hipGetLastError()); +} + } // namespace ref } // namespace ck diff --git a/test/gpu_reference/CMakeLists.txt b/test/gpu_reference/CMakeLists.txt index d1c3908849..cd702316f3 100644 --- a/test/gpu_reference/CMakeLists.txt +++ b/test/gpu_reference/CMakeLists.txt @@ -12,3 +12,6 @@ target_link_libraries(test_gpu_reference_conv_bwd_data PRIVATE utility) add_gtest_executable(test_gpu_reference_conv_bwd_weight test_gpu_reference_conv_bwd_weight.cpp) target_link_libraries(test_gpu_reference_conv_bwd_weight PRIVATE utility) + +add_gtest_executable(test_gpu_batchnorm_clamp test_gpu_batchnorm_clamp.cpp) +target_link_libraries(test_gpu_batchnorm_clamp PRIVATE utility) diff --git a/test/gpu_reference/test_gpu_batchnorm_clamp.cpp b/test/gpu_reference/test_gpu_batchnorm_clamp.cpp new file mode 100644 index 0000000000..d7951cad75 --- /dev/null +++ b/test/gpu_reference/test_gpu_batchnorm_clamp.cpp @@ -0,0 +1,210 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/type_convert.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using namespace ck; +using BaseConv = ck::tensor_layout::convolution::BaseConvolutionLayout; + +// CPU reference implementation matching profiler's ref_bnorm_clamp_infer +template +void cpu_batchnorm_clamp_ref(Tensor& out, + const Tensor& in, + const Tensor& mean, + const Tensor& variance, + const Tensor& scale, + const Tensor& shift, + float floor, + float ceil, + float epsilon) +{ + using Clamp = tensor_operation::element_wise::Clamp; + + auto func = [&](auto g, auto n, auto k, auto h, auto w) { + const float x = type_convert(in(g, n, k, h, w)); + const float invVariance = + 1.0f / std::sqrt(epsilon + type_convert(variance(g, n, k, h, w))); + const float norm_x = (x - type_convert(mean(g, n, k, h, w))) * invVariance; + float y = type_convert(scale(g, n, k, h, w)) * norm_x + + type_convert(shift(g, n, k, h, w)); + Clamp{floor, ceil}(y, y); + out(g, n, k, h, w) = type_convert(y); + }; + + make_ParallelTensorFunctor(func, + out.GetLengths()[0], + out.GetLengths()[1], + out.GetLengths()[2], + out.GetLengths()[3], + out.GetLengths()[4])(std::thread::hardware_concurrency()); +} + +void run_batchnorm_test(index_t G, + index_t N, + index_t K, + index_t H, + index_t W, + bool use_gk_broadcast, + float epsilon, + float floor, + float ceil_val, + float tolerance = 1e-3f) +{ + const long_index_t total_elements = G * N * K * H * W; + const long_index_t param_size = use_gk_broadcast ? G * K : total_elements; + + // Create tensor descriptors + auto out_desc = + HostTensorDescriptor({G, N, K, H, W}, {N * K * H * W, K * H * W, H * W, W, 1}, BaseConv{}); + + // Parameter descriptor: GK broadcast or full size + auto param_desc = use_gk_broadcast + ? HostTensorDescriptor({G, 1, K, 1, 1}, {K, 0, 1, 0, 0}, BaseConv{}) + : out_desc; + + Tensor in_tensor(out_desc); + Tensor out_cpu_tensor(out_desc); + Tensor out_gpu_tensor(out_desc); + Tensor mean_tensor(param_desc); + Tensor variance_tensor(param_desc); + Tensor scale_tensor(param_desc); + Tensor shift_tensor(param_desc); + + // Initialize tensors + in_tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + mean_tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + variance_tensor.GenerateTensorValue(GeneratorTensor_2{0, 5}); + scale_tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + shift_tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + // CPU reference + cpu_batchnorm_clamp_ref(out_cpu_tensor, + in_tensor, + mean_tensor, + variance_tensor, + scale_tensor, + shift_tensor, + floor, + ceil_val, + epsilon); + + // GPU version + DeviceMem d_in(total_elements * sizeof(float)); + DeviceMem d_mean(param_size * sizeof(float)); + DeviceMem d_variance(param_size * sizeof(float)); + DeviceMem d_scale(param_size * sizeof(float)); + DeviceMem d_shift(param_size * sizeof(float)); + DeviceMem d_out(total_elements * sizeof(float)); + + d_in.ToDevice(in_tensor.mData.data()); + d_mean.ToDevice(mean_tensor.mData.data()); + d_variance.ToDevice(variance_tensor.mData.data()); + d_scale.ToDevice(scale_tensor.mData.data()); + d_shift.ToDevice(shift_tensor.mData.data()); + + // Setup strides + std::vector tensor_lengths = {G, N, K, H, W}; + std::vector tensor_strides = {N * K * H * W, K * H * W, H * W, W, 1}; + + // Parameter strides: GK broadcast uses zero strides for N, H, W + std::vector param_strides = + use_gk_broadcast ? std::vector{K, 0, 1, 0, 0} : tensor_strides; + + ref::naive_batchnorm_clamp_infer_gpu( + reinterpret_cast(d_out.GetDeviceBuffer()), + reinterpret_cast(d_in.GetDeviceBuffer()), + reinterpret_cast(d_mean.GetDeviceBuffer()), + reinterpret_cast(d_variance.GetDeviceBuffer()), + reinterpret_cast(d_scale.GetDeviceBuffer()), + reinterpret_cast(d_shift.GetDeviceBuffer()), + tensor_lengths, + param_strides, + tensor_strides, + total_elements, + epsilon, + floor, + ceil_val); + + d_out.FromDevice(out_gpu_tensor.mData.data()); + + // Verify results + bool pass = true; + float max_diff = 0.0f; + int error_count = 0; + + for(long_index_t i = 0; i < total_elements; ++i) + { + float diff = std::abs(out_gpu_tensor.mData[i] - out_cpu_tensor.mData[i]); + max_diff = std::max(max_diff, diff); + + if(diff > tolerance) + { + error_count++; + pass = false; + } + } + + EXPECT_TRUE(pass) << "GPU batchnorm produced " << error_count << " errors out of " + << total_elements << " elements (max diff: " << max_diff << ")"; +} + +TEST(GpuBatchnormClamp, SmallDimensions) +{ + run_batchnorm_test( + /* G */ 2, + /* N */ 3, + /* K */ 4, + /* H */ 2, + /* W */ 2, + /* use_gk_broadcast */ true, + /* epsilon */ 1e-4f, + /* floor */ 0.0f, + /* ceil */ 100.0f); +} + +TEST(GpuBatchnormClamp, RealDimensions_GKBroadcast) +{ + // Dimensions from test_grouped_convnd_fwd_bias_bnorm_clamp + run_batchnorm_test( + /* G */ 2, + /* N */ 32, + /* K */ 256, + /* H */ 4, + /* W */ 4, + /* use_gk_broadcast */ true, + /* epsilon */ 1e-4f, + /* floor */ 0.0f, + /* ceil */ 2048.0f); +} + +TEST(GpuBatchnormClamp, FullStrides_NoBroadcast) +{ + run_batchnorm_test( + /* G */ 2, + /* N */ 32, + /* K */ 256, + /* H */ 4, + /* W */ 4, + /* use_gk_broadcast */ false, + /* epsilon */ 1e-4f, + /* floor */ 0.0f, + /* ceil */ 2048.0f); +} + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}