diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp index 9a06988ff8..8e40d60c17 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp @@ -31,10 +31,6 @@ void add_device_batchnorm_forward_rank_4_3_bf16_instances( std::vector< std::unique_ptr>>&); -// Int8 -void add_device_batchnorm_forward_rank_4_3_i8_instances( - std::vector>>&); - // FP64 void add_device_batchnorm_forward_rank_4_3_f64_instances( std::vector< @@ -101,15 +97,6 @@ struct DeviceOperationInstanceFactory< add_device_batchnorm_forward_rank_4_3_bf16_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v) - { - if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) - { - add_device_batchnorm_forward_rank_4_3_i8_instances(op_ptrs); - } - } else if constexpr(is_same_v && is_same_v && is_same_v && is_same_v && is_same_v && is_same_v) diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt index c637693f10..8947ecb9c1 100644 --- a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt @@ -2,6 +2,5 @@ add_instance_library(device_batchnorm_instance device_batchnorm_forward_f16_instance.cpp device_batchnorm_forward_f32_instance.cpp device_batchnorm_forward_bf16_instance.cpp - device_batchnorm_forward_i8_instance.cpp device_batchnorm_forward_f64_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp deleted file mode 100644 index 88ce369e15..0000000000 --- a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using I8 = int8_t; -using F32 = float; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -// clang-format off -template -using device_batchnorm_forward_i8_blockwise_instances = std::tuple< - // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl - >; -// clang-format on - -// clang-format off -template -using device_batchnorm_forward_i8_multiblock_instances = - std::tuple < - // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl, - DeviceBatchNormFwdImpl - >; -// clang-format on - -void add_device_batchnorm_forward_rank_4_3_i8_instances( - std::vector>>& - instances) -{ - add_device_operation_instances( - instances, device_batchnorm_forward_i8_blockwise_instances<4, 3, PassThrough>{}); - add_device_operation_instances( - instances, device_batchnorm_forward_i8_multiblock_instances<4, 3, PassThrough>{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/profiler/include/profile_batchnorm_forward_impl.hpp b/profiler/include/profile_batchnorm_forward_impl.hpp index b7fc435f07..82fe75bf01 100644 --- a/profiler/include/profile_batchnorm_forward_impl.hpp +++ b/profiler/include/profile_batchnorm_forward_impl.hpp @@ -85,39 +85,22 @@ bool profile_batchnorm_forward_impl(int do_verification, if(updateMovingAverage) { - if constexpr(ck::is_same_v) - { - x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.04f; - const float x_mean = 0.0f; - const float x_stddev = 2.5f; - const float noise_stddev = 0.04f; + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); - resultRunningMean_ref.GenerateTensorValue( - GeneratorTensor_4{x_mean, noise_stddev}, num_thread); + // initialize the runningMean to be values with tiny variation to the mean of the x + // values + resultRunningMean_ref.GenerateTensorValue( + GeneratorTensor_4{x_mean, noise_stddev}, num_thread); - resultRunningVariance_ref.GenerateTensorValue( - GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); - } - else - { - const float x_mean = 0.0f; - const float x_stddev = 1.0f; - const float noise_stddev = 0.04f; - - // input data in normal distribution - x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); - - // initialize the runningMean to be values with tiny variation to the mean of the x - // values - resultRunningMean_ref.GenerateTensorValue( - GeneratorTensor_4{x_mean, noise_stddev}, num_thread); - - // initialize the runningVariance to be values with tiny variation to the variance of - // the x values - resultRunningVariance_ref.GenerateTensorValue( - GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); - }; + // initialize the runningVariance to be values with tiny variation to the variance of + // the x values + resultRunningVariance_ref.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); } else { @@ -129,35 +112,24 @@ bool profile_batchnorm_forward_impl(int do_verification, if(do_verification) { - if constexpr(ck::is_same_v && ck::is_same_v) + switch(init_method) { + case 0: + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); + break; + case 2: bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + bnScale.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); } - else - { - - switch(init_method) - { - case 0: - bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); - bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); - break; - case 1: - bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); - break; - case 2: - bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - break; - default: - bnScale.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, - num_thread); - bnBias.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, - num_thread); - } - }; }; // these buffers are usually provided by the user application diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp index 077963f828..db443e5d7b 100644 --- a/profiler/src/profile_batchnorm_fwd.cpp +++ b/profiler/src/profile_batchnorm_fwd.cpp @@ -48,7 +48,7 @@ class BatchnormFwdArgParser std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl; std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl; std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl; - std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl; std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)" << std::endl; std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance (0=no, 1=yes)" << std::endl; std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl; @@ -141,7 +141,6 @@ int profile_batchnorm_forward(int argc, char* argv[]) using F16 = ck::half_t; using F32 = float; using BF16 = ck::bhalf_t; - using I8 = int8_t; using F64 = double; if(arg_parser.data_type == 0) @@ -178,23 +177,6 @@ int profile_batchnorm_forward(int argc, char* argv[]) averageFactor); }; } - else if(arg_parser.data_type == 3) - { - if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) - { - profile_batchnorm_forward_impl( - arg_parser.do_verification, - arg_parser.init_method, - arg_parser.do_dumpout, - arg_parser.time_kernel, - arg_parser.inLengths, - arg_parser.reduceDims, - arg_parser.updateMovingAverage, - arg_parser.saveMeanAndInvVariance, - epsilon, - averageFactor); - }; - } else if(arg_parser.data_type == 5) { if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) diff --git a/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp b/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp index a19664a87c..bc820be462 100644 --- a/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp +++ b/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp @@ -90,7 +90,6 @@ class TestBatchNormFwdRank4 : public ::testing::Test using KernelTypes = ::testing::Types, std::tuple, std::tuple, - std::tuple, std::tuple>; TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);