Batchnorm inference instances, external API, client examples and gtests (#531)

* File renaming and class renaming for device element-wise operation * Add batchnorm-infer instances, external API and client example * Add batchnorm-infer profiler module and gtests * Remove file device_elementwise_extension.hpp and move NormalizeInInfer operation to element_wise_operation.hpp * Remove the use of class aliasing for DeviceElementwiseForBatchNormInfer * Rename class and file due to conflict from device_elementwise_2d.hpp * Fix namespace in batchnorm_infer_nhwc client example
2026-05-05 22:22:27 +00:00 · 2023-01-26 07:09:04 +08:00
parent 52abc2f371
commit a1b2441f8d
29 changed files with 1260 additions and 80 deletions
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp
@@ -17,7 +17,7 @@ template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          index_t NumDim>
-struct DeviceElementwiseBase : public BaseOperator
+struct DeviceElementwise : public BaseOperator
 {
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();
@@ -37,8 +37,8 @@ template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          index_t NumDim>
-using DeviceElementwiseBasePtr = std::unique_ptr<
-    DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;
+using DeviceElementwisePtr = std::unique_ptr<
+    DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;

 } // namespace device
 } // namespace tensor_operation
--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp
@@ -8,7 +8,7 @@

 #include "ck/utility/math.hpp"
 #include "ck/utility/sequence.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"

@@ -26,10 +26,10 @@ template <typename InDataTypeTuple,
          index_t NPerThread,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
-struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
-                                                        OutDataTypeTuple,
-                                                        ElementwiseOperation,
-                                                        NumDim_m + NumDim_n>
+struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
+                                                          OutDataTypeTuple,
+                                                          ElementwiseOperation,
+                                                          NumDim_m + NumDim_n>
 {
    static constexpr index_t NumDim = NumDim_m + NumDim_n;

--- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp
@@ -8,7 +8,7 @@

 #include "ck/utility/math.hpp"
 #include "ck/utility/sequence.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"

@@ -25,8 +25,8 @@ template <typename InDataTypeTuple,
          index_t MPerThread,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
-struct DeviceElementwise
-    : public DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
+struct DeviceElementwiseImpl
+    : public DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
 {
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -314,6 +314,40 @@ struct Normalize
    double epsilon_;
 };

+// Element-wise normalization functor used by BatchNorm inference:
+// y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
+// T2, the data type of mean and variance (float or double only), is the type used for the math
+struct NormalizeInInfer
+{
+    NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {} // epsilon defaults to 1e-4
+
+    template <typename T1, typename T2, typename T3, typename T4>
+    __host__ __device__ constexpr void operator()(T1& y, // output, written in the type of x
+                                                  const T1& x,
+                                                  const T2& mean,
+                                                  const T2& variance,
+                                                  const T3& gamma, // scale, converted to T2
+                                                  const T4& beta) const // shift, converted to T2
+    {
+        static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
+                      "Data type is not supported by this operation!");
+
+        using ck::type_convert;
+        using ck::math::sqrt;
+
+        T2 tmp_x, tmp_y; // intermediates held in T2
+
+        tmp_x = type_convert<T2>(x);
+
+        tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) *
+                    type_convert<T2>(gamma) +
+                type_convert<T2>(beta);
+        y = type_convert<T1>(tmp_y); // convert the T2 result back to the type of x
+    };
+
+    double epsilon_; // stored as double; converted to T2 inside operator()
+};
+
 template <typename Y, typename X>
 struct UnaryTypeConvert;