diff --git a/client_example/06_softmax/CMakeLists.txt b/client_example/06_softmax/CMakeLists.txt new file mode 100644 index 0000000000..b38a0fd9e2 --- /dev/null +++ b/client_example/06_softmax/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_softmax4d softmax4d.cpp) +target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_operations) diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp new file mode 100644 index 0000000000..7745ddf34c --- /dev/null +++ b/client_example/06_softmax/softmax4d.cpp @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumReduceDim = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::vector in_lengths{2, 8, 128, 1024}; + std::vector in_strides{8 * 128 * 1024, 128 * 1024, 1024, 1}; + std::vector reduce_dims{2, 3}; + + ck::index_t num_elements = + std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies()); + + AccDataType alpha{2.0f}; + AccDataType beta{2.0f}; + + SimpleDeviceMem in(sizeof(InDataType) * num_elements); + SimpleDeviceMem out(sizeof(OutDataType) * num_elements); + + using DeviceOp = ck::tensor_operation::device:: + DeviceSoftmax; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim) + { + continue; + } + + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + &alpha, + &beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = num_elements * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * num_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + &alpha, + &beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index 9a0e243570..8e7aa76f87 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -11,3 +11,4 @@ add_subdirectory(02_gemm_add_add_fastgelu) add_subdirectory(03_gemm_layernorm) add_subdirectory(04_contraction) add_subdirectory(05_layernorm) +add_subdirectory(06_softmax) diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp index fa2e4cbf49..7ab9221fff 100644 --- a/example/23_softmax/softmax_blockwise.cpp +++ b/example/23_softmax/softmax_blockwise.cpp @@ -9,37 +9,41 @@ #include "ck/ck.hpp" #include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_common_util.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" -using namespace ck; using namespace ck::tensor_operation::device; using InDataType = ck::half_t; using OutDataType = ck::half_t; using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + constexpr int Rank = 3; constexpr int NumReduceDim = 1; -using DeviceInstance = DeviceSoftmax; // OutScalarPerVector +using DeviceInstance = DeviceSoftmaxImpl; // OutScalarPerVector static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, @@ -196,7 +200,7 @@ int main(int argc, char* argv[]) if(args.do_verification) { using ReferenceInstance = - tensor_operation::host::ReferenceSoftmax; + ck::tensor_operation::host::ReferenceSoftmax; ReferenceInstance ref; auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, reduceDims); auto invoker = ref.MakeInvoker(); @@ -220,7 +224,9 @@ int main(int argc, char* argv[]) &alpha, &beta, in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer()); + out_dev.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); if(!device_instance.IsSupportedArgument(argument_ptr.get())) { diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp index 7fd4c4d1b3..dc40f7c789 100644 --- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp +++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp @@ -3,19 +3,10 @@ #pragma once -#include -#include +#include +#include -#include "ck/utility/reduction_operator.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce.hpp" -#include "ck/tensor_operation/gpu/device/device_normalization.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" namespace ck { namespace tensor_operation { @@ -24,227 +15,54 @@ namespace device { template -struct DeviceSoftmax : public DeviceNormalization + typename InElementwiseOp, + typename AccElementwiseOp, + index_t Rank> +struct DeviceSoftmax : public BaseOperator { - static constexpr index_t kRank = Rank; - static constexpr index_t kNumReduceDim = NumReduceDim; + // + // @brief Makes a pointer to Argument class. + // + // @param[in] inLengths Input tensor extent(s) from high to low dimension + // @param[in] inStrides Input tensor stride(s) from high to low dimension + // @param[in] reduceDims The dimension(s) the normalization operation is applied + // @param[in] alpha Typeless pointer in host memory storing the alpha scaling + // value as type AccDataType + // @param[in] beta Typeless pointer in host memory storing the beta scaling + // value as type AccDataType + // @param[in] in_dev Typeless const pointer in device memory storing the input + // tensor + // @param out_dev Typeless pointer in device memory storing the output tensor + // @param[in] in_elementwise_op The input elementwise operation. + // @param[in] acc_elementwise_op The accumulation elementwise operation. + // + // @return Unique pointer to the Argument class. + // + virtual std::unique_ptr + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + const void* alpha, + const void* beta, + const void* in_dev, + void* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) = 0; - virtual index_t GetRank() const override { return kRank; } - - virtual index_t GetNumReduceDim() const override { return kNumReduceDim; } - - using PassThrough = tensor_operation::element_wise::PassThrough; - - // Used for freeloading of some handy functions from DeviceReduceMultiBlock - using Reduction = DeviceReduceMultiBlock; // OutDstVectorSize - - using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1)); - - using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk; - - using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk; - - struct Argument : public Reduction::Argument - { - Argument(const std::vector inLengths, - const std::vector inStrides, - const std::vector reduceDims, - AccDataType alpha, - AccDataType beta, - const InDataType* in_dev, - OutDataType* out_dev) - : Reduction::Argument(inLengths, - inStrides, - {}, - {}, - reduceDims, - 0.0f, // alpha - 0.0f, // beta - in_dev, - nullptr, - out_dev, - nullptr, - PassThrough{}, - PassThrough{}), - // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of - // float32 precision. Make it support any data type so the fields can be removed. - alpha_(alpha), - beta_(beta) - { - // std::cout << "blkGroupSize= " << this->blkGroupSize - // << ", numBlockTileIteration= " << this->numBlockTileIteration - // << ", gridSize=" << this->gridSize - // << ", invariant_total_length=" << this->invariant_total_length << - // std::endl; - } - - AccDataType alpha_; - AccDataType beta_; - }; - - struct Invoker : public BaseInvoker - { - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); - const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor( - arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); - - bool sweep_once = - in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; - - const auto kernel_main = sweep_once ? kernel_softmax - : kernel_softmax; - - float avg_time = 0; - - avg_time += launch_and_time_kernel(stream_config, - kernel_main, - dim3(arg.gridSize), - dim3(BlockSize), - 0, - in_grid_desc_m_k, - out_grid_desc_m_k, - arg.blkGroupSize, - arg.numBlockTileIteration, - arg.alpha_, - arg.in_dev_, - arg.beta_, - arg.out_dev_); - - return (avg_time); - }; - - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - }; - }; - - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - const Argument* p_arg_ = dynamic_cast(p_arg); - - if(!Reduction::IsSupportedArgument(p_arg_)) - { - return false; - } - - if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0) - { - return false; - } - - return true; - }; - - // inLengths: input tensor extent(s) from high to low dimension - // inStrides: input tensor stride(s) from high to low dimension - // reduceDims: the dimension(s) the softmax normalization operate on - // alpha: typeless pointer in host memory storing the alpha scaling value as type AccDataType - // beta: typeless pointer in host memory storing the beta scaling value as type AccDataType - // in_dev: typeless const pointer in device memory storing the input tensor - // out_dev: typeless pointer in device memory storing the output tensor - std::unique_ptr MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector reduceDims, - const void* alpha, - const void* beta, - const void* in_dev, - void* out_dev) override - { - return std::make_unique(inLengths, - inStrides, - reduceDims, - *static_cast(alpha), - *static_cast(beta), - static_cast(in_dev), - static_cast(out_dev)); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "DeviceReduceSoftmax<" << BlockSize << ","; - str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; - // clang-format on - - return str.str(); - } + virtual std::unique_ptr MakeInvokerPointer() = 0; + virtual index_t GetRank() const = 0; + virtual index_t GetNumReduceDim() const = 0; }; +template +using DeviceSoftmaxPtr = std::unique_ptr< + DeviceSoftmax>; + } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp new file mode 100644 index 0000000000..ce58d1f49b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceSoftmaxImpl : public DeviceSoftmax +{ + static constexpr index_t kRank = Rank; + static constexpr index_t kNumReduceDim = NumReduceDim; + + virtual index_t GetRank() const override { return kRank; } + + virtual index_t GetNumReduceDim() const override { return kNumReduceDim; } + + // Used for freeloading of some handy functions from DeviceReduceMultiBlock + using Reduction = DeviceReduceMultiBlock; // OutDstVectorSize + + using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1)); + + using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk; + + using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk; + + struct Argument : public Reduction::Argument + { + Argument(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + AccDataType alpha, + AccDataType beta, + const InDataType* in_dev, + OutDataType* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) + : Reduction::Argument(inLengths, + inStrides, + {}, + {}, + reduceDims, + 0.0f, // alpha + 0.0f, // beta + in_dev, + nullptr, + out_dev, + nullptr, + in_elementwise_op, + acc_elementwise_op), + // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of + // float32 precision. Make it support any data type so the fields can be removed. + alpha_(alpha), + beta_(beta) + { + // std::cout << "blkGroupSize= " << this->blkGroupSize + // << ", numBlockTileIteration= " << this->numBlockTileIteration + // << ", gridSize=" << this->gridSize + // << ", invariant_total_length=" << this->invariant_total_length << + // std::endl; + } + + AccDataType alpha_; + AccDataType beta_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + + bool sweep_once = + in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; + + const auto kernel_main = sweep_once ? kernel_softmax + : kernel_softmax; + + float avg_time = 0; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m_k, + arg.blkGroupSize, + arg.numBlockTileIteration, + arg.alpha_, + arg.in_dev_, + arg.beta_, + arg.out_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + + if(!Reduction::IsSupportedArgument(p_arg_)) + { + return false; + } + + if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0) + { + return false; + } + + return true; + }; + + // + // @brief Makes a pointer to Argument class. + // + // @param[in] inLengths Input tensor extent(s) from high to low dimension + // @param[in] inStrides Input tensor stride(s) from high to low dimension + // @param[in] reduceDims The dimension(s) the normalization operation is applied + // @param[in] alpha Typeless pointer in host memory storing the alpha scaling + // value as type AccDataType + // @param[in] beta Typeless pointer in host memory storing the beta scaling + // value as type AccDataType + // @param[in] in_dev Typeless const pointer in device memory storing the input + // tensor + // @param out_dev Typeless pointer in device memory storing the output tensor + // @param[in] in_elementwise_op The input elementwise operation. + // @param[in] acc_elementwise_op The accumulation elementwise operation. + // + // @return Unique pointer to the Argument class. + // + std::unique_ptr MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + const void* alpha, + const void* beta, + const void* in_dev, + void* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + reduceDims, + *static_cast(alpha), + *static_cast(beta), + static_cast(in_dev), + static_cast(out_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceSoftmax<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp new file mode 100644 index 0000000000..0ef87252e6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +void add_device_softmax_f16_f16_rank3_instances( + std::vector>&); +void add_device_softmax_f16_f16_rank4_instances( + std::vector>&); + +void add_device_softmax_f32_f32_rank3_instances( + std::vector>&); +void add_device_softmax_f32_f32_rank4_instances( + std::vector>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device:: + DeviceSoftmax> +{ + using DeviceOp = + DeviceSoftmax; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + add_device_softmax_f16_f16_rank3_instances(op_ptrs); + else if constexpr(Rank == 4) + add_device_softmax_f16_f16_rank4_instances(op_ptrs); + } + else if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + add_device_softmax_f32_f32_rank3_instances(op_ptrs); + else if constexpr(Rank == 4) + add_device_softmax_f32_f32_rank4_instances(op_ptrs); + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 6f3f900b8a..0c5afce6a6 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -78,6 +78,7 @@ target_include_directories(device_operations PUBLIC $ $ $ + $ $ $ $ diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp index 8465baa17c..819532e883 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp @@ -1,43 +1,51 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" -#include "ck/utility/data_type.hpp" +#include +#include +#include "ck/ck.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; +namespace { +using F16 = ck::half_t; +using F32 = float; +using Pass = ck::tensor_operation::element_wise::PassThrough; +} // namespace template using device_softmax_f16_f16_instances = std::tuple< // clang-format off - // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - DeviceSoftmax, // fallback kernel - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8> // clang-format on >; -void add_device_softmax_f16_f16_rank3_instances(std::vector& instances) +void add_device_softmax_f16_f16_rank3_instances( + std::vector>& instances) { add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); } -void add_device_softmax_f16_f16_rank4_instances(std::vector& instances) +void add_device_softmax_f16_f16_rank4_instances( + std::vector>& instances) { add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp index 73ecf747b2..cfc85986c4 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp @@ -1,41 +1,49 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +#include +#include + #include "ck/ck.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" -#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F32 = float; +namespace { +using F32 = float; +using Pass = ck::tensor_operation::element_wise::PassThrough; +} // namespace template using device_softmax_f32_f32_instances = std::tuple< // clang-format off - // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - DeviceSoftmax, // fallback kernel - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax, - DeviceSoftmax + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4> // clang-format on >; -void add_device_softmax_f32_f32_rank3_instances(std::vector& instances) +void add_device_softmax_f32_f32_rank3_instances( + std::vector>& instances) { add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); } -void add_device_softmax_f32_f32_rank4_instances(std::vector& instances) +void add_device_softmax_f32_f32_rank4_instances( + std::vector>& instances) { add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); diff --git a/profiler/include/profile_normalization_impl.hpp b/profiler/include/profile_normalization_impl.hpp index 394d679ce2..9f6d7e3d88 100644 --- a/profiler/include/profile_normalization_impl.hpp +++ b/profiler/include/profile_normalization_impl.hpp @@ -6,25 +6,36 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" - #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -void add_device_softmax_f16_f16_rank3_instances(std::vector&); -void add_device_softmax_f16_f16_rank4_instances(std::vector&); +namespace { +using F16 = ck::half_t; +using F32 = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +} // namespace -void add_device_softmax_f32_f32_rank3_instances(std::vector&); -void add_device_softmax_f32_f32_rank4_instances(std::vector&); +void add_device_softmax_f16_f16_rank3_instances( + std::vector>&); +void add_device_softmax_f16_f16_rank4_instances( + std::vector>&); + +void add_device_softmax_f32_f32_rank3_instances( + std::vector>&); +void add_device_softmax_f32_f32_rank4_instances( + std::vector>&); } // namespace instance } // namespace device @@ -57,7 +68,7 @@ template <> std::string type_to_string() { return "int8"; } template <> std::string type_to_string() { return "int32"; } // clang-format on -template +template void profile_normalization_impl(int do_verification, int init_method, bool do_log, @@ -69,6 +80,11 @@ void profile_normalization_impl(int do_verification, AccDataType beta, NormType norm_type) { + if(Rank != in_length.size()) + { + throw std::runtime_error("Input tensor rank is different from template argument Rank!"); + } + Tensor in = in_strides.empty() ? Tensor(in_length) : Tensor(in_length, in_strides); Tensor out(in.mDesc); @@ -99,30 +115,31 @@ void profile_normalization_impl(int do_verification, std::vector i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end()); std::vector i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end()); - // add device normalization instances - std::vector instances; + // add device softmax instances + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using DeviceOpPtr = tensor_operation::device:: + DeviceSoftmaxPtr; + std::vector instances; if(norm_type == NormType::SOFTMAX) { if constexpr(is_same::value && is_same::value && is_same::value) { - if(in_length.size() == 3) + if constexpr(Rank == 3) tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances( instances); - - if(in_length.size() == 4) + else if constexpr(Rank == 4) tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances( instances); } else if constexpr(is_same::value && is_same::value && is_same::value) { - if(in_length.size() == 3) + if constexpr(Rank == 3) tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances( instances); - - if(in_length.size() == 4) + else if constexpr(Rank == 4) tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances( instances); } @@ -137,6 +154,8 @@ void profile_normalization_impl(int do_verification, float best_avg_time = std::numeric_limits::max(); float best_gb_per_sec = 0; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + for(auto& inst_ptr : instances) { // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3 @@ -153,7 +172,9 @@ void profile_normalization_impl(int do_verification, &alpha, &beta, in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer()); + out_dev.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); if(!inst_ptr->IsSupportedArgument(argument_ptr.get())) { diff --git a/profiler/src/profile_normalization.cpp b/profiler/src/profile_normalization.cpp index 5f2913464b..0e95a989a7 100644 --- a/profiler/src/profile_normalization.cpp +++ b/profiler/src/profile_normalization.cpp @@ -50,7 +50,7 @@ struct ArgParser void print_help() { - std::cout << "arg1: tensor operation (layernorm/batchnorm/softmax)\n" + std::cout << "arg1: tensor operation (batchnorm/softmax)\n" << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" << "arg3: verification (0: no; 1: yes)\n" << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" @@ -91,31 +91,73 @@ int profile_normalization(int argc, char* argv[]) arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0]; const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0]; - if(data_type == NormDataType::F16_F16) + if(length.size() == 3) { - ck::profiler::profile_normalization_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - float(alpha), - float(beta), - norm_type); + if(data_type == NormDataType::F16_F16) + { + ck::profiler::profile_normalization_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); + } + else if(data_type == NormDataType::F32_F32) + { + ck::profiler::profile_normalization_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); + } + else + { + throw std::runtime_error("not implemented yet"); + } } - else if(data_type == NormDataType::F32_F32) + else if(length.size() == 4) { - ck::profiler::profile_normalization_impl(do_verification, - init_method, - do_log, - time_kernel, - length, - stride, - reduce, - float(alpha), - float(beta), - norm_type); + if(data_type == NormDataType::F16_F16) + { + ck::profiler::profile_normalization_impl( + do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); + } + else if(data_type == NormDataType::F32_F32) + { + ck::profiler::profile_normalization_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta), + norm_type); + } + else + { + throw std::runtime_error("not implemented yet"); + } } else { diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp index 97a641e8e9..c41d326222 100644 --- a/test/softmax/test_softmax_util.hpp +++ b/test/softmax/test_softmax_util.hpp @@ -9,7 +9,8 @@ #include "ck/ck.hpp" #include "ck/utility/number.hpp" -#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -51,19 +52,23 @@ class TestSoftmax : public ::testing::Test using ReferenceInstance = tensor_operation::host::ReferenceSoftmax; - using DeviceInstance = tensor_operation::device::DeviceSoftmax; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + using DeviceInstance = tensor_operation::device::DeviceSoftmaxImpl; TestSoftmax() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {} @@ -97,7 +102,9 @@ class TestSoftmax : public ::testing::Test &alpha, &beta, in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer()); + out_dev.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); if(!device_instance.IsSupportedArgument(argument_ptr.get())) {