mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-21 05:19:20 +00:00
Reduction in Composable Kernel (#82)
* Initial adding of generic reduction
* Initial adding of generic reduction ...
* Updates to make compiling done
* clang-format all files
* clang-format some files again
* Renaming in profiler/include/profile_reduce.hpp
* Updates and make BlockWise cases passed
* Updates and make ThreadWise and MultiBlockTwoCall cases passed
* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances
* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes
* format
* adding pooling
* added max and average pooling
* comment out cout and kernel timing
* Tiny simplification in profiler/reduce_profiler.cpp
* Add example for reduce_blockwise
* Tiny updates
* Change to pass the ElementWiseOp from device layer to kernel
* Fix the vectorDim and vectorSize in Device layer
* Enable vector load on both dim0 and dim1 for Threadwise method
* Tiny updates
* Change to let the user pass the preUnaryOp and posUnaryOp
* Make pooling example work
* split device_reduce_instance into two libraries
* Tiny update
* Replace nanPropaOpt enum by boolean propagate_nan
* Simplification in DeviceReduce layer codes
* update build
* Change to clarify the difference between ck::half_t and half_float::half
* Renaming in all the reduction codes
* Add VectorSize as template parameter for device layer
* Add BetaIsZero as kernel template and as AccDataType for alpha
* print
* Small updates for pooling
* Updates for host_generic_reduction for reference
* Update to make AVG pooling pass
* Update to make MAX pooling with indices output pass
* fix
* add OutDst vector store to threadwise reduction and pooling
* tweak
* turn off check_indices that caused build issue
* refactor pooling
* clean up
* turn off check_indices for building issue for php-compiler
* add more tile size for odd C
* tweak conv for odd C
* update script
* clean up elementwise op
* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op
* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations
* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp
* Change to the unary operators
* Move the definitions of unary operations to element_wise_operation.hpp
* re-org files
* Refine in device interfaces and multiblock kernels
* Split the reduction configurations into instances for specific methods
* Update in getTypeString() of device pool2d
* Renaming in host and kernel
* Tiny update in profiler/src/profiler.cpp
* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations
* Make check_indices a templated function to remove some linking issue
* Renaming in the profiler reduce module
* Add support for double Reduction (but disable MultiblockAtomicAdd for double)
* Tiny correction of literal string
* Rename DevicePoolFwd to DevicePool2dFwd
* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling
* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx
* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp
* Renaming and refining in the Reduction codes
* Tiny change in the unary operators
* Renaming symbols and files
* Renaming symbols in the kernels
* Move kernel kernel_set_buffer_value to separate file
* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer
* Tiny update in the kernels
* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue
* Simplify a helper function in device layer
* Tiny adjustment in testing data initialization
* Renaming in kernel/device/host
* Add two testing scripts for reduction
* Refine the Unary operators in element_wise_operation.hpp
* Update in the reduce profiler module
* Update to the reduction testing scripts
* reduce compile parallelism
* change CI docker to rocm5.0
* remove unused variables
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: e17c0d8008]
This commit is contained in:
@@ -111,7 +111,35 @@ set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
|
||||
)
|
||||
|
||||
# device_reduce_instance: one source file per reduction method and data-type combination (split to speed up compiling)
|
||||
set(DEVICE_REDUCE_INSTANCE_SOURCE
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f16_f16_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f16_f32_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f32_f32_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f32_f64_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_f64_f64_f64.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f16_f16_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f16_f32_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f32_f32_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f32_f64_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_threadwise_f64_f64_f64.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f16_f16_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f32_f32_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp;
|
||||
${PROJECT_SOURCE_DIR}/device_operation/src/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp;
|
||||
)
|
||||
|
||||
add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
|
||||
add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
|
||||
add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
|
||||
add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
|
||||
@@ -120,8 +148,8 @@ add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURC
|
||||
add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE})
|
||||
add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE})
|
||||
add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE})
|
||||
add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
|
||||
add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})
|
||||
add_library(device_reduce_instance SHARED ${DEVICE_REDUCE_INSTANCE_SOURCE})
|
||||
|
||||
target_include_directories(device_gemm_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
target_include_directories(device_gemm_bias_2d_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
@@ -134,6 +162,7 @@ target_include_directories(device_conv2d_fwd_bias_relu_instance SYSTEM PUBLIC $<
|
||||
target_include_directories(device_conv2d_fwd_bias_relu_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
target_include_directories(device_conv2d_fwd_bias_relu_atomic_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
target_include_directories(device_conv2d_bwd_data_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
target_include_directories(device_reduce_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
|
||||
|
||||
target_compile_features(device_gemm_instance PUBLIC)
|
||||
target_compile_features(device_gemm_bias_2d_instance PUBLIC)
|
||||
@@ -146,6 +175,7 @@ target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
|
||||
target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
|
||||
target_compile_features(device_conv2d_bwd_data_instance PUBLIC)
|
||||
target_compile_features(device_reduce_instance PUBLIC)
|
||||
|
||||
set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(device_gemm_bias_2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
@@ -158,6 +188,7 @@ set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_I
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
set_target_properties(device_reduce_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
|
||||
install(TARGETS device_gemm_bias_2d_instance LIBRARY DESTINATION lib)
|
||||
@@ -170,3 +201,4 @@ install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib)
|
||||
install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib)
|
||||
install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib)
|
||||
install(TARGETS device_reduce_instance LIBRARY DESTINATION lib)
|
||||
|
||||
@@ -549,8 +549,11 @@ struct
|
||||
Conv_N_{N},
|
||||
Conv_K_{K},
|
||||
Conv_C_{C},
|
||||
input_spatial_lengths_{input_spatial_lengths},
|
||||
filter_spatial_lengths_{filter_spatial_lengths},
|
||||
output_spatial_lengths_{output_spatial_lengths},
|
||||
conv_filter_strides_{conv_filter_strides},
|
||||
conv_filter_dilations_{conv_filter_dilations},
|
||||
input_left_pads_{input_left_pads},
|
||||
input_right_pads_{input_right_pads}
|
||||
{
|
||||
@@ -625,8 +628,11 @@ struct
|
||||
index_t Conv_N_;
|
||||
index_t Conv_K_;
|
||||
index_t Conv_C_;
|
||||
std::vector<index_t> input_spatial_lengths_;
|
||||
std::vector<index_t> filter_spatial_lengths_;
|
||||
std::vector<index_t> output_spatial_lengths_;
|
||||
std::vector<index_t> conv_filter_strides_;
|
||||
std::vector<index_t> conv_filter_dilations_;
|
||||
std::vector<index_t> input_left_pads_;
|
||||
std::vector<index_t> input_right_pads_;
|
||||
};
|
||||
@@ -638,6 +644,28 @@ struct
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
std::cout << DeviceOp{}.GetTypeString() << std::endl;
|
||||
std::cout << "N " << arg.Conv_N_ << ", "
|
||||
<< "K " << arg.Conv_K_ << ", "
|
||||
<< "C " << arg.Conv_C_ << ", " << std::endl;
|
||||
std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", "
|
||||
<< arg.filter_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", "
|
||||
<< arg.input_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", "
|
||||
<< arg.output_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Strides " << arg.conv_filter_strides_[0] << ", "
|
||||
<< arg.conv_filter_strides_[1] << ", " << std::endl;
|
||||
std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", "
|
||||
<< arg.conv_filter_dilations_[1] << ", " << std::endl;
|
||||
std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", "
|
||||
<< arg.input_left_pads_[1] << ", " << std::endl;
|
||||
// Right-side input pads (entries 0 and 1 of arg.input_right_pads_).
std::cout << "InRightPads " << arg.input_right_pads_[0] << ", "
          << arg.input_right_pads_[1] << ", " << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
|
||||
@@ -656,6 +684,7 @@ struct
|
||||
std::cout << "arg.c1_grid_desc_m_n_{ " << arg.c1_grid_desc_m_n_.GetLength(I0)
|
||||
<< ", " << arg.c1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
|
||||
arg.b_grid_desc_k0_n_k1_,
|
||||
|
||||
@@ -526,8 +526,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
Conv_N_{N},
|
||||
Conv_K_{K},
|
||||
Conv_C_{C},
|
||||
input_spatial_lengths_{input_spatial_lengths},
|
||||
filter_spatial_lengths_{filter_spatial_lengths},
|
||||
output_spatial_lengths_{output_spatial_lengths},
|
||||
conv_filter_strides_{conv_filter_strides},
|
||||
conv_filter_dilations_{conv_filter_dilations},
|
||||
input_left_pads_{input_left_pads},
|
||||
input_right_pads_{input_right_pads}
|
||||
{
|
||||
@@ -590,8 +593,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
index_t Conv_N_;
|
||||
index_t Conv_K_;
|
||||
index_t Conv_C_;
|
||||
std::vector<index_t> input_spatial_lengths_;
|
||||
std::vector<index_t> filter_spatial_lengths_;
|
||||
std::vector<index_t> output_spatial_lengths_;
|
||||
std::vector<index_t> conv_filter_strides_;
|
||||
std::vector<index_t> conv_filter_dilations_;
|
||||
std::vector<index_t> input_left_pads_;
|
||||
std::vector<index_t> input_right_pads_;
|
||||
};
|
||||
@@ -603,6 +609,28 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
std::cout << DeviceOp{}.GetTypeString() << std::endl;
|
||||
std::cout << "N " << arg.Conv_N_ << ", "
|
||||
<< "K " << arg.Conv_K_ << ", "
|
||||
<< "C " << arg.Conv_C_ << ", " << std::endl;
|
||||
std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", "
|
||||
<< arg.filter_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", "
|
||||
<< arg.input_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", "
|
||||
<< arg.output_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Strides " << arg.conv_filter_strides_[0] << ", "
|
||||
<< arg.conv_filter_strides_[1] << ", " << std::endl;
|
||||
std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", "
|
||||
<< arg.conv_filter_dilations_[1] << ", " << std::endl;
|
||||
std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", "
|
||||
<< arg.input_left_pads_[1] << ", " << std::endl;
|
||||
// Right-side input pads (entries 0 and 1 of arg.input_right_pads_).
std::cout << "InRightPads " << arg.input_right_pads_[0] << ", "
          << arg.input_right_pads_[1] << ", " << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
|
||||
@@ -618,6 +646,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
|
||||
std::cout << "arg.c0_grid_desc_m_n_{ " << arg.c0_grid_desc_m_n_.GetLength(I0)
|
||||
<< ", " << arg.c0_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
|
||||
arg.b_grid_desc_k0_n_k1_,
|
||||
|
||||
@@ -498,8 +498,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
Conv_N_{N},
|
||||
Conv_K_{K},
|
||||
Conv_C_{C},
|
||||
input_spatial_lengths_{input_spatial_lengths},
|
||||
filter_spatial_lengths_{filter_spatial_lengths},
|
||||
output_spatial_lengths_{output_spatial_lengths},
|
||||
conv_filter_strides_{conv_filter_strides},
|
||||
conv_filter_dilations_{conv_filter_dilations},
|
||||
input_left_pads_{input_left_pads},
|
||||
input_right_pads_{input_right_pads}
|
||||
{
|
||||
@@ -551,8 +554,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
index_t Conv_N_;
|
||||
index_t Conv_K_;
|
||||
index_t Conv_C_;
|
||||
std::vector<index_t> input_spatial_lengths_;
|
||||
std::vector<index_t> filter_spatial_lengths_;
|
||||
std::vector<index_t> output_spatial_lengths_;
|
||||
std::vector<index_t> conv_filter_strides_;
|
||||
std::vector<index_t> conv_filter_dilations_;
|
||||
std::vector<index_t> input_left_pads_;
|
||||
std::vector<index_t> input_right_pads_;
|
||||
};
|
||||
@@ -564,6 +570,28 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
std::cout << DeviceOp{}.GetTypeString() << std::endl;
|
||||
std::cout << "N " << arg.Conv_N_ << ", "
|
||||
<< "K " << arg.Conv_K_ << ", "
|
||||
<< "C " << arg.Conv_C_ << ", " << std::endl;
|
||||
std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", "
|
||||
<< arg.filter_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", "
|
||||
<< arg.input_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", "
|
||||
<< arg.output_spatial_lengths_[1] << ", " << std::endl;
|
||||
std::cout << "Strides " << arg.conv_filter_strides_[0] << ", "
|
||||
<< arg.conv_filter_strides_[1] << ", " << std::endl;
|
||||
std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", "
|
||||
<< arg.conv_filter_dilations_[1] << ", " << std::endl;
|
||||
std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", "
|
||||
<< arg.input_left_pads_[1] << ", " << std::endl;
|
||||
// Right-side input pads (entries 0 and 1 of arg.input_right_pads_).
std::cout << "InRightPads " << arg.input_right_pads_[0] << ", "
          << arg.input_right_pads_[1] << ", " << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
|
||||
@@ -598,6 +626,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
|
||||
.GetLength(I5)
|
||||
<< "}" << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
|
||||
arg.b_grid_desc_k0_n_k1_,
|
||||
|
||||
@@ -452,6 +452,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
#if 0
|
||||
{
|
||||
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
|
||||
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
|
||||
@@ -464,6 +465,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
|
||||
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
|
||||
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
|
||||
arg.b_grid_desc_k0_n_k1_,
|
||||
|
||||
38
device_operation/include/device_pool2d_fwd.hpp
Normal file
38
device_operation/include/device_pool2d_fwd.hpp
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef DEVICE_POOL2D_FWD_HPP
|
||||
#define DEVICE_POOL2D_FWD_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <array>
|
||||
#include "device_base.hpp"
|
||||
#include "reduction_enums.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <ck::ReduceTensorOp_t ReduceOpId>
|
||||
struct DevicePool2dFwd : public BaseOperator
|
||||
{
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
ck::index_t N,
|
||||
ck::index_t C,
|
||||
std::array<ck::index_t, 2> input_spatial_lengths,
|
||||
std::array<ck::index_t, 2> window_spatial_lengths,
|
||||
std::array<ck::index_t, 2> output_spatial_lengths,
|
||||
std::array<ck::index_t, 2> window_strides,
|
||||
std::array<ck::index_t, 2> input_left_pads,
|
||||
std::array<ck::index_t, 2> input_right_pads) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
};
|
||||
|
||||
// Owning pointer to a pooling operation for the given reduction operation,
// held through the abstract DevicePool2dFwd interface.
template <ck::ReduceTensorOp_t ReduceOpId>
|
||||
using DevicePool2dFwdPtr = std::unique_ptr<DevicePool2dFwd<ReduceOpId>>;
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
327
device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp
Normal file
327
device_operation/include/device_pool2d_fwd_nhwc_nhwc.hpp
Normal file
@@ -0,0 +1,327 @@
|
||||
#ifndef DEVICE_POOL2D_FWD_NHWC_NHWC_HPP
|
||||
#define DEVICE_POOL2D_FWD_NHWC_NHWC_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "device_pool2d_fwd.hpp"
|
||||
#include "tensor_descriptor.hpp"
|
||||
#include "tensor_descriptor_helper.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "gridwise_2d_reduction_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InDataType,
|
||||
typename OutDataType,
|
||||
typename AccDataType,
|
||||
ck::ReduceTensorOp_t ReduceOpId,
|
||||
bool NeedIndices,
|
||||
ck::index_t BlockSize,
|
||||
ck::index_t ReduceMThreadClusterSize,
|
||||
ck::index_t ReduceKThreadClusterSize,
|
||||
ck::index_t ReduceMThreadSliceSize,
|
||||
ck::index_t ReduceKThreadSliceSize,
|
||||
ck::index_t InSrcOutDstVectorSize>
|
||||
struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd<ReduceOpId>
|
||||
{
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
static constexpr auto I3 = Number<3>{};
|
||||
static constexpr auto I4 = Number<4>{};
|
||||
static constexpr auto I5 = Number<5>{};
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
|
||||
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
static constexpr bool BetaIsZero = true;
|
||||
|
||||
static constexpr index_t InSrcOutDstVectorDim =
|
||||
0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
|
||||
// not reduced.
|
||||
|
||||
static constexpr ck::index_t ReduceM_BlockTileSize =
|
||||
ReduceMThreadClusterSize * ReduceMThreadSliceSize;
|
||||
static constexpr ck::index_t ReduceK_BlockTileSize =
|
||||
ReduceKThreadClusterSize * ReduceKThreadSliceSize;
|
||||
|
||||
static auto MakeABGridDescriptor_A_M_K_B_M(ck::index_t N,
|
||||
ck::index_t C,
|
||||
std::array<ck::index_t, 2> input_spatial_lengths,
|
||||
std::array<ck::index_t, 2> window_spatial_lengths,
|
||||
std::array<ck::index_t, 2> output_spatial_lengths,
|
||||
std::array<ck::index_t, 2> window_strides,
|
||||
std::array<ck::index_t, 2> input_left_pads,
|
||||
std::array<ck::index_t, 2> input_right_pads)
|
||||
{
|
||||
const index_t Hi = input_spatial_lengths[0];
|
||||
const index_t Wi = input_spatial_lengths[1];
|
||||
|
||||
const index_t Ho = output_spatial_lengths[0];
|
||||
const index_t Wo = output_spatial_lengths[1];
|
||||
|
||||
const index_t Y = window_spatial_lengths[0];
|
||||
const index_t X = window_spatial_lengths[1];
|
||||
|
||||
const index_t ConvStrideH = window_strides[0];
|
||||
const index_t ConvStrideW = window_strides[1];
|
||||
|
||||
const index_t InLeftPadH = input_left_pads[0];
|
||||
const index_t InLeftPadW = input_left_pads[1];
|
||||
|
||||
const index_t InRightPadH = input_right_pads[0];
|
||||
const index_t InRightPadW = input_right_pads[1];
|
||||
|
||||
const index_t ReduceMRaw = N * Ho * Wo * C;
|
||||
const index_t ReduceMPad =
|
||||
math::integer_least_multiple(ReduceMRaw, ReduceM_BlockTileSize) - ReduceMRaw;
|
||||
|
||||
const index_t ReduceKRaw = Y * X;
|
||||
const index_t ReduceKPad =
|
||||
math::integer_least_multiple(ReduceKRaw, ReduceK_BlockTileSize) - ReduceKRaw;
|
||||
|
||||
// A[ReduceM, ReduceK]
|
||||
const auto in_grid_desc_n_hi_wi_c =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
|
||||
|
||||
const auto in_grid_desc_n_hip_wip_c = transform_tensor_descriptor(
|
||||
in_grid_desc_n_hi_wi_c,
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_pad_transform(Hi, InLeftPadH, InRightPadH),
|
||||
make_pad_transform(Wi, InLeftPadW, InRightPadW),
|
||||
make_pass_through_transform(C)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
|
||||
|
||||
const auto in_grid_desc_n_y_ho_x_wo_c = transform_tensor_descriptor(
|
||||
in_grid_desc_n_hip_wip_c,
|
||||
make_tuple(make_pass_through_transform(N),
|
||||
make_embed_transform(make_tuple(Y, Ho), make_tuple(I1, ConvStrideH)),
|
||||
make_embed_transform(make_tuple(X, Wo), make_tuple(I1, ConvStrideW)),
|
||||
make_pass_through_transform(C)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
|
||||
|
||||
const auto in_grid_desc_reducemraw_reducekraw =
|
||||
transform_tensor_descriptor(in_grid_desc_n_y_ho_x_wo_c,
|
||||
make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C)),
|
||||
make_merge_transform(make_tuple(Y, X))),
|
||||
make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor(
|
||||
in_grid_desc_reducemraw_reducekraw,
|
||||
make_tuple(make_right_pad_transform(ReduceMRaw, ReduceMPad),
|
||||
make_right_pad_transform(ReduceKRaw, ReduceKPad)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
// B[ReduceM]
|
||||
const auto out_grid_desc_reducemraw =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo * C));
|
||||
|
||||
const auto out_grid_desc_reducem = transform_tensor_descriptor(
|
||||
out_grid_desc_reducemraw,
|
||||
make_tuple(make_right_pad_transform(ReduceMRaw, ReduceMPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem);
|
||||
}
|
||||
|
||||
using ABGridDescs = decltype(
|
||||
MakeABGridDescriptor_A_M_K_B_M(1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}));
|
||||
|
||||
using AGridDesc_M_K = remove_cvref_t<decltype(ABGridDescs{}[I0])>;
|
||||
using BGridDesc_M = remove_cvref_t<decltype(ABGridDescs{}[I1])>;
|
||||
|
||||
// TODO
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const InDataType* p_in_dev,
|
||||
OutDataType* p_out_dev,
|
||||
int* p_out_indices_dev,
|
||||
ck::index_t N,
|
||||
ck::index_t C,
|
||||
std::array<ck::index_t, 2>& input_spatial_lengths,
|
||||
std::array<ck::index_t, 2>& window_spatial_lengths,
|
||||
std::array<ck::index_t, 2>& output_spatial_lengths,
|
||||
std::array<ck::index_t, 2>& window_strides,
|
||||
std::array<ck::index_t, 2>& input_left_pads,
|
||||
std::array<ck::index_t, 2>& input_right_pads)
|
||||
: p_in_dev_{p_in_dev},
|
||||
p_out_dev_{p_out_dev},
|
||||
p_out_indices_dev_{p_out_indices_dev},
|
||||
a_grid_desc_m_k_{},
|
||||
b_grid_desc_m_{}
|
||||
{
|
||||
const auto descs = MakeABGridDescriptor_A_M_K_B_M(N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
window_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
window_strides,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
a_grid_desc_m_k_ = descs[I0];
|
||||
b_grid_desc_m_ = descs[I1];
|
||||
|
||||
invariant_lowest_length_ = C;
|
||||
reduce_lowest_length_ = window_spatial_lengths[1];
|
||||
|
||||
// TODO: is this correct?
|
||||
if constexpr(ReduceOpId == ck::ReduceTensorOp_t::AVG)
|
||||
{
|
||||
ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
|
||||
in_element_op_ = InElementwiseOperation{divider};
|
||||
acc_element_op_ = AccElementwiseOperation{divider};
|
||||
}
|
||||
}
|
||||
|
||||
const InDataType* p_in_dev_;
|
||||
OutDataType* p_out_dev_;
|
||||
int* p_out_indices_dev_;
|
||||
AGridDesc_M_K a_grid_desc_m_k_;
|
||||
BGridDesc_M b_grid_desc_m_;
|
||||
InElementwiseOperation in_element_op_;
|
||||
AccElementwiseOperation acc_element_op_;
|
||||
|
||||
// for checking vector load/store
|
||||
ck::index_t invariant_lowest_length_;
|
||||
ck::index_t reduce_lowest_length_;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
using gridwise_reduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
AGridDesc_M_K,
|
||||
BGridDesc_M,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
false, // propagate_nan
|
||||
BetaIsZero,
|
||||
BlockSize,
|
||||
ReduceMThreadClusterSize,
|
||||
ReduceKThreadClusterSize,
|
||||
ReduceMThreadSliceSize,
|
||||
ReduceKThreadSliceSize,
|
||||
InSrcOutDstVectorDim,
|
||||
InSrcOutDstVectorSize,
|
||||
InSrcOutDstVectorSize>;
|
||||
|
||||
const auto kernel = kernel_reduce_threadwise<gridwise_reduce,
|
||||
NeedIndices,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
AGridDesc_M_K,
|
||||
BGridDesc_M,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
ck::index_t ReduceM = arg.a_grid_desc_m_k_.GetLength(I0);
|
||||
|
||||
const index_t grid_size = (ReduceM / ReduceM_BlockTileSize);
|
||||
|
||||
return launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(grid_size),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg.a_grid_desc_m_k_,
|
||||
arg.b_grid_desc_m_,
|
||||
arg.in_element_op_,
|
||||
arg.acc_element_op_,
|
||||
float(1),
|
||||
arg.p_in_dev_,
|
||||
float(0),
|
||||
arg.p_out_dev_,
|
||||
arg.p_out_indices_dev_);
|
||||
}
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
}
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if(pArg->invariant_lowest_length_ % InSrcOutDstVectorSize != 0)
|
||||
{
|
||||
return (false);
|
||||
}
|
||||
|
||||
return (true);
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_in_dev,
|
||||
void* p_out_dev,
|
||||
void* p_out_indices_dev,
|
||||
ck::index_t N,
|
||||
ck::index_t C,
|
||||
std::array<ck::index_t, 2> input_spatial_lengths,
|
||||
std::array<ck::index_t, 2> window_spatial_lengths,
|
||||
std::array<ck::index_t, 2> output_spatial_lengths,
|
||||
std::array<ck::index_t, 2> window_strides,
|
||||
std::array<ck::index_t, 2> input_left_pads,
|
||||
std::array<ck::index_t, 2> input_right_pads) override
|
||||
{
|
||||
return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_dev),
|
||||
static_cast<OutDataType*>(p_out_dev),
|
||||
static_cast<int*>(p_out_indices_dev),
|
||||
N,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
window_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
window_strides,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
}
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>(Invoker{});
|
||||
}
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<" << BlockSize << ",";
|
||||
str << "M_C" << ReduceMThreadClusterSize << "_S" << ReduceMThreadSliceSize << ",";
|
||||
str << "K_C" << ReduceKThreadClusterSize << "_S" << ReduceKThreadSliceSize << ",";
|
||||
str <<"InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
}; // end of the pooling device-operation struct
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
58
device_operation/include/device_reduce.hpp
Normal file
58
device_operation/include/device_reduce.hpp
Normal file
@@ -0,0 +1,58 @@
|
||||
#ifndef DEVICE_REDUCE_HPP
|
||||
#define DEVICE_REDUCE_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "device_base.hpp"
|
||||
#include "reduction_enums.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InElementwiseOperation, typename AccElementwiseOperation>
|
||||
struct DeviceReduce : public BaseOperator
|
||||
{
|
||||
virtual size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths)
|
||||
{
|
||||
(void)inLengths;
|
||||
|
||||
return (0);
|
||||
};
|
||||
|
||||
virtual bool HasFurtherCall() { return (false); };
|
||||
|
||||
virtual std::vector<int> GetWorkspace2dLengths(const BaseArgument* argPtr)
|
||||
{
|
||||
(void)argPtr;
|
||||
return (std::vector<int>{0, 0});
|
||||
};
|
||||
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& inElementwiseOp,
|
||||
const AccElementwiseOperation& accElementwiseOp) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
};
|
||||
|
||||
template <typename InElementwiseOperation, typename AccElementwiseOperation>
|
||||
using DeviceReducePtr =
|
||||
std::unique_ptr<DeviceReduce<InElementwiseOperation, AccElementwiseOperation>>;
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
354
device_operation/include/device_reduce_blockwise.hpp
Normal file
354
device_operation/include/device_reduce_blockwise.hpp
Normal file
@@ -0,0 +1,354 @@
|
||||
#ifndef DEVICE_REDUCE_BLOCKWISE_HPP
|
||||
#define DEVICE_REDUCE_BLOCKWISE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "device.hpp"
|
||||
#include "device_reduce.hpp"
|
||||
#include "device_reduce_common.hpp"
|
||||
#include "gridwise_2d_reduction_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
bool PropagateNan,
|
||||
bool NeedIndices,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
index_t OutDstVectorSize>
|
||||
struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
|
||||
"Invalid thread cluster size assignments!");
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
static constexpr bool BetaIsZero = NeedIndices;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
|
||||
|
||||
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides)
|
||||
{
|
||||
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
|
||||
|
||||
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto in_grid_desc_m_k = [&]() {
|
||||
if constexpr(reduceAllDims)
|
||||
{
|
||||
const auto one_dim_inDesc = transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return transform_tensor_descriptor(one_dim_inDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(
|
||||
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
}();
|
||||
|
||||
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
|
||||
|
||||
auto in_grid_desc_m_k_padded =
|
||||
transform_tensor_descriptor(in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(outerLen, inPad_M),
|
||||
make_right_pad_transform(innerLen, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
|
||||
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto inPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
|
||||
auto out_grid_desc_m_padded =
|
||||
transform_tensor_descriptor(out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(outerLen, inPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
OutDataType* out_dev,
|
||||
IndexDataType* out_indices_dev,
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
|
||||
{
|
||||
(void)workspace_dev;
|
||||
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize;
|
||||
}
|
||||
|
||||
std::vector<int> inLengths_;
|
||||
std::vector<int> inStrides_;
|
||||
std::vector<int> outLengths_;
|
||||
std::vector<int> outStrides_;
|
||||
|
||||
AccDataType alpha_;
|
||||
OutDataType beta_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataType* out_dev_;
|
||||
IndexDataType* out_indices_dev_;
|
||||
|
||||
InElementwiseOperation in_elementwise_op_;
|
||||
AccElementwiseOperation acc_elementwise_op_;
|
||||
|
||||
int invariant_lowest_length;
|
||||
int reduce_lowest_length;
|
||||
size_t invariant_total_length;
|
||||
size_t reduce_total_length;
|
||||
|
||||
size_t gridSize;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
const auto in_grid_desc_m_k =
|
||||
DeviceReduceBlockWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
|
||||
const auto out_grid_desc_m =
|
||||
DeviceReduceBlockWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
|
||||
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
|
||||
using OutGridDesc_M = decltype(out_grid_desc_m);
|
||||
|
||||
using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise<InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
BetaIsZero,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSize>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
const auto kernel = kernel_reduce_blockwise<GridwiseReduce,
|
||||
NeedIndices,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
out_grid_desc_m,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.alpha_,
|
||||
arg.in_dev_,
|
||||
arg.beta_,
|
||||
arg.out_dev_,
|
||||
nullptr,
|
||||
arg.out_indices_dev_);
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
return (false);
|
||||
|
||||
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
|
||||
// To improve
|
||||
if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
|
||||
return (false);
|
||||
|
||||
// cases with very small reduce_total_length should be handled by the ThreadWise method
|
||||
if(pArg->reduce_total_length / KThreadSliceSize < 2)
|
||||
return (false);
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
static_cast<OutDataType*>(out_dev),
|
||||
static_cast<IndexDataType*>(out_indices_dev),
|
||||
static_cast<AccDataType*>(workspace_dev),
|
||||
in_elementwise_op,
|
||||
acc_elementwise_op);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceReduceBlockWise<" << BlockSize << ",";
|
||||
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
317
device_operation/include/device_reduce_blockwise_second_call.hpp
Normal file
317
device_operation/include/device_reduce_blockwise_second_call.hpp
Normal file
@@ -0,0 +1,317 @@
|
||||
#ifndef DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP
|
||||
#define DEVICE_REDUCE_BLOCKWISE_SECOND_CALL_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "device.hpp"
|
||||
#include "device_reduce.hpp"
|
||||
#include "device_reduce_common.hpp"
|
||||
#include "gridwise_2d_reduction_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
bool PropagateNan,
|
||||
bool NeedIndices,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
index_t OutDstVectorSize>
|
||||
struct DeviceReduceBlockWiseSecondCall
|
||||
: public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
|
||||
"Invalid thread cluster size assignments!");
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
static constexpr bool BetaIsZero = NeedIndices;
|
||||
|
||||
// This operator reads its input with the accumulation type, so both types must match.
static_assert(
    std::is_same<InDataType, AccDataType>::value,
    "InDataType and AccDataType should be the same to use DeviceReduceBlockWiseSecondCall!");
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
|
||||
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides)
|
||||
{
|
||||
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<2>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<2>{});
|
||||
|
||||
const auto in_grid_desc_m_k =
|
||||
make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
|
||||
|
||||
auto in_grid_desc_m_k_padded =
|
||||
transform_tensor_descriptor(in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(outerLen, inPad_M),
|
||||
make_right_pad_transform(innerLen, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
|
||||
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
|
||||
auto out_grid_desc_m_padded =
|
||||
transform_tensor_descriptor(out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(outerLen, outPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
OutDataType* out_dev,
|
||||
IndexDataType* out_indices_dev,
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
|
||||
{
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
invariant_total_length = inLengths[0];
|
||||
reduce_total_length = inLengths[1];
|
||||
|
||||
invariant_lowest_length = inLengths[0];
|
||||
reduce_lowest_length = inLengths[1];
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize;
|
||||
|
||||
size_t ws_buf2_bytes_offset = math::integer_least_multiple(
|
||||
invariant_total_length * reduce_total_length * sizeof(AccDataType), 64);
|
||||
|
||||
if constexpr(NeedIndices)
|
||||
workspace_indices_dev_ = reinterpret_cast<index_t*>(
|
||||
reinterpret_cast<char*>(workspace_dev) + ws_buf2_bytes_offset);
|
||||
else
|
||||
workspace_indices_dev_ = nullptr;
|
||||
}
|
||||
|
||||
std::vector<int> inLengths_;
|
||||
std::vector<int> inStrides_;
|
||||
std::vector<int> outLengths_;
|
||||
std::vector<int> outStrides_;
|
||||
|
||||
AccDataType alpha_;
|
||||
OutDataType beta_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataType* out_dev_;
|
||||
IndexDataType* out_indices_dev_;
|
||||
IndexDataType* workspace_indices_dev_;
|
||||
|
||||
InElementwiseOperation in_elementwise_op_;
|
||||
AccElementwiseOperation acc_elementwise_op_;
|
||||
|
||||
int invariant_lowest_length;
|
||||
int reduce_lowest_length;
|
||||
size_t invariant_total_length;
|
||||
size_t reduce_total_length;
|
||||
|
||||
size_t gridSize;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
const auto in_grid_desc_m_k = DeviceReduceBlockWiseSecondCall::MakeSrc2dDescriptor(
|
||||
arg.inLengths_, arg.inStrides_);
|
||||
const auto out_grid_desc_m = DeviceReduceBlockWiseSecondCall::MakeDst1dDescriptor(
|
||||
arg.outLengths_, arg.outStrides_);
|
||||
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
|
||||
using OutGridDesc_M = decltype(out_grid_desc_m);
|
||||
|
||||
using GridwiseReduce = GridwiseReduction_mk_to_m_blockwise<InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
BetaIsZero,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSize>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
const auto kernel = kernel_reduce_blockwise_second_call<GridwiseReduce,
|
||||
NeedIndices,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
out_grid_desc_m,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.alpha_,
|
||||
arg.in_dev_,
|
||||
arg.beta_,
|
||||
arg.out_dev_,
|
||||
arg.workspace_indices_dev_,
|
||||
arg.out_indices_dev_);
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
return (false);
|
||||
|
||||
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
|
||||
// To improve
|
||||
if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
|
||||
return (false);
|
||||
|
||||
// cases with very small reduce_total_length should be handled by the ThreadWise method
|
||||
if(pArg->reduce_total_length / KThreadSliceSize < 2)
|
||||
return (false);
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
static_cast<OutDataType*>(out_dev),
|
||||
static_cast<IndexDataType*>(out_indices_dev),
|
||||
static_cast<AccDataType*>(workspace_dev),
|
||||
in_elementwise_op,
|
||||
acc_elementwise_op);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceReduceBlockWiseSecondCall<" << BlockSize << ",";
|
||||
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
81
device_operation/include/device_reduce_common.hpp
Normal file
81
device_operation/include/device_reduce_common.hpp
Normal file
@@ -0,0 +1,81 @@
|
||||
#ifndef DEVICE_REDUCE_COMMON_HPP
|
||||
#define DEVICE_REDUCE_COMMON_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "common_header.hpp"
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
// template <typename preUnaryOpType, typename posUnaryOpType>
|
||||
// using DeviceReducePtr = std::unique_ptr<DeviceReduce<preUnaryOpType, posUnaryOpType>>;
|
||||
|
||||
template <int Rank, typename ReduceDims>
|
||||
std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
|
||||
{
|
||||
static_assert(Rank <= 6, "bigger Rank size not supported!");
|
||||
|
||||
size_t tensor_total_length = 1;
|
||||
size_t reduce_total_length = 1;
|
||||
|
||||
static_for<0, ReduceDims::Size(), 1>{}(
|
||||
[&](auto i) { reduce_total_length *= inLengths[ReduceDims::At(i)]; });
|
||||
|
||||
static_for<0, Rank, 1>{}([&](auto i) { tensor_total_length *= inLengths[i.value]; });
|
||||
|
||||
return std::make_pair(tensor_total_length / reduce_total_length, reduce_total_length);
|
||||
};
|
||||
|
||||
template <int x, typename Seq>
|
||||
constexpr bool belong()
|
||||
{
|
||||
bool inside = false;
|
||||
|
||||
static_for<0, Seq::Size(), 1>{}([&](auto i) { inside = (inside || (x == Seq::At(i))); });
|
||||
|
||||
return (inside);
|
||||
};
|
||||
|
||||
template <int Rank, typename ReduceDims, int start = 0>
|
||||
constexpr auto get_invariant_dims()
|
||||
{
|
||||
static_assert(Rank <= 6, "bigger Rank size not supported!");
|
||||
|
||||
if constexpr(start >= Rank)
|
||||
return Sequence<>{};
|
||||
else
|
||||
{
|
||||
if constexpr(!belong<start, ReduceDims>())
|
||||
return merge_sequences(Sequence<start>{},
|
||||
get_invariant_dims<Rank, ReduceDims, start + 1>());
|
||||
else
|
||||
return get_invariant_dims<Rank, ReduceDims, start + 1>();
|
||||
};
|
||||
};
|
||||
|
||||
// helper functions using variadic template arguments
|
||||
template <index_t... Ns>
|
||||
static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
|
||||
{
|
||||
return make_tuple(static_cast<index_t>(lengths[Ns])...);
|
||||
};
|
||||
|
||||
template <index_t arraySize>
|
||||
static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arraySize>)
|
||||
{
|
||||
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
|
||||
|
||||
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
|
||||
|
||||
return make_tuple_from_array_and_index_seq(lengths, index_seq);
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
28
device_operation/include/device_reduce_instance.hpp
Normal file
28
device_operation/include/device_reduce_instance.hpp
Normal file
@@ -0,0 +1,28 @@
|
||||
// Umbrella header: pulls in the extern-template declarations of every
// pre-built device reduce instance (blockwise, blockwise second call,
// multiblock atomic-add, multiblock partial-reduce and threadwise).
// The guard macro previously carried the typo "INSTANTCE".
#ifndef DEVICE_REDUCE_INSTANCE_HPP
#define DEVICE_REDUCE_INSTANCE_HPP

#include "device_reduce_instance_blockwise_f16_f16_f16.hpp"
#include "device_reduce_instance_blockwise_f16_f32_f16.hpp"
#include "device_reduce_instance_blockwise_f32_f32_f32.hpp"
#include "device_reduce_instance_blockwise_f32_f64_f32.hpp"
#include "device_reduce_instance_blockwise_f64_f64_f64.hpp"
#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp"
#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp"
#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp"
#include "device_reduce_instance_threadwise_f16_f16_f16.hpp"
#include "device_reduce_instance_threadwise_f16_f32_f16.hpp"
#include "device_reduce_instance_threadwise_f32_f32_f32.hpp"
#include "device_reduce_instance_threadwise_f32_f64_f32.hpp"
#include "device_reduce_instance_threadwise_f64_f64_f64.hpp"

#endif
|
||||
168
device_operation/include/device_reduce_instance_blockwise.hpp
Normal file
168
device_operation/include/device_reduce_instance_blockwise.hpp
Normal file
@@ -0,0 +1,168 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_blockwise = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 2, 2, 2, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 2, 1>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<1, 2, 2, 1, 2>,
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_blockwise = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 4, 4, 8, 1>,
|
||||
ReductionConfiguration_2<0, 4, 4, 4, 1>,
|
||||
ReductionConfiguration_2<0, 2, 2, 2, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
// special instances
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 5, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 7, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 11, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
|
||||
using deviceReduceBlockWisePtrType = DeviceReducePtr<
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
ReduceTensorOp_t ReduceOpId,
|
||||
NanPropagation_t NanOpt,
|
||||
ReduceTensorIndices_t IndicesOpt>
|
||||
void add_device_reduce_instance_blockwise(
|
||||
std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
|
||||
[&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceBlockWise<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
ReduceDims,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(
|
||||
std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
// Explicit instantiation definition of add_device_reduce_instance_blockwise for
// one parameter combination. The variadic tail is the list of reduced
// dimensions and is wrapped in Sequence<...>. The invoking line supplies the
// terminating semicolon.
#define ADD_BLOCKWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    template void add_device_reduce_instance_blockwise<                                         \
        inT, compT, outT, Rank, Sequence<__VA_ARGS__>, ReduceOpId, NanOpt, IndicesOpt>(         \
        std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
||||
|
||||
// Same as ADD_BLOCKWISE_INST_BY_TYPE, but ReduceOpId, NanOpt and IndicesOpt are
// given as integers and cast to their enum types here.
#define ADD_BLOCKWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_BLOCKWISE_INST_BY_TYPE(                                                               \
        inT,                                                                                  \
        compT,                                                                                \
        outT,                                                                                 \
        static_cast<ReduceTensorOp_t>(ReduceOpId),                                            \
        static_cast<NanPropagation_t>(NanOpt),                                                \
        static_cast<ReduceTensorIndices_t>(IndicesOpt),                                       \
        Rank,                                                                                 \
        __VA_ARGS__)
|
||||
|
||||
// Explicit instantiation declaration (extern template) matching
// ADD_BLOCKWISE_INST_BY_TYPE. The parameter type is spelled through
// deviceReduceBlockWisePtrType, exactly as in the definition macro, so the
// declaration and the definition cannot drift apart. The invoking line supplies
// the terminating semicolon.
#define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                   \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                          \
    extern template void add_device_reduce_instance_blockwise<inT,                        \
                                                              compT,                      \
                                                              outT,                       \
                                                              Rank,                       \
                                                              Sequence<__VA_ARGS__>,      \
                                                              ReduceOpId,                 \
                                                              NanOpt,                     \
                                                              IndicesOpt>(                \
        std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
||||
|
||||
// Same as ADD_BLOCKWISE_INST_REF_BY_TYPE, but ReduceOpId, NanOpt and IndicesOpt
// are given as integers and cast to their enum types here.
#define ADD_BLOCKWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                               \
        inT,                                                                                      \
        compT,                                                                                    \
        outT,                                                                                     \
        static_cast<ReduceTensorOp_t>(ReduceOpId),                                                \
        static_cast<NanPropagation_t>(NanOpt),                                                    \
        static_cast<ReduceTensorIndices_t>(IndicesOpt),                                           \
        Rank,                                                                                     \
        __VA_ARGS__)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,41 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,167 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<1, 2, 2, 1, 2>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<1, 1, 2, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
|
||||
using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
ReduceTensorOp_t ReduceOpId,
|
||||
NanPropagation_t NanOpt,
|
||||
ReduceTensorIndices_t IndicesOpt>
|
||||
void add_device_reduce_instance_blockwise_second_call(
|
||||
std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
|
||||
device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
|
||||
InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_assert(std::is_same<InDataType, AccDataType>::value,
|
||||
"InDataType and AccDataType should be the same to use "
|
||||
"add_device_reduce_instance_blockwise_second_call!");
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
|
||||
static_for<0,
|
||||
std::tuple_size<reduce_configuration_2_instances_blockwise_second_call>::value,
|
||||
1>{}([&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_blockwise_second_call{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceBlockWiseSecondCall<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
ReduceDims,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
// Explicit instantiation definition of
// add_device_reduce_instance_blockwise_second_call for one parameter
// combination. The variadic tail is the list of reduced dimensions and is
// wrapped in Sequence<...>. The invoking line supplies the terminating semicolon.
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(                                         \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                        \
    template void add_device_reduce_instance_blockwise_second_call<                     \
        inT, compT, outT, Rank, Sequence<__VA_ARGS__>, ReduceOpId, NanOpt, IndicesOpt>( \
        std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> &        \
        device_op_instances)
|
||||
|
||||
// Same as ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE, but ReduceOpId, NanOpt and
// IndicesOpt are given as integers and cast to their enum types here.
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(                    \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(                      \
        inT,                                                     \
        compT,                                                   \
        outT,                                                    \
        static_cast<ReduceTensorOp_t>(ReduceOpId),               \
        static_cast<NanPropagation_t>(NanOpt),                   \
        static_cast<ReduceTensorIndices_t>(IndicesOpt),          \
        Rank,                                                    \
        __VA_ARGS__)
|
||||
|
||||
// Explicit instantiation declaration (extern template) matching
// ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE. The parameter type is spelled through
// deviceReduceBlockWiseSecondCallPtrType, exactly as in the definition macro,
// so the declaration and the definition cannot drift apart. The invoking line
// supplies the terminating semicolon.
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(                                          \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                             \
    extern template void add_device_reduce_instance_blockwise_second_call<inT,               \
                                                                          compT,             \
                                                                          outT,              \
                                                                          Rank,              \
                                                                          Sequence<__VA_ARGS__>, \
                                                                          ReduceOpId,        \
                                                                          NanOpt,            \
                                                                          IndicesOpt>(       \
        std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> &             \
        device_op_instances)
|
||||
|
||||
// Same as ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE, but ReduceOpId, NanOpt
// and IndicesOpt are given as integers and cast to their enum types here.
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(                \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(                  \
        inT,                                                     \
        compT,                                                   \
        outT,                                                    \
        static_cast<ReduceTensorOp_t>(ReduceOpId),               \
        static_cast<NanPropagation_t>(NanOpt),                   \
        static_cast<ReduceTensorIndices_t>(IndicesOpt),          \
        Rank,                                                    \
        __VA_ARGS__)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,41 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Blockwise second-call reduction instance references with double input, double
// accumulation and float output (the type triple is the first three macro arguments).
// Argument columns of every invocation below:
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceDims is variable-length: per this column layout, everything after Rank is one
// reduced dimension, e.g. "4, 0, 1, 2" is Rank 4 with dimensions 0, 1 and 2 reduced.
// ReduceOpId is numeric; the trailing comments name the op (0 = ADD, 5 = AVG, 7 = NORM2).
// NOTE(review): ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID is defined in
// device_reduce_instance_blockwise_second_call.hpp, which is not visible here; confirm
// how the numeric NanPropaOpt / IndicesOpt values are interpreted against that header.
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_SECOND_CALL_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Blockwise second-call reduction instance references with double input, accumulation
// and output (the type triple is the first three macro arguments).
// Argument columns of every invocation below:
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceDims is variable-length: per this column layout, everything after Rank is one
// reduced dimension, e.g. "4, 0, 1, 2" is Rank 4 with dimensions 0, 1 and 2 reduced.
// ReduceOpId is numeric; the trailing comments name the op
// (0 = ADD, 5 = AVG, 7 = NORM2, 2 = MIN, 3 = MAX, 4 = AMAX).
// The last nine entries repeat MIN / MAX / AMAX with IndicesOpt = 1 instead of 0.
// NOTE(review): ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID is defined in
// device_reduce_instance_blockwise_second_call.hpp, which is not visible here; confirm
// how the numeric NanPropaOpt / IndicesOpt values are interpreted against that header.
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,55 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// Bundles three compile-time integers (block size and its M / K thread-cluster
// factors) and re-exports them as static members with a trailing underscore.
// Instantiation is rejected unless the two cluster sizes multiply to BlockSize.
template <int BlockSize, int MThreadClusterSize, int KThreadClusterSize>
struct ReductionConfiguration_1
{
    static constexpr int BlockSize_          = BlockSize;
    static constexpr int MThreadClusterSize_ = MThreadClusterSize;
    static constexpr int KThreadClusterSize_ = KThreadClusterSize;

    // The M x K cluster has to account for every thread of the block.
    static_assert(MThreadClusterSize * KThreadClusterSize == BlockSize, "Invalid Configuration!");
};
|
||||
|
||||
// Bundles five compile-time integers (input vector dimension and size, output vector
// size, and the per-thread M / K slice sizes) and re-exports them as static members
// with a trailing underscore. No consistency check is performed between the values.
template <int InSrcVectorDim,
          int InSrcVectorSize,
          int OutDstVectorSize,
          int MThreadSliceSize,
          int KThreadSliceSize>
struct ReductionConfiguration_2
{
    // Vectorization parameters.
    static constexpr int InSrcVectorDim_   = InSrcVectorDim;
    static constexpr int InSrcVectorSize_  = InSrcVectorSize;
    static constexpr int OutDstVectorSize_ = OutDstVectorSize;

    // Per-thread slice parameters.
    static constexpr int MThreadSliceSize_ = MThreadSliceSize;
    static constexpr int KThreadSliceSize_ = KThreadSliceSize;
};
|
||||
|
||||
// Block/cluster shapes tried for every reduction instance: BlockSize is always 256 and
// the (MThreadClusterSize, KThreadClusterSize) pair walks the power-of-two splits from
// 128 x 2 down to 1 x 256, so each entry passes the product check in
// ReductionConfiguration_1. The entries are stored as tuple element types; the order of
// the list is the order in which code iterating the tuple by index will see them.
using reduce_configuration_1_instances = std::tuple<
|
||||
// clang-format off
|
||||
// BlockSize | MThreadClusterSize | KThreadClusterSize
|
||||
ReductionConfiguration_1<256, 128, 2>,
|
||||
ReductionConfiguration_1<256, 64, 4>,
|
||||
ReductionConfiguration_1<256, 32, 8>,
|
||||
ReductionConfiguration_1<256, 16, 16>,
|
||||
ReductionConfiguration_1<256, 8, 32>,
|
||||
ReductionConfiguration_1<256, 4, 64>,
|
||||
ReductionConfiguration_1<256, 2, 128>,
|
||||
ReductionConfiguration_1<256, 1, 256>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// Switches the per-method ReductionConfiguration_2 lists to their short "quick test"
// variants. The reduce instance headers test this macro with #ifdef, so only its presence
// matters: changing the value to 0 does NOT select the full lists, the definition has to
// be removed. Being a preprocessor macro, it is not scoped by the enclosing namespaces.
#define QUICK_REDUCE_TEST 1
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,192 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 2, 2, 2, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 2, 1>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<1, 2, 2, 1, 2>,
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 4, 4, 8, 1>,
|
||||
ReductionConfiguration_2<0, 4, 4, 4, 1>,
|
||||
ReductionConfiguration_2<0, 2, 2, 2, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
// special instances
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 5, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 7, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 11, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp_t ReduceOperation>
|
||||
using deviceReduceMultiBlockAtomicAddPtrType =
|
||||
DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
|
||||
InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
|
||||
AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
ReduceTensorOp_t ReduceOpId,
|
||||
NanPropagation_t NanOpt,
|
||||
ReduceTensorIndices_t IndicesOpt>
|
||||
void add_device_reduce_instance_multiblock_atomic_add(
|
||||
std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
|
||||
device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES,
|
||||
"AtomicAdd can only be used with reduction operations without indices!");
|
||||
|
||||
constexpr bool op_acceptable =
|
||||
(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL ||
|
||||
ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1);
|
||||
|
||||
constexpr bool out_type_acceptable =
|
||||
(std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value);
|
||||
|
||||
if constexpr(!op_acceptable || !out_type_acceptable)
|
||||
return;
|
||||
else
|
||||
{
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
|
||||
static_for<
|
||||
0,
|
||||
std::tuple_size<reduce_configuration_2_instances_multiblock_atomic_add>::value,
|
||||
1>{}([&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_multiblock_atomic_add{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceMultiBlockAtomicAdd<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
ReduceDims,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(
|
||||
std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Explicit instantiation definition of add_device_reduce_instance_multiblock_atomic_add.
// ReduceOpId / NanOpt / IndicesOpt must already be enum values; every argument after
// Rank is packed into Sequence<...> and passed as the ReduceDims template argument.
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(                                        \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    template void add_device_reduce_instance_multiblock_atomic_add<                    \
        inT, compT, outT, Rank, Sequence<__VA_ARGS__>, ReduceOpId, NanOpt, IndicesOpt>( \
        std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> &       \
        device_op_instances)

// Same as ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE, but takes the three options as plain
// integers and casts them to ReduceTensorOp_t, NanPropagation_t and
// ReduceTensorIndices_t respectively.
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(                                          \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT,                                        \
                                           compT,                                      \
                                           outT,                                       \
                                           static_cast<ReduceTensorOp_t>(ReduceOpId),  \
                                           static_cast<NanPropagation_t>(NanOpt),      \
                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
                                           Rank,                                       \
                                           __VA_ARGS__)
|
||||
|
||||
// Explicit instantiation declaration ("extern template") matching
// ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE, for translation units that only reference an
// instance. The vector element type is written through the
// deviceReduceMultiBlockAtomicAddPtrType alias, which names the same DeviceReducePtr type.
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(                                    \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    extern template void add_device_reduce_instance_multiblock_atomic_add<             \
        inT, compT, outT, Rank, Sequence<__VA_ARGS__>, ReduceOpId, NanOpt, IndicesOpt>( \
        std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> &       \
        device_op_instances)

// Same as ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE, but takes the three options as
// plain integers and casts them to ReduceTensorOp_t, NanPropagation_t and
// ReduceTensorIndices_t respectively.
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(                                      \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT,                                    \
                                               compT,                                  \
                                               outT,                                   \
                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
                                               static_cast<NanPropagation_t>(NanOpt),  \
                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
                                               Rank,                                   \
                                               __VA_ARGS__)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,29 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock atomic-add instance references with half_t input, float accumulation and
// float output. Per the macro definition in device_reduce_instance_multiblock_atomic_add.hpp,
// each line is an extern template declaration of
// add_device_reduce_instance_multiblock_atomic_add; the three numeric options are cast to
// ReduceTensorOp_t, NanPropagation_t and ReduceTensorIndices_t, and every argument after
// Rank becomes one entry of the ReduceDims Sequence<...>.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceOpId values used here, as named by the trailing comments: 0 = ADD, 5 = AVG.
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,29 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock atomic-add instance references with float input, accumulation and output.
// Per the macro definition in device_reduce_instance_multiblock_atomic_add.hpp, each line
// is an extern template declaration of add_device_reduce_instance_multiblock_atomic_add;
// the three numeric options are cast to ReduceTensorOp_t, NanPropagation_t and
// ReduceTensorIndices_t, and every argument after Rank becomes one entry of the
// ReduceDims Sequence<...>.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceOpId values used here, as named by the trailing comments: 0 = ADD, 5 = AVG.
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,29 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock atomic-add instance references with float input, double accumulation and
// float output. Per the macro definition in device_reduce_instance_multiblock_atomic_add.hpp,
// each line is an extern template declaration of
// add_device_reduce_instance_multiblock_atomic_add; the three numeric options are cast to
// ReduceTensorOp_t, NanPropagation_t and ReduceTensorIndices_t, and every argument after
// Rank becomes one entry of the ReduceDims Sequence<...>.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceOpId values used here, as named by the trailing comments: 0 = ADD, 5 = AVG.
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,175 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 1, 1, 2, 1>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 4, 1, 8, 1>,
|
||||
ReductionConfiguration_2<0, 4, 1, 4, 1>,
|
||||
ReductionConfiguration_2<0, 2, 1, 2, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
// special instances
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 5, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 7, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 11, 1>,
|
||||
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<0, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
|
||||
using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
ReduceTensorOp_t ReduceOpId,
|
||||
NanPropagation_t NanOpt,
|
||||
ReduceTensorIndices_t IndicesOpt>
|
||||
void add_device_reduce_instance_multiblock_partial_reduce(
|
||||
std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
|
||||
device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
|
||||
InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
|
||||
using cfg1 =
|
||||
remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
|
||||
|
||||
static_for<
|
||||
0,
|
||||
std::tuple_size<reduce_configuration_2_instances_multiblock_partial_reduce>::value,
|
||||
1>{}([&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_multiblock_partial_reduce{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceMultiBlockPartialReduce<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
ReduceDims,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
// Explicit instantiation definition of add_device_reduce_instance_multiblock_partial_reduce.
// ReduceOpId / NanOpt / IndicesOpt must already be enum values; every argument after
// Rank is packed into Sequence<...> and passed as the ReduceDims template argument.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(                                    \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    template void add_device_reduce_instance_multiblock_partial_reduce<                \
        inT, compT, outT, Rank, Sequence<__VA_ARGS__>, ReduceOpId, NanOpt, IndicesOpt>( \
        std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> &   \
        device_op_instances)

// Same as ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE, but takes the three options as
// plain integers and casts them to ReduceTensorOp_t, NanPropagation_t and
// ReduceTensorIndices_t respectively.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(                                      \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT,                                    \
                                               compT,                                  \
                                               outT,                                   \
                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
                                               static_cast<NanPropagation_t>(NanOpt),  \
                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
                                               Rank,                                   \
                                               __VA_ARGS__)
|
||||
|
||||
// Explicit instantiation declaration ("extern template") matching
// ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE, for translation units that only reference
// an instance. The vector element type is written through the
// deviceReduceMultiBlockPartialReducePtrType alias, which names the same DeviceReducePtr
// type.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(                                \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    extern template void add_device_reduce_instance_multiblock_partial_reduce<         \
        inT, compT, outT, Rank, Sequence<__VA_ARGS__>, ReduceOpId, NanOpt, IndicesOpt>( \
        std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> &   \
        device_op_instances)

// Same as ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE, but takes the three options
// as plain integers and casts them to ReduceTensorOp_t, NanPropagation_t and
// ReduceTensorIndices_t respectively.
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(                                  \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                       \
    ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT,                                \
                                                   compT,                              \
                                                   outT,                               \
                                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
                                                   static_cast<NanPropagation_t>(NanOpt), \
                                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
                                                   Rank,                               \
                                                   __VA_ARGS__)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,41 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance references with half_t input, accumulation and
// output. Per the macro definition in device_reduce_instance_multiblock_partial_reduce.hpp,
// each line is an extern template declaration of
// add_device_reduce_instance_multiblock_partial_reduce; the three numeric options are
// cast to ReduceTensorOp_t, NanPropagation_t and ReduceTensorIndices_t, and every
// argument after Rank becomes one entry of the ReduceDims Sequence<...>.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceOpId values used here, as named by the trailing comments: 2 = MIN, 3 = MAX,
// 4 = AMAX. The second half repeats the same ops with IndicesOpt = 1 instead of 0.
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance references with half_t input, float accumulation
// and half_t output. Per the macro definition in
// device_reduce_instance_multiblock_partial_reduce.hpp, each line is an extern template
// declaration of add_device_reduce_instance_multiblock_partial_reduce; the three numeric
// options are cast to ReduceTensorOp_t, NanPropagation_t and ReduceTensorIndices_t, and
// every argument after Rank becomes one entry of the ReduceDims Sequence<...>.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceOpId values used here, as named by the trailing comments: 0 = ADD, 5 = AVG,
// 7 = NORM2.
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,45 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance references with float input, accumulation and
// output. Per the macro definition in device_reduce_instance_multiblock_partial_reduce.hpp,
// each line is an extern template declaration of
// add_device_reduce_instance_multiblock_partial_reduce; the three numeric options are
// cast to ReduceTensorOp_t, NanPropagation_t and ReduceTensorIndices_t, and every
// argument after Rank becomes one entry of the ReduceDims Sequence<...>.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
// ReduceOpId values used here, as named by the trailing comments: 2 = MIN, 3 = MAX,
// 4 = AMAX (first with IndicesOpt = 0, then repeated with IndicesOpt = 1), followed by
// 7 = NORM2.
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,26 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,53 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
|
||||
// Will be moved to use MultiBlockAtomicAdd
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
164
device_operation/include/device_reduce_instance_threadwise.hpp
Normal file
164
device_operation/include/device_reduce_instance_threadwise.hpp
Normal file
@@ -0,0 +1,164 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
|
||||
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_impl_common.hpp"
|
||||
#include "device_reduce_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
#ifdef QUICK_REDUCE_TEST
|
||||
using reduce_configuration_2_instances_threadwise = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 2, 2, 2, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 2, 1>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
ReductionConfiguration_2<1, 2, 2, 1, 2>,
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>
|
||||
// clang-format on
|
||||
>;
|
||||
#else
|
||||
using reduce_configuration_2_instances_threadwise = std::tuple<
|
||||
// clang-format off
|
||||
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
|
||||
ReductionConfiguration_2<0, 4, 4, 8, 1>,
|
||||
ReductionConfiguration_2<0, 4, 4, 4, 1>,
|
||||
ReductionConfiguration_2<0, 2, 2, 2, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 8>,
|
||||
ReductionConfiguration_2<1, 4, 1, 1, 4>,
|
||||
ReductionConfiguration_2<1, 2, 1, 1, 2>,
|
||||
|
||||
// special instances
|
||||
ReductionConfiguration_2<0, 1, 1, 3, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 5, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 7, 1>,
|
||||
ReductionConfiguration_2<0, 1, 1, 11, 1>,
|
||||
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 3>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 5>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 7>,
|
||||
ReductionConfiguration_2<1, 1, 1, 1, 11>
|
||||
// clang-format on
|
||||
>;
|
||||
#endif
|
||||
|
||||
template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
|
||||
using deviceReduceThreadWisePtrType = DeviceReducePtr<
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
ReduceTensorOp_t ReduceOpId,
|
||||
NanPropagation_t NanOpt,
|
||||
ReduceTensorIndices_t IndicesOpt>
|
||||
void add_device_reduce_instance_threadwise(
|
||||
std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
|
||||
{
|
||||
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
|
||||
using InElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
|
||||
using AccElementwiseOperation =
|
||||
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
|
||||
AccElementwiseOperation;
|
||||
|
||||
constexpr bool Indexable =
|
||||
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
|
||||
ReduceOpId == ReduceTensorOp_t::AMAX);
|
||||
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
|
||||
|
||||
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
|
||||
|
||||
using cfg1 = ReductionConfiguration_1<256, 256, 1>;
|
||||
|
||||
static_for<0, std::tuple_size<reduce_configuration_2_instances_threadwise>::value, 1>{}(
|
||||
[&](auto j) {
|
||||
using cfg2 = remove_cvref_t<decltype(
|
||||
std::get<j.value>(reduce_configuration_2_instances_threadwise{}))>;
|
||||
|
||||
using ReduceOpInstance = DeviceReduceThreadWise<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
Rank,
|
||||
ReduceDims,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
NeedIndices,
|
||||
cfg1::BlockSize_,
|
||||
cfg1::MThreadClusterSize_,
|
||||
cfg1::KThreadClusterSize_,
|
||||
cfg2::MThreadSliceSize_,
|
||||
cfg2::KThreadSliceSize_,
|
||||
cfg2::InSrcVectorDim_,
|
||||
cfg2::InSrcVectorSize_,
|
||||
cfg2::OutDstVectorSize_>;
|
||||
|
||||
device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
|
||||
});
|
||||
};
|
||||
|
||||
// Emits an explicit instantiation definition of add_device_reduce_instance_threadwise.
// OpId / NanMode / IdxMode must already be values of the corresponding enum types;
// the variadic arguments are the reduce dimensions and are wrapped in Sequence<...>.
#define ADD_THREADWISE_INST_BY_TYPE(InT, AccT, OutT, OpId, NanMode, IdxMode, NDim, ...) \
    template void add_device_reduce_instance_threadwise<InT,                            \
                                                        AccT,                           \
                                                        OutT,                           \
                                                        NDim,                           \
                                                        Sequence<__VA_ARGS__>,          \
                                                        OpId,                           \
                                                        NanMode,                        \
                                                        IdxMode>(                       \
        std::vector<deviceReduceThreadWisePtrType<AccT, OpId>> & device_op_instances)
|
||||
|
||||
// Same as ADD_THREADWISE_INST_BY_TYPE, but OpId / NanMode / IdxMode are given as plain
// integers and converted with static_cast to ReduceTensorOp_t, NanPropagation_t and
// ReduceTensorIndices_t respectively.
#define ADD_THREADWISE_INST_BY_ID(InT, AccT, OutT, OpId, NanMode, IdxMode, NDim, ...) \
    ADD_THREADWISE_INST_BY_TYPE(InT,                                                  \
                                AccT,                                                 \
                                OutT,                                                 \
                                static_cast<ReduceTensorOp_t>(OpId),                  \
                                static_cast<NanPropagation_t>(NanMode),               \
                                static_cast<ReduceTensorIndices_t>(IdxMode),          \
                                NDim,                                                 \
                                __VA_ARGS__)
|
||||
|
||||
// Emits an explicit instantiation declaration (`extern template`) of
// add_device_reduce_instance_threadwise for the given data types, enum values, rank
// and reduce dimensions (the variadic arguments, wrapped in Sequence<...>).
// The parameter type is spelled with deviceReduceThreadWisePtrType, exactly as in
// ADD_THREADWISE_INST_BY_TYPE, so the declaration and the definition cannot drift apart.
#define ADD_THREADWISE_INST_REF_BY_TYPE(                                                 \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                         \
    extern template void add_device_reduce_instance_threadwise<inT,                      \
                                                               compT,                    \
                                                               outT,                     \
                                                               Rank,                     \
                                                               Sequence<__VA_ARGS__>,    \
                                                               ReduceOpId,               \
                                                               NanOpt,                   \
                                                               IndicesOpt>(              \
        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
|
||||
|
||||
// Same as ADD_THREADWISE_INST_REF_BY_TYPE, but OpId / NanMode / IdxMode are given as
// plain integers and converted with static_cast to ReduceTensorOp_t, NanPropagation_t
// and ReduceTensorIndices_t respectively.
#define ADD_THREADWISE_INST_REF_BY_ID(InT, AccT, OutT, OpId, NanMode, IdxMode, NDim, ...) \
    ADD_THREADWISE_INST_REF_BY_TYPE(InT,                                                  \
                                    AccT,                                                 \
                                    OutT,                                                 \
                                    static_cast<ReduceTensorOp_t>(OpId),                  \
                                    static_cast<NanPropagation_t>(NanMode),               \
                                    static_cast<ReduceTensorIndices_t>(IdxMode),          \
                                    NDim,                                                 \
                                    __VA_ARGS__)
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,41 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
|
||||
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
|
||||
|
||||
#include "reduction_enums.hpp"
|
||||
#include "reduction_operator_mapping.hpp"
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
|
||||
#endif
|
||||
418
device_operation/include/device_reduce_multiblock_atomic_add.hpp
Normal file
418
device_operation/include/device_reduce_multiblock_atomic_add.hpp
Normal file
@@ -0,0 +1,418 @@
|
||||
#ifndef DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP
|
||||
#define DEVICE_REDUCE_MULTIBLOCK_ATOMIC_ADD_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "device.hpp"
|
||||
#include "device_base.hpp"
|
||||
#include "device_reduce.hpp"
|
||||
#include "device_reduce_common.hpp"
|
||||
#include "gridwise_2d_reduction_multiblock_atomic_add.hpp"
|
||||
#include "gridwise_set_buffer_value.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
bool PropagateNan,
|
||||
bool NeedIndices,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
index_t OutDstVectorSize>
|
||||
struct DeviceReduceMultiBlockAtomicAdd
|
||||
: public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
|
||||
"Invalid thread cluster size assignments!");
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
|
||||
|
||||
static constexpr bool support_AtomicAdd =
|
||||
std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
|
||||
|
||||
static_assert(!NeedIndices && support_AtomicAdd,
|
||||
"MultiBlockAtomicAdd method can only be used with non-indiced operation and when "
|
||||
"having float/double output type!");
|
||||
|
||||
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
int blkGroupSize,
|
||||
int kBlockTileIterations)
|
||||
{
|
||||
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
|
||||
|
||||
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto in_grid_desc_m_k = [&]() {
|
||||
if constexpr(reduceAllDims)
|
||||
{
|
||||
const auto one_dim_inDesc = transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return transform_tensor_descriptor(one_dim_inDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(
|
||||
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
}();
|
||||
|
||||
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
|
||||
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
|
||||
|
||||
auto in_grid_desc_m_k_padded =
|
||||
transform_tensor_descriptor(in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(outerLen, inPad_M),
|
||||
make_right_pad_transform(innerLen, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
|
||||
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
|
||||
auto out_grid_desc_m_padded =
|
||||
transform_tensor_descriptor(out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(outerLen, outPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
OutDataType* out_dev,
|
||||
IndexDataType* out_indices_dev,
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}
|
||||
{
|
||||
(void)out_indices_dev;
|
||||
(void)workspace_dev;
|
||||
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
int iterations = 1;
|
||||
while(true)
|
||||
{
|
||||
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
// we want the blkGroupSize be not more than 128
|
||||
if(testBlkGroupSize <= 128)
|
||||
break;
|
||||
|
||||
iterations++;
|
||||
};
|
||||
|
||||
blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
kBlockTileIterations = iterations;
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize * blkGroupSize;
|
||||
|
||||
gridSize_pre =
|
||||
math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize;
|
||||
}
|
||||
|
||||
std::vector<int> inLengths_;
|
||||
std::vector<int> inStrides_;
|
||||
std::vector<int> outLengths_;
|
||||
std::vector<int> outStrides_;
|
||||
|
||||
AccDataType alpha_;
|
||||
OutDataType beta_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataType* out_dev_;
|
||||
|
||||
InElementwiseOperation in_elementwise_op_;
|
||||
AccElementwiseOperation acc_elementwise_op_;
|
||||
|
||||
int invariant_lowest_length;
|
||||
int reduce_lowest_length;
|
||||
size_t invariant_total_length;
|
||||
size_t reduce_total_length;
|
||||
|
||||
index_t blkGroupSize;
|
||||
index_t kBlockTileIterations;
|
||||
size_t gridSize;
|
||||
|
||||
size_t gridSize_pre;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
const auto in_grid_desc_m_k = DeviceReduceMultiBlockAtomicAdd::MakeSrc2dDescriptor(
|
||||
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
|
||||
const auto out_grid_desc_m = DeviceReduceMultiBlockAtomicAdd::MakeDst1dDescriptor(
|
||||
arg.outLengths_, arg.outStrides_);
|
||||
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
|
||||
using OutGridDesc_M = decltype(out_grid_desc_m);
|
||||
|
||||
using GridwiseReduce =
|
||||
GridwiseReduction_mk_to_m_multiblock_atomic_add<InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSize>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
KernelTimer timer;
|
||||
|
||||
const auto kernel_pre = kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M>;
|
||||
const auto kernel_main = kernel_reduce_multiblock_atocmi_add<GridwiseReduce,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
printf("launch_and_time_kernel: grid_dim {%ld, 1, 1}, block_dim {%d, 1, 1} \n",
|
||||
arg.gridSize,
|
||||
BlockSize);
|
||||
printf("Warm up\n");
|
||||
|
||||
for(int i = 0; i < nrepeat + 1; i++)
|
||||
{
|
||||
if(i == 1)
|
||||
timer.Start();
|
||||
|
||||
launch_kernel(kernel_pre,
|
||||
dim3(arg.gridSize_pre),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
out_grid_desc_m,
|
||||
arg.out_dev_,
|
||||
static_cast<OutDataType>(0.0f));
|
||||
|
||||
launch_kernel(kernel_main,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
out_grid_desc_m,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.blkGroupSize,
|
||||
arg.kBlockTileIterations,
|
||||
arg.alpha_,
|
||||
arg.in_dev_,
|
||||
arg.out_dev_);
|
||||
};
|
||||
|
||||
timer.End();
|
||||
|
||||
avg_time = timer.GetElapsedTime() / nrepeat;
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
return (false);
|
||||
|
||||
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
|
||||
if(static_cast<float>(pArg->beta_) != 0.0f)
|
||||
return (false);
|
||||
|
||||
// To improve
|
||||
if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
|
||||
return (false);
|
||||
|
||||
// cases with small reduce_total_length should be handled by the BlockWise method
|
||||
if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
|
||||
return (false);
|
||||
|
||||
// This is very strong restriction, but needed to avoid some failure
|
||||
if(pArg->invariant_lowest_length % M_BlockTileSize != 0)
|
||||
return (false);
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
static_cast<OutDataType*>(out_dev),
|
||||
static_cast<IndexDataType*>(out_indices_dev),
|
||||
static_cast<AccDataType*>(workspace_dev),
|
||||
in_elementwise_op,
|
||||
acc_elementwise_op);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ",";
|
||||
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -0,0 +1,419 @@
|
||||
#ifndef DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
||||
#define DEVICE_REDUCE_MULTIBLOCK_PARTIAL_REDUCE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "device.hpp"
|
||||
#include "device_reduce.hpp"
|
||||
#include "device_reduce_common.hpp"
|
||||
#include "gridwise_2d_reduction_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
int Rank,
|
||||
typename ReduceDims,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename AccElementwiseOperation,
|
||||
bool PropagateNan,
|
||||
bool NeedIndices,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
index_t OutDstVectorSize>
|
||||
struct DeviceReduceMultiBlockPartialReduce
|
||||
: public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
|
||||
"Invalid thread cluster size assignments!");
|
||||
|
||||
static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
|
||||
|
||||
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
|
||||
{
|
||||
size_t invariant_total_length;
|
||||
size_t reduce_total_length;
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
|
||||
int iterations = 1;
|
||||
while(true)
|
||||
{
|
||||
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
// we want the blkGroupSize be not more than 128
|
||||
if(testBlkGroupSize <= 128)
|
||||
break;
|
||||
|
||||
iterations++;
|
||||
};
|
||||
|
||||
int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
size_t workspace_size = invariant_total_length * blkGroupSize;
|
||||
|
||||
size_t wsSizeInBytes =
|
||||
!NeedIndices ? workspace_size * sizeof(AccDataType)
|
||||
: workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);
|
||||
|
||||
return (wsSizeInBytes);
|
||||
};
|
||||
|
||||
// Always reports true for this device operation.
// NOTE(review): presumably signals that a second reduction over the workspace must follow
// -- confirm against the DeviceReduce base class.
bool HasFurtherCall() override
{
    return true;
}
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
int blkGroupSize,
|
||||
int kBlockTileIterations)
|
||||
{
|
||||
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
|
||||
|
||||
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto in_grid_desc_m_k = [&]() {
|
||||
if constexpr(reduceAllDims)
|
||||
{
|
||||
const auto one_dim_inDesc = transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return transform_tensor_descriptor(one_dim_inDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(
|
||||
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
}();
|
||||
|
||||
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
|
||||
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
|
||||
|
||||
auto in_grid_desc_m_k_padded =
|
||||
transform_tensor_descriptor(in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(outerLen, inPad_M),
|
||||
make_right_pad_transform(innerLen, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
|
||||
static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
|
||||
{
|
||||
auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));
|
||||
|
||||
const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
|
||||
auto ws_desc_m_k_padded =
|
||||
transform_tensor_descriptor(ws_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(outerLen, wsPad),
|
||||
make_pass_through_transform(blkGroupSize)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (ws_desc_m_k_padded);
|
||||
};
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::vector<index_t>& inLengths,
|
||||
const std::vector<index_t>& inStrides,
|
||||
const std::vector<index_t>& outLengths,
|
||||
const std::vector<index_t>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
OutDataType* out_dev,
|
||||
IndexDataType* out_indices_dev,
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev},
|
||||
out_dev_{out_dev},
|
||||
out_indices_dev_{out_indices_dev},
|
||||
workspace_dev_{workspace_dev}
|
||||
{
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
int iterations = 1;
|
||||
while(true)
|
||||
{
|
||||
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
// we want the blkGroupSize be not more than 128
|
||||
if(testBlkGroupSize <= 128)
|
||||
break;
|
||||
|
||||
iterations++;
|
||||
};
|
||||
|
||||
blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
|
||||
(K_BlockTileSize * iterations);
|
||||
|
||||
kBlockTileIterations = iterations;
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize * blkGroupSize;
|
||||
|
||||
size_t ws_buf2_bytes_offset = math::integer_least_multiple(
|
||||
invariant_total_length * blkGroupSize * sizeof(AccDataType), 64);
|
||||
|
||||
if constexpr(NeedIndices)
|
||||
workspace_indices_dev_ = reinterpret_cast<int*>(
|
||||
reinterpret_cast<char*>(workspace_dev_) + ws_buf2_bytes_offset);
|
||||
else
|
||||
workspace_indices_dev_ = nullptr;
|
||||
}
|
||||
|
||||
std::vector<int> inLengths_;
|
||||
std::vector<int> inStrides_;
|
||||
std::vector<int> outLengths_;
|
||||
std::vector<int> outStrides_;
|
||||
|
||||
AccDataType alpha_;
|
||||
OutDataType beta_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataType* out_dev_;
|
||||
IndexDataType* out_indices_dev_;
|
||||
AccDataType* workspace_dev_;
|
||||
IndexDataType* workspace_indices_dev_;
|
||||
|
||||
InElementwiseOperation in_elementwise_op_;
|
||||
AccElementwiseOperation acc_elementwise_op_;
|
||||
|
||||
int invariant_lowest_length;
|
||||
int reduce_lowest_length;
|
||||
size_t invariant_total_length;
|
||||
size_t reduce_total_length;
|
||||
|
||||
index_t blkGroupSize;
|
||||
index_t kBlockTileIterations;
|
||||
size_t gridSize;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
const auto in_grid_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeSrc2dDescriptor(
|
||||
arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.kBlockTileIterations);
|
||||
const auto ws_desc_m_k = DeviceReduceMultiBlockPartialReduce::MakeWorkspace2dDescriptor(
|
||||
arg.invariant_total_length, arg.blkGroupSize);
|
||||
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
|
||||
using WorkspaceDesc_M_K = decltype(ws_desc_m_k);
|
||||
|
||||
using GridwiseReduce =
|
||||
GridwiseReduction_mk_to_mk_multiblock_partial_reduce<InDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
WorkspaceDesc_M_K,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation,
|
||||
PropagateNan,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSize>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
const auto kernel = kernel_partial_reduce_multiblock<GridwiseReduce,
|
||||
NeedIndices,
|
||||
InDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
WorkspaceDesc_M_K,
|
||||
InElementwiseOperation,
|
||||
AccElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
ws_desc_m_k,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.blkGroupSize,
|
||||
arg.kBlockTileIterations,
|
||||
arg.in_dev_,
|
||||
arg.workspace_dev_,
|
||||
arg.workspace_indices_dev_);
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(OutDstVectorSize != 1)
|
||||
return (false);
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
return (false);
|
||||
|
||||
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
|
||||
// cases with small reduce_total_length should be handled by the BlockWise method
|
||||
if(pArg->reduce_total_length <= BlockSize * KThreadSliceSize)
|
||||
return (false);
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::vector<int> GetWorkspace2dLengths(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
return (
|
||||
std::vector<int>{static_cast<int>(pArg->invariant_total_length), pArg->blkGroupSize});
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const AccElementwiseOperation& acc_elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
static_cast<OutDataType*>(out_dev),
|
||||
static_cast<IndexDataType*>(out_indices_dev),
|
||||
static_cast<AccDataType*>(workspace_dev),
|
||||
in_elementwise_op,
|
||||
acc_elementwise_op);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceReduceMultiBlockPartialReduce<" << BlockSize << ",";
|
||||
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
355
device_operation/include/device_reduce_threadwise.hpp
Normal file
355
device_operation/include/device_reduce_threadwise.hpp
Normal file
@@ -0,0 +1,355 @@
|
||||
#ifndef DEVICE_REDUCE_THREADWISE_HPP
|
||||
#define DEVICE_REDUCE_THREADWISE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include "device.hpp"
|
||||
#include "device_reduce.hpp"
|
||||
#include "device_reduce_common.hpp"
|
||||
#include "gridwise_2d_reduction_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
index_t Rank,
|
||||
typename ReduceDims,
|
||||
typename ReduceOperation,
|
||||
typename InElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
bool PropagateNan,
|
||||
bool NeedIndices,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
index_t MThreadSliceSize,
|
||||
index_t KThreadSliceSize,
|
||||
index_t InSrcVectorDim,
|
||||
index_t InSrcVectorSize,
|
||||
index_t OutDstVectorSize>
|
||||
struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutElementwiseOperation>
|
||||
{
|
||||
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
|
||||
static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1),
|
||||
"Threadwise can only be called with KThreadClusterSize be 1 !");
|
||||
|
||||
using IndexDataType = int32_t;
|
||||
|
||||
static constexpr bool BetaIsZero = NeedIndices;
|
||||
|
||||
using InvariantDims = decltype(get_invariant_dims<Rank, ReduceDims>());
|
||||
|
||||
static constexpr index_t srcDims = Rank;
|
||||
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
|
||||
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
|
||||
|
||||
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides)
|
||||
{
|
||||
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
|
||||
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
|
||||
|
||||
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
|
||||
|
||||
const auto in_grid_desc_m_k = [&]() {
|
||||
if constexpr(reduceAllDims)
|
||||
{
|
||||
const auto one_dim_inDesc = transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(tupleSrcLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
return transform_tensor_descriptor(one_dim_inDesc,
|
||||
make_tuple(make_unmerge_transform(make_tuple(
|
||||
1, one_dim_inDesc.GetLength(Number<0>{})))),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0, 1>{}));
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto toReduceDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
|
||||
const auto invariantDimLengths =
|
||||
make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
inDesc,
|
||||
make_tuple(make_merge_transform(invariantDimLengths),
|
||||
make_merge_transform(toReduceDimLengths)),
|
||||
make_tuple(InvariantDims{}, ReduceDims{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
}
|
||||
}();
|
||||
|
||||
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
|
||||
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
|
||||
|
||||
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
|
||||
|
||||
auto in_grid_desc_m_k_padded =
|
||||
transform_tensor_descriptor(in_grid_desc_m_k,
|
||||
make_tuple(make_right_pad_transform(outerLen, inPad_M),
|
||||
make_right_pad_transform(innerLen, inPad_K)),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
|
||||
return (in_grid_desc_m_k_padded);
|
||||
};
|
||||
|
||||
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides)
|
||||
{
|
||||
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
|
||||
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
|
||||
|
||||
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
|
||||
|
||||
auto out_grid_desc_m = transform_tensor_descriptor(
|
||||
outDesc,
|
||||
make_tuple(make_merge_transform(tupleDstLengths)),
|
||||
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
|
||||
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
|
||||
|
||||
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
|
||||
|
||||
auto out_grid_desc_m_padded =
|
||||
transform_tensor_descriptor(out_grid_desc_m,
|
||||
make_tuple(make_right_pad_transform(outerLen, outPad)),
|
||||
make_tuple(Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}));
|
||||
return (out_grid_desc_m_padded);
|
||||
};
|
||||
|
||||
struct Argument : public BaseArgument
|
||||
{
|
||||
Argument(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const InDataType* in_dev,
|
||||
OutDataType* out_dev,
|
||||
IndexDataType* out_indices_dev,
|
||||
AccDataType* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const OutElementwiseOperation& acc_elementwise_op)
|
||||
: in_dev_{in_dev}, out_dev_{out_dev}, out_indices_dev_{out_indices_dev}
|
||||
{
|
||||
(void)workspace_dev;
|
||||
|
||||
inLengths_ = inLengths;
|
||||
inStrides_ = inStrides;
|
||||
outLengths_ = outLengths;
|
||||
outStrides_ = outStrides;
|
||||
|
||||
in_elementwise_op_ = in_elementwise_op;
|
||||
acc_elementwise_op_ = acc_elementwise_op;
|
||||
|
||||
alpha_ = static_cast<AccDataType>(alpha);
|
||||
beta_ = static_cast<OutDataType>(beta);
|
||||
|
||||
std::tie(invariant_total_length, reduce_total_length) =
|
||||
get_2d_lengths<Rank, ReduceDims>(inLengths);
|
||||
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
invariant_lowest_length = 1;
|
||||
else
|
||||
invariant_lowest_length = inLengths[InvariantDims::At(InvariantDims::Size() - 1)];
|
||||
|
||||
reduce_lowest_length = inLengths[ReduceDims::At(ReduceDims::Size() - 1)];
|
||||
|
||||
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
|
||||
M_BlockTileSize;
|
||||
}
|
||||
|
||||
std::vector<int> inLengths_;
|
||||
std::vector<int> inStrides_;
|
||||
std::vector<int> outLengths_;
|
||||
std::vector<int> outStrides_;
|
||||
|
||||
AccDataType alpha_;
|
||||
OutDataType beta_;
|
||||
|
||||
const InDataType* in_dev_;
|
||||
OutDataType* out_dev_;
|
||||
IndexDataType* out_indices_dev_;
|
||||
|
||||
InElementwiseOperation in_elementwise_op_;
|
||||
OutElementwiseOperation acc_elementwise_op_;
|
||||
|
||||
int invariant_lowest_length;
|
||||
int reduce_lowest_length;
|
||||
size_t invariant_total_length;
|
||||
size_t reduce_total_length;
|
||||
|
||||
size_t gridSize;
|
||||
};
|
||||
|
||||
struct Invoker : public BaseInvoker
|
||||
{
|
||||
float Run(const Argument& arg, int nrepeat = 1)
|
||||
{
|
||||
const auto in_grid_desc_m_k =
|
||||
DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
|
||||
const auto out_grid_desc_m =
|
||||
DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
|
||||
using InGridDesc_M_K = decltype(in_grid_desc_m_k);
|
||||
using OutGridDesc_M = decltype(out_grid_desc_m);
|
||||
|
||||
using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise<InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
ReduceOperation,
|
||||
InElementwiseOperation,
|
||||
OutElementwiseOperation,
|
||||
PropagateNan,
|
||||
BetaIsZero,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
MThreadSliceSize,
|
||||
KThreadSliceSize,
|
||||
InSrcVectorDim,
|
||||
InSrcVectorSize,
|
||||
OutDstVectorSize>;
|
||||
|
||||
float avg_time = 0;
|
||||
|
||||
const auto kernel = kernel_reduce_threadwise<GridwiseReduce,
|
||||
NeedIndices,
|
||||
InDataType,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
IndexDataType,
|
||||
InGridDesc_M_K,
|
||||
OutGridDesc_M,
|
||||
InElementwiseOperation,
|
||||
OutElementwiseOperation>;
|
||||
|
||||
avg_time = launch_and_time_kernel(kernel,
|
||||
nrepeat,
|
||||
dim3(arg.gridSize),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
in_grid_desc_m_k,
|
||||
out_grid_desc_m,
|
||||
arg.in_elementwise_op_,
|
||||
arg.acc_elementwise_op_,
|
||||
arg.alpha_,
|
||||
arg.in_dev_,
|
||||
arg.beta_,
|
||||
arg.out_dev_,
|
||||
arg.out_indices_dev_);
|
||||
|
||||
return (avg_time);
|
||||
};
|
||||
|
||||
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
|
||||
{
|
||||
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
|
||||
};
|
||||
};
|
||||
|
||||
bool IsSupportedArgument(const BaseArgument* p_arg) override
|
||||
{
|
||||
const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
|
||||
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(InvariantDims::Size() == 0)
|
||||
return (false);
|
||||
|
||||
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
|
||||
return (false);
|
||||
|
||||
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
|
||||
return (false);
|
||||
};
|
||||
|
||||
// To improve
|
||||
if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
|
||||
return (false);
|
||||
|
||||
// TODO: remove this. Should return true, as long as this DeviceOP instance support this
|
||||
// case for bigger reduce_total_length size, we are supposed to use BlockWise method for
|
||||
// better performance
|
||||
if(pArg->reduce_total_length / KThreadSliceSize >= 32)
|
||||
return (false);
|
||||
|
||||
return (true);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const std::vector<int>& inLengths,
|
||||
const std::vector<int>& inStrides,
|
||||
const std::vector<int>& outLengths,
|
||||
const std::vector<int>& outStrides,
|
||||
float alpha,
|
||||
float beta,
|
||||
const void* in_dev,
|
||||
void* out_dev,
|
||||
void* out_indices_dev,
|
||||
void* workspace_dev,
|
||||
const InElementwiseOperation& in_elementwise_op,
|
||||
const OutElementwiseOperation& acc_elementwise_op) override
|
||||
{
|
||||
return std::make_unique<Argument>(inLengths,
|
||||
inStrides,
|
||||
outLengths,
|
||||
outStrides,
|
||||
alpha,
|
||||
beta,
|
||||
static_cast<const InDataType*>(in_dev),
|
||||
static_cast<OutDataType*>(out_dev),
|
||||
static_cast<IndexDataType*>(out_indices_dev),
|
||||
static_cast<AccDataType*>(workspace_dev),
|
||||
in_elementwise_op,
|
||||
acc_elementwise_op);
|
||||
};
|
||||
|
||||
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceReducceThreadWise<" << BlockSize << ",";
|
||||
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
|
||||
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
|
||||
str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
169
device_operation/include/reduction_operator_mapping.hpp
Normal file
169
device_operation/include/reduction_operator_mapping.hpp
Normal file
@@ -0,0 +1,169 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2020 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*******************************************************************************/
|
||||
#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
|
||||
#define CK_REDUCTION_OPERATOR_MAPPING_HPP
|
||||
|
||||
#include "reduction_operator.hpp"
|
||||
#include "reduction_enums.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
|
||||
// respective functor classes.
|
||||
// The boolean member "indexable" is also provided in reduce_binary_operator for
|
||||
// easier checking by the upper-layer codes in the kernels.
|
||||
|
||||
template <typename T, ReduceTensorOp_t Op>
|
||||
struct reduce_binary_operator;
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
|
||||
{
|
||||
using opType = reduce::Mul<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
|
||||
{
|
||||
using opType = reduce::Min<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = true;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
|
||||
{
|
||||
using opType = reduce::Max<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = true;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
|
||||
{
|
||||
using opType = reduce::AMax<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = true;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
|
||||
{
|
||||
using opType = reduce::Add<T>;
|
||||
using dataType = T;
|
||||
|
||||
static constexpr bool indexable = false;
|
||||
};
|
||||
|
||||
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
|
||||
// functor classes.
|
||||
// The two unary functors are called before and after the Reduction is executed respectively
|
||||
template <typename T, ReduceTensorOp_t Op, bool IsFirstReduce, bool IsLastReduce>
|
||||
struct reduce_unary_operator
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
};
|
||||
|
||||
template <typename T, bool IsFirstReduce>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, IsFirstReduce, true>
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
|
||||
};
|
||||
|
||||
template <typename T, bool IsLastReduce>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, IsLastReduce>
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
};
|
||||
|
||||
template <typename T, bool IsLastReduce>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, IsLastReduce>
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnarySquare<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnarySquare<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
|
||||
{
|
||||
using InElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
|
||||
using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
|
||||
};
|
||||
|
||||
} // end of namespace ck
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,34 @@
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "device_reduce_instance_blockwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,34 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "device_reduce_instance_blockwise_second_call.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,22 @@
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,22 @@
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,22 @@
|
||||
#include "device_reduce_instance_multiblock_atomic_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,34 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance list for half_t input, half_t accumulation and half_t output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 2 is MIN, 3 is MAX and 4 is AMAX. The first nine
// invocations use IndicesOpt = 0; the last nine repeat the same set with IndicesOpt = 1.
// NOTE(review): ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID is defined outside this view; confirm
// there what each invocation expands to.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance list for half_t input, float accumulation and half_t output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 0 is ADD, 5 is AVG and 7 is NORM2. All invocations use
// IndicesOpt = 0.
// NOTE(review): ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID is not defined in this file; presumably
// it comes from the included device_reduce_instance_multiblock_partial_reduce.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,38 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance list for float input, float accumulation and float output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 2 is MIN, 3 is MAX, 4 is AMAX and 7 is NORM2.
// MIN/MAX/AMAX are listed first with IndicesOpt = 0 and then again with IndicesOpt = 1;
// NORM2 is listed with IndicesOpt = 0 only.
// NOTE(review): ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID is not defined in this file; presumably
// it comes from the included device_reduce_instance_multiblock_partial_reduce.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,19 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance list for float input, double accumulation and float output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Only ReduceOpId 7 (labelled NORM2 by the trailing comment) is listed, with IndicesOpt = 0.
// NOTE(review): ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID is not defined in this file; presumably
// it comes from the included device_reduce_instance_multiblock_partial_reduce.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,46 @@
|
||||
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Multiblock partial-reduce instance list for double input, double accumulation and double output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 0 is ADD, 2 is MIN, 3 is MAX, 4 is AMAX, 5 is AVG and
// 7 is NORM2. MIN/MAX/AMAX are listed with IndicesOpt = 0 and again with IndicesOpt = 1;
// NORM2, ADD and AVG are listed with IndicesOpt = 0 only.
// NOTE(review): ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID is not defined in this file; presumably
// it comes from the included device_reduce_instance_multiblock_partial_reduce.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
|
||||
// Will be moved to use MultiBlockAtomicAdd
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,34 @@
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Threadwise reduce instance list for half_t input, half_t accumulation and half_t output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 2 is MIN, 3 is MAX and 4 is AMAX. The first nine
// invocations use IndicesOpt = 0; the last nine repeat the same set with IndicesOpt = 1.
// NOTE(review): ADD_THREADWISE_INST_BY_ID is not defined in this file; presumably it comes from
// the included device_reduce_instance_threadwise.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Threadwise reduce instance list for half_t input, float accumulation and half_t output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 0 is ADD, 5 is AVG and 7 is NORM2. All invocations use
// IndicesOpt = 0.
// NOTE(review): ADD_THREADWISE_INST_BY_ID is not defined in this file; presumably it comes from
// the included device_reduce_instance_threadwise.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Threadwise reduce instance list for float input, float accumulation and float output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 0 is ADD, 2 is MIN, 3 is MAX, 4 is AMAX, 5 is AVG and
// 7 is NORM2. ADD/AVG/NORM2 are listed with IndicesOpt = 0 only; MIN/MAX/AMAX are listed with
// IndicesOpt = 0 and again with IndicesOpt = 1.
// NOTE(review): ADD_THREADWISE_INST_BY_ID is not defined in this file; presumably it comes from
// the included device_reduce_instance_threadwise.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,25 @@
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Threadwise reduce instance list for float input, double accumulation and float output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 0 is ADD, 5 is AVG and 7 is NORM2. All invocations use
// IndicesOpt = 0.
// NOTE(review): ADD_THREADWISE_INST_BY_ID is not defined in this file; presumably it comes from
// the included device_reduce_instance_threadwise.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "device_reduce_instance_threadwise.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_reduce_instance {
|
||||
|
||||
// clang-format off
|
||||
// Threadwise reduce instance list for double input, double accumulation and double output.
// Each invocation passes its arguments in the column order of the header comment below; every
// argument after Rank is a ReduceDims entry (here {0, 1, 2}, {0} or {1}).
// Per the trailing comments, ReduceOpId 0 is ADD, 2 is MIN, 3 is MAX, 4 is AMAX, 5 is AVG and
// 7 is NORM2. ADD/AVG/NORM2 are listed with IndicesOpt = 0 only; MIN/MAX/AMAX are listed with
// IndicesOpt = 0 and again with IndicesOpt = 1.
// NOTE(review): ADD_THREADWISE_INST_BY_ID is not defined in this file; presumably it comes from
// the included device_reduce_instance_threadwise.hpp — confirm.
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); //
|
||||
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); //
|
||||
// clang-format on
|
||||
|
||||
} // namespace device_reduce_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user