Reduction in Composable Kernel (#82)
* Initial adding of generic reduction
* Updates to make compilation succeed
* clang-format all files
* clang-format some files again
* Renaming in profiler/include/profile_reduce.hpp
* Updates to make BlockWise cases pass
* Updates to make ThreadWise and MultiBlockTwoCall cases pass
* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances
* Replace the dim0_max_vector_size/dim1_max_vector_size template arguments in the device reduce classes
* format
* adding pooling
* added max and average pooling
* comment out cout and kernel timing
* Tiny simplification in profiler/reduce_profiler.cpp
* Add example for reduce_blockwise
* Tiny updates
* Change to pass the ElementWiseOp from device layer to kernel
* Fix the vectorDim and vectorSize in Device layer
* Enable vector load on both dim0 and dim1 for Threadwise method
* Tiny updates
* Change to let the user pass the preUnaryOp and posUnaryOp
* Make pooling example work
* split device_reduce_instance into two libraries
* Tiny update
* Replace nanPropaOpt enum by boolean propagate_nan
* Simplification in DeviceReduce layer codes
* update build
* Change to clarify the difference between ck::half_t and half_float::half
* Renaming in all the reduction codes
* Add VectorSize as template parameter for device layer
* Add BetaIsZero as a kernel template parameter and use AccDataType for alpha
* print
* Small updates for pooling
* Updates for host_generic_reduction for reference
* Update to make AVG pooling pass
* Update to make MAX pooling with indices output pass
* fix
* add OutDst vector store to threadwise reduction and pooling
* tweak
* turn off check_indices, which caused a build issue
* refactor pooling
* clean up
* turn off check_indices due to a build issue with php-compiler
* add more tile size for odd C
* tweak conv for odd C
* update script
* clean up elementwise op
* add hack in reduction_operator.hpp to avoid a compile error; the proper fix is to use element_wise_op in the reduction op
* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations
* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp
* Change to the unary operators
* Move the definitions of unary operations to element_wise_operation.hpp
* re-org files
* Refine in device interfaces and multiblock kernels
* Split the reduction configurations into instances for specific methods
* Update in getTypeString() of device pool2d
* Renaming in host and kernel
* Tiny update in profiler/src/profiler.cpp
* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations
* Make check_indices a templated function to remove a linking issue
* Renaming in the profiler reduce module
* Add support for double Reduction (but disable MultiblockAtomicAdd for double)
* Tiny correction of literal string
* Rename DevicePoolFwd to DevicePool2dFwd
* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling
* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx
* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp
* Renaming and refining in the Reduction codes
* Tiny change in the unary operators
* Renaming symbols and files
* Renaming symbols in the kernels
* Move kernel kernel_set_buffer_value to separate file
* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer
* Tiny update in the kernels
* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue
* Simplify a helper function in device layer
* Tiny adjustment in testing data initialization
* Renaming in kernel/device/host
* Add two testing scripts for reduction
* Refine the Unary operators in element_wise_operation.hpp
* Update in the reduce profiler module
* Update to the reduction testing scripts
* reduce compile parallelism
* change CI docker to rocm5.0
* remove unused variables
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: e17c0d8008]
New file: profiler/src/profile_reduce.cpp (425 lines)
#include <iostream>
#include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <getopt.h>

#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "reduction_enums.hpp"

#include "profile_reduce_impl.hpp"

using namespace std;

using ck::NanPropagation_t;
using ck::ReduceTensorIndices_t;
using ck::ReduceTensorOp_t;

// Long-only options ("half", "double", "help") share the '?' short value and are
// told apart by option_index in processArgs().
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"toReduceDims", required_argument, nullptr, 'R'},
                                       {"reduceOp", required_argument, nullptr, 'O'},
                                       {"compType", required_argument, nullptr, 'C'},
                                       {"outType", required_argument, nullptr, 'W'},
                                       {"nanOpt", required_argument, nullptr, 'N'},
                                       {"indicesOpt", required_argument, nullptr, 'I'},
                                       {"scales", required_argument, nullptr, 'S'},
                                       {"half", no_argument, nullptr, '?'},
                                       {"double", no_argument, nullptr, '?'},
                                       {"dumpout", required_argument, nullptr, 'o'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"log", required_argument, nullptr, 'l'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

template <typename T>
static T getSingleValueFromString(const string& valueStr)
{
    std::istringstream iss(valueStr);

    T val;

    iss >> val;

    return (val);
};

// Parse a comma-separated list such as "16,64,32,960" into a vector of T.
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
    std::string valuesStr(cstr_values);

    std::vector<T> values;
    std::size_t pos = 0;
    std::size_t new_pos;

    new_pos = valuesStr.find(',', pos);
    while(new_pos != std::string::npos)
    {
        const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);

        T val = getSingleValueFromString<T>(sliceStr);

        values.push_back(val);

        pos     = new_pos + 1;
        new_pos = valuesStr.find(',', pos);
    };

    std::string sliceStr = valuesStr.substr(pos);
    T val                = getSingleValueFromString<T>(sliceStr);

    values.push_back(val);

    return (values);
}

typedef enum
{
    appHalf     = 0,
    appFloat    = 1,
    appInt32    = 2,
    appInt8     = 3,
    appInt8x4   = 4,
    appBFloat16 = 5,
    appDouble   = 6,
} appDataType_t;

static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDims)
{
    for(auto dim : toReduceDims)
    {
        if(dim < 0 || dim >= rank)
            throw std::runtime_error("Invalid dimension index specified for Reducing");
    };

    // use a bitmask to detect duplicated dimension indices
    unsigned int flag = 0;

    for(auto dim : toReduceDims)
    {
        if(flag & (0x1 << dim))
            throw std::runtime_error("All toReduce dimensions should be different!");
        flag = flag | (0x1 << dim);
    };
};

class AppArgs
{
    private:
    int option_index = 0;

    public:
    bool use_half   = false;
    bool use_double = false;

    std::vector<size_t> inLengths;
    std::vector<size_t> outLengths;
    std::vector<int> toReduceDims;

    std::vector<float> scales;

    ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
    appDataType_t compTypeId  = appFloat;
    appDataType_t outTypeId   = appFloat;

    bool compType_assigned = false;
    bool outType_assigned  = false;

    NanPropagation_t nanOpt          = NanPropagation_t::NOT_PROPAGATE_NAN;
    ReduceTensorIndices_t indicesOpt = ReduceTensorIndices_t::NO_INDICES;
    bool do_log          = false;
    bool do_verification = false;
    bool do_dumpout      = false;

    int init_method;
    int nrepeat;

    bool need_indices = false;

    AppArgs()  = default;
    ~AppArgs() = default;

    void show_usage(const char* cmd)
    {
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
                  << std::endl;
        std::cout << "--toReduceDims or -R, comma separated list of to-reduce dimensions"
                  << std::endl;
        std::cout << "--reduceOp or -O, enum value indicating the reduction operation"
                  << std::endl;
        std::cout << "--compType or -C, enum value indicating the type of accumulated values used "
                     "during the reduction"
                  << std::endl;
        std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
                     "output, which could be float when the input data is half"
                  << std::endl;
        std::cout << "--nanOpt or -N, enum value indicating the selection for NanOpt" << std::endl;
        std::cout << "--indicesOpt or -I, enum value indicating the selection for IndicesOpt"
                  << std::endl;
        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                  << std::endl;
        std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl;
        std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                     "comparing with the host-based reduction"
                  << std::endl;
        std::cout << "--dumpout or -o, 1/0 to indicate whether to save the reduction result to "
                     "files for further analysis"
                  << std::endl;
        std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
    };

    int processArgs(int argc, char* argv[])
    {
        int ch; // getopt_long() returns an int; -1 signals the end of the options

        optind++; // to skip the "reduce" module name

        while(1)
        {
            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                inLengths = getTypeValuesFromString<size_t>(optarg);
                break;
            case 'R':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                toReduceDims = getTypeValuesFromString<int>(optarg);
                break;
            case 'O':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                reduceOp = static_cast<ReduceTensorOp_t>(std::atoi(optarg));
                break;
            case 'C':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                compTypeId        = static_cast<appDataType_t>(std::atoi(optarg));
                compType_assigned = true;
                break;
            case 'W':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                outTypeId        = static_cast<appDataType_t>(std::atoi(optarg));
                outType_assigned = true;
                break;
            case 'N':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                nanOpt = static_cast<NanPropagation_t>(std::atoi(optarg));
                break;
            case 'I':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                indicesOpt = static_cast<ReduceTensorIndices_t>(std::atoi(optarg));
                break;
            case 'S':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                scales = getTypeValuesFromString<float>(optarg);

                if(scales.size() != 2)
                    throw std::runtime_error("Invalid option format!");
                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_verification = static_cast<bool>(std::atoi(optarg));
                break;
            case 'o':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_dumpout = static_cast<bool>(std::atoi(optarg));
                break;
            case 'l':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_log = static_cast<bool>(std::atoi(optarg));
                break;
            case '?':
                // the long-only options land here; option_index tells them apart
                if(std::string(long_options[option_index].name) == "half")
                    use_half = true;
                else if(std::string(long_options[option_index].name) == "double")
                    use_double = true;
                else if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return (-1);
                };
                break;

            default:
                show_usage(argv[0]);
                std::cerr << "Invalid cmd-line options!" << std::endl;
                return (-1);
            };
        };

        // two positional arguments (init_method and nrepeat) must follow the options
        if(optind + 2 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

        init_method = std::atoi(argv[optind++]);
        nrepeat     = std::atoi(argv[optind]);

        if(scales.empty())
        {
            scales.push_back(1.0f);
            scales.push_back(0.0f);
        };

        if(reduceOp == ReduceTensorOp_t::MIN || reduceOp == ReduceTensorOp_t::MAX ||
           reduceOp == ReduceTensorOp_t::AMAX)
        {
            if(indicesOpt != ReduceTensorIndices_t::NO_INDICES)
                need_indices = true;

            // for indexable operations, no need to assign compType and outType, just let them be
            // same as inType
            compType_assigned = false;
            outType_assigned  = false;
        };

        return (0);
    };

}; // end of class AppArgs

int profile_reduce(int argc, char* argv[])
{
    using namespace ck::profiler;

    AppArgs args;

    if(args.processArgs(argc, argv) < 0)
        return (-1);

    int rank = args.inLengths.size();

    check_reduce_dims(rank, args.toReduceDims);

    if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1)
        throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!");

    if(args.use_half)
    {
        if(!args.compType_assigned)
            args.compTypeId = appHalf;

        if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat))
            args.outTypeId = appFloat;

        if(!args.outType_assigned)
            args.outTypeId = appHalf;

        if(args.compTypeId == appHalf)
        {
            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
                                                                    args.init_method,
                                                                    args.do_log,
                                                                    args.do_dumpout,
                                                                    args.nrepeat,
                                                                    args.inLengths,
                                                                    args.toReduceDims,
                                                                    args.reduceOp,
                                                                    args.nanOpt,
                                                                    args.indicesOpt,
                                                                    args.scales[0],
                                                                    args.scales[1]);
        }
        else if(args.compTypeId == appFloat)
        {
            profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
                                                               args.init_method,
                                                               args.do_log,
                                                               args.do_dumpout,
                                                               args.nrepeat,
                                                               args.inLengths,
                                                               args.toReduceDims,
                                                               args.reduceOp,
                                                               args.nanOpt,
                                                               args.indicesOpt,
                                                               args.scales[0],
                                                               args.scales[1]);
        }
        else
            throw std::runtime_error("Invalid compType assignment!");
    }
    else if(args.use_double)
    {
        profile_reduce_impl<double, double, double>(args.do_verification,
                                                    args.init_method,
                                                    args.do_log,
                                                    args.do_dumpout,
                                                    args.nrepeat,
                                                    args.inLengths,
                                                    args.toReduceDims,
                                                    args.reduceOp,
                                                    args.nanOpt,
                                                    args.indicesOpt,
                                                    args.scales[0],
                                                    args.scales[1]);
    }
    else
    {
        if(args.compTypeId == appFloat)
        {
            profile_reduce_impl<float, float, float>(args.do_verification,
                                                     args.init_method,
                                                     args.do_log,
                                                     args.do_dumpout,
                                                     args.nrepeat,
                                                     args.inLengths,
                                                     args.toReduceDims,
                                                     args.reduceOp,
                                                     args.nanOpt,
                                                     args.indicesOpt,
                                                     args.scales[0],
                                                     args.scales[1]);
        }
        else if(args.compTypeId == appDouble)
        {
            profile_reduce_impl<float, double, float>(args.do_verification,
                                                      args.init_method,
                                                      args.do_log,
                                                      args.do_dumpout,
                                                      args.nrepeat,
                                                      args.inLengths,
                                                      args.toReduceDims,
                                                      args.reduceOp,
                                                      args.nanOpt,
                                                      args.indicesOpt,
                                                      args.scales[0],
                                                      args.scales[1]);
        }
        else
            throw std::runtime_error("Invalid compType assignment!");
    };

    return (0);
};
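For reference, a hypothetical invocation of this profiler module could look like the lines below. The binary name (ckProfiler) and the numeric value 0 for the ADD reduction are assumptions not confirmed by this file; the -C value 1 maps to appFloat per the appDataType_t enum above, and the two trailing positional arguments are init_method and nrepeat, as parsed in processArgs().

# reduce a 16x64x32x960 fp32 tensor over dims 0,1,2 with float accumulation,
# verifying against the host-based reduction; init_method 2, 10 timed repeats
# (binary name and the -O enum mapping are assumed, see the note above)
./ckProfiler reduce -D 16,64,32,960 -R 0,1,2 -O 0 -C 1 -S 1.0,0.0 -v 1 2 10

The --scales pair presumably follows the usual alpha/beta convention, i.e. out = alpha * reduce(in) + beta * out, with beta defaulting to 0 when --scales is not given.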