mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
* init for reduce_threadwise multi_d
* add reduce_threadwise_multi_d
* add reduce_multi_d
* clean
* start adding another splitk device op
* add reduce template parameter to SplitKBatchOffset
* add reduce c matrix
* clean up code
* change example data type to bf16
* add bf16Ai8B example
* remove reduce template parameter
* add splitk atomic status to v4
* example add multi d parameters
* device op add multi-d parameters
* add multi-d to reduce
* fix kbatch=1 bug
* change B layout to col in bf16Ai8B example
* remove float adding struct
* change multi-d interface
* change file and class name
* remove multi-d of bf16Ai8B example
* change IsReduce function to IsReduceAdd
* change example layout to RRR from RCR
* set ds stride according to layout
* reset parameter layout
* add gemm universal reduce instance
* add reduce factory
* add profile_gemm_universal_reduce
* add reduce to profiler
* fix reduce instance
* fix profiler reduce compiling bug
* format
* format library instance code
* add mem instance for reduce library
* fix call instance names
* add workspace for reduce in ckProfiler
* format
* add mnpadding to reduce library instance
* add fp16 instance to reduce of profiler
* change copyright time
* restore profiler cmake file
* add reduce text to instances
* add DsLayout and DsDataType to instances template parameter
* fixed gemm_reduce_multi_d
* add an example without multi_d
* Update common.hpp
* Update gtest.cmake
* Update gemm_xdl_splitk_reduce_bf16.cpp
* clean
* Update gtest.cmake
* format
* fix api
* format
* default parameter change to RRR
* add vector_len for multi_d
* format
* Update gtest.cmake
* fix bf16Ai8B elementwiseop
* add ReduceDataType
* move ReduceDataType to end position
* format
* remove googletest git method address
* fix copyright time
* update init data
---------
Co-authored-by: root <jizhan@amd.com>
Co-authored-by: letaoqin <letaoqin@amd.com>
Co-authored-by: Jing Zhang <jizhan@meta.com>
Co-authored-by: zjing14 <zhangjing14@gmail.com>
[ROCm/composable_kernel commit: c544eb4da0]
102 lines
3.0 KiB
C++
102 lines
3.0 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <cstdlib>
#include <exception>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <string>
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
|
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
|
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
|
#include "ck/utility/data_type.hpp"
|
|
|
|
#include "ck/library/utility/check_err.hpp"
|
|
#include "ck/library/utility/device_memory.hpp"
|
|
#include "ck/library/utility/fill.hpp"
|
|
#include "ck/library/utility/host_tensor.hpp"
|
|
#include "ck/library/utility/host_tensor_generator.hpp"
|
|
#include "ck/library/utility/literals.hpp"
|
|
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
|
#include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp"
|
|
|
|
struct ProblemSizeSplitK final
|
|
{
|
|
ck::index_t M = 256;
|
|
ck::index_t N = 1024;
|
|
ck::index_t K = 512;
|
|
|
|
ck::index_t StrideA = K;
|
|
ck::index_t StrideB = N;
|
|
ck::index_t StrideC = N;
|
|
|
|
ck::index_t KBatch = 2;
|
|
};
|
|
|
|
/// Run-time switches of the example; each maps to one of the first three
/// command-line arguments handled by parse_cmd_args().
struct ExecutionConfig final
{
    // arg1: verification (0=no, 1=yes)
    bool do_verification{true};
    // arg2: initialization (0=no init, 1=integer value, 2=decimal value)
    int init_method{2};
    // arg3: time kernel (0=no, 1=yes)
    bool time_kernel{true};
};
|
|
|
|
template <ck::index_t... Is>
|
|
using S = ck::Sequence<Is...>;
|
|
|
|
// Short names for the GEMM tensor layout tags.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
|
|
|
// Short names for the element-wise operation types used by the example.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add         = ck::tensor_operation::element_wise::Add;
|
|
|
|
bool parse_cmd_args(int argc,
|
|
char* argv[],
|
|
ProblemSizeSplitK& problem_size,
|
|
ExecutionConfig& config)
|
|
{
|
|
if(argc == 1)
|
|
{
|
|
// use default case
|
|
}
|
|
else if(argc == 4)
|
|
{
|
|
config.do_verification = std::stoi(argv[1]);
|
|
config.init_method = std::stoi(argv[2]);
|
|
config.time_kernel = std::stoi(argv[3]);
|
|
}
|
|
else if(argc >= 10)
|
|
{
|
|
config.do_verification = std::stoi(argv[1]);
|
|
config.init_method = std::stoi(argv[2]);
|
|
config.time_kernel = std::stoi(argv[3]);
|
|
|
|
problem_size.M = std::stoi(argv[4]);
|
|
problem_size.N = std::stoi(argv[5]);
|
|
problem_size.K = std::stoi(argv[6]);
|
|
|
|
problem_size.StrideA = std::stoi(argv[7]);
|
|
problem_size.StrideB = std::stoi(argv[8]);
|
|
problem_size.StrideC = std::stoi(argv[9]);
|
|
|
|
if(argc >= 11)
|
|
{
|
|
problem_size.KBatch = std::stoi(argv[10]);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
|
|
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
|
|
<< std::endl
|
|
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
|
|
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
|
|
<< "arg10: KBatch" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|