mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Universal gemm flush cache (#1251)
* add flush cache to device op * add flush cache parameter to ckProfiler * change calculate size a and b method * change evaluation time method from AVERAGE to MEDIAN * format code * adjust some code * fix core dump * remove loop call flush icache in kernel * remove loop(outer) call flush icache --------- Co-authored-by: letaoqin <letaoqin@amd.com>
This commit is contained in:
229
include/ck/host_utility/flush_cache.hpp
Normal file
229
include/ck/host_utility/flush_cache.hpp
Normal file
@@ -0,0 +1,229 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>

#include <iostream>
#include <set>
#include <vector>

#include "ck/ck.hpp"
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/flush_icache.hpp"
|
||||
namespace ck {
|
||||
namespace utility {
|
||||
|
||||
template <typename Argument>
|
||||
struct RotatingMemWrapper
|
||||
{
|
||||
using ADataType = decltype(Argument::p_a_grid);
|
||||
using BDataType = decltype(Argument::p_b_grid);
|
||||
|
||||
RotatingMemWrapper() = delete;
|
||||
RotatingMemWrapper(Argument& arg_,
|
||||
std::size_t rotating_count_,
|
||||
std::size_t size_a_,
|
||||
std::size_t size_b_)
|
||||
: arg(arg_), rotating_count(rotating_count_), size_a(size_a_), size_b(size_b_)
|
||||
{
|
||||
p_a_grids.push_back(arg.p_a_grid);
|
||||
p_b_grids.push_back(arg.p_b_grid);
|
||||
for(size_t i = 1; i < rotating_count; i++)
|
||||
{
|
||||
{
|
||||
void* pADeviceBuf;
|
||||
hip_check_error(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
|
||||
hip_check_error(hipMemcpy(static_cast<void*>(pADeviceBuf),
|
||||
const_cast<void*>(p_a_grids[0]),
|
||||
size_a_,
|
||||
hipMemcpyDeviceToDevice));
|
||||
p_a_grids.push_back(pADeviceBuf);
|
||||
}
|
||||
|
||||
{
|
||||
void* pBDeviceBuf;
|
||||
hip_check_error(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
|
||||
hip_check_error(hipMemcpy(static_cast<void*>(pBDeviceBuf),
|
||||
const_cast<void*>(p_b_grids[0]),
|
||||
size_b_,
|
||||
hipMemcpyDeviceToDevice));
|
||||
p_b_grids.push_back(pBDeviceBuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Next()
|
||||
{
|
||||
if(rotating_count > 1)
|
||||
{
|
||||
std::size_t idx = iter++ % rotating_count;
|
||||
arg.p_a_grid = reinterpret_cast<ADataType>(p_a_grids[idx]);
|
||||
arg.p_b_grid = reinterpret_cast<BDataType>(p_b_grids[idx]);
|
||||
}
|
||||
}
|
||||
void Print()
|
||||
{
|
||||
std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
|
||||
<< ", rotating_count: " << rotating_count << "}" << std::endl;
|
||||
}
|
||||
~RotatingMemWrapper()
|
||||
{
|
||||
if(rotating_count > 1)
|
||||
{
|
||||
// restore ptr
|
||||
arg.p_a_grid = reinterpret_cast<ADataType>(p_a_grids[0]);
|
||||
arg.p_b_grid = reinterpret_cast<BDataType>(p_b_grids[0]);
|
||||
|
||||
// free device mem
|
||||
for(size_t i = 1; i < rotating_count; i++)
|
||||
{
|
||||
hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
|
||||
hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
Argument& arg;
|
||||
std::size_t iter = 0;
|
||||
std::size_t rotating_count = 1;
|
||||
std::size_t size_a = 0;
|
||||
std::size_t size_b = 0;
|
||||
std::vector<const void*> p_a_grids;
|
||||
std::vector<const void*> p_b_grids;
|
||||
};
|
||||
|
||||
inline void flush_icache()
|
||||
{
|
||||
hipDeviceProp_t deviceProps;
|
||||
hip_check_error(hipGetDeviceProperties(&deviceProps, 0));
|
||||
int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
|
||||
|
||||
ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
|
||||
hip_check_error(hipGetLastError());
|
||||
}
|
||||
// if TimePrePress == false, return time does not include preprocess's time
|
||||
template <bool TimePreprocess, typename Args, typename F, typename PreProcessFunc>
|
||||
float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
|
||||
PreProcessFunc preprocess,
|
||||
F kernel,
|
||||
dim3 grid_dim,
|
||||
dim3 block_dim,
|
||||
std::size_t lds_byte,
|
||||
Args& args)
|
||||
{
|
||||
#if CK_TIME_KERNEL
|
||||
#define MEDIAN 1
|
||||
if(stream_config.time_kernel_)
|
||||
{
|
||||
#if DEBUG_LOG
|
||||
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
||||
__func__,
|
||||
grid_dim.x,
|
||||
grid_dim.y,
|
||||
grid_dim.z,
|
||||
block_dim.x,
|
||||
block_dim.y,
|
||||
block_dim.z);
|
||||
|
||||
printf("Warm up %d times\n", stream_config.cold_niters_);
|
||||
#endif
|
||||
// warm up
|
||||
for(int i = 0; i < stream_config.cold_niters_; ++i)
|
||||
{
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args);
|
||||
hip_check_error(hipGetLastError());
|
||||
}
|
||||
|
||||
const int nrepeat = stream_config.nrepeat_;
|
||||
if(nrepeat == 0)
|
||||
{
|
||||
return 0.0;
|
||||
}
|
||||
#if DEBUG_LOG
|
||||
printf("Start running %d times...\n", nrepeat);
|
||||
#endif
|
||||
|
||||
#if MEDIAN
|
||||
std::set<float> times;
|
||||
#else
|
||||
float total_time = 0;
|
||||
#endif
|
||||
for(int i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
if constexpr(!TimePreprocess)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
|
||||
hipEvent_t start, stop;
|
||||
|
||||
hip_check_error(hipEventCreate(&start));
|
||||
hip_check_error(hipEventCreate(&stop));
|
||||
|
||||
hip_check_error(hipDeviceSynchronize());
|
||||
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
||||
// calculate preprocess time
|
||||
if constexpr(TimePreprocess)
|
||||
{
|
||||
preprocess();
|
||||
}
|
||||
// run real kernel
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args);
|
||||
hip_check_error(hipGetLastError());
|
||||
// end real kernel
|
||||
|
||||
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
||||
hip_check_error(hipEventSynchronize(stop));
|
||||
float cur_time = 0;
|
||||
hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
|
||||
#if MEDIAN
|
||||
times.insert(cur_time);
|
||||
#else
|
||||
total_time += cur_time;
|
||||
#endif
|
||||
|
||||
#if DEBUG_LOG
|
||||
std::cout << "i: " << i << " cur_time: " << cur_time << std::endl;
|
||||
|
||||
printf("args.p_a_grid: %p, args.p_b_grid:%p\n",
|
||||
static_cast<const void*>(args.p_a_grid),
|
||||
static_cast<const void*>(args.p_b_grid));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if MEDIAN
|
||||
auto mid = times.begin();
|
||||
std::advance(mid, (nrepeat - 1) / 2);
|
||||
if(nrepeat % 2 == 1)
|
||||
{
|
||||
return *mid;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto mid_next = mid;
|
||||
std::advance(mid_next, 1);
|
||||
return (*mid + *mid_next) / 2;
|
||||
}
|
||||
#else
|
||||
return total_time / nrepeat;
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
preprocess();
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args);
|
||||
hip_check_error(hipGetLastError());
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args);
|
||||
hip_check_error(hipGetLastError());
|
||||
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace utility
|
||||
} // namespace ck
|
||||
@@ -13,4 +13,7 @@ struct StreamConfig
|
||||
int log_level_ = 0;
|
||||
int cold_niters_ = 5;
|
||||
int nrepeat_ = 50;
|
||||
|
||||
bool flush_cache = false;
|
||||
int rotating_count = 1;
|
||||
};
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/host_utility/flush_cache.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -151,14 +152,49 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
|
||||
const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
|
||||
|
||||
const auto Run = [&](const auto& kernel) {
|
||||
if(arg.KBatch > 1)
|
||||
hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
|
||||
0,
|
||||
arg.M * arg.N * sizeof(CDataType),
|
||||
stream_config.stream_id_));
|
||||
if(stream_config.flush_cache)
|
||||
{
|
||||
Argument arg_ = arg;
|
||||
ck::utility::RotatingMemWrapper<Argument> rotating_mem(
|
||||
arg_,
|
||||
stream_config.rotating_count,
|
||||
arg_.M * arg_.K * sizeof(ADataType),
|
||||
arg_.K * arg_.N * sizeof(BDataType));
|
||||
rotating_mem.Print();
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
|
||||
auto run_flush_cache = [&]() {
|
||||
// flush icache
|
||||
ck::utility::flush_icache();
|
||||
// rotating mem
|
||||
rotating_mem.Next();
|
||||
// clear c mem
|
||||
if(arg_.KBatch > 1)
|
||||
hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
|
||||
0,
|
||||
arg_.M * arg_.N * sizeof(CDataType),
|
||||
stream_config.stream_id_));
|
||||
};
|
||||
|
||||
ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
|
||||
stream_config,
|
||||
run_flush_cache,
|
||||
kernel,
|
||||
dim3(gdx, gdy, gdz),
|
||||
dim3(BlockSize),
|
||||
0,
|
||||
arg_);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(arg.KBatch > 1)
|
||||
hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
|
||||
0,
|
||||
arg.M * arg.N * sizeof(CDataType),
|
||||
stream_config.stream_id_));
|
||||
|
||||
ave_time = launch_and_time_kernel(
|
||||
stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
|
||||
}
|
||||
};
|
||||
|
||||
constexpr index_t minimum_occupancy =
|
||||
|
||||
30
include/ck/utility/flush_icache.hpp
Normal file
30
include/ck/utility/flush_icache.hpp
Normal file
@@ -0,0 +1,30 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck {
|
||||
// Device kernel whose only effect is executing s_icache_inv, invalidating
// the instruction cache of every compute unit it is scheduled on.
// NOTE(review): the trailing s_nop run appears to be pipeline padding so the
// invalidation completes before subsequent instruction fetches — confirm the
// required count against the target (GCN/CDNA) ISA manual.
static __global__ void flush_icache()
{
    asm __volatile__("s_icache_inv \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t"
                     "s_nop 0 \n\t" ::
                         :);
}
|
||||
} // namespace ck
|
||||
@@ -43,7 +43,8 @@ bool profile_gemm_universal_impl(int do_verification,
|
||||
int StrideC,
|
||||
int KBatch,
|
||||
int n_warmup,
|
||||
int n_iter)
|
||||
int n_iter,
|
||||
uint64_t rotating = 0)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
@@ -66,9 +67,16 @@ bool profile_gemm_universal_impl(int do_verification,
|
||||
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
|
||||
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
|
||||
|
||||
int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
|
||||
int rotating_count = std::max(
|
||||
1,
|
||||
std::min(n_iter,
|
||||
static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
|
||||
|
||||
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
|
||||
std::cout << "rotating count: " << rotating_count << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
@@ -200,8 +208,14 @@ bool profile_gemm_universal_impl(int do_verification,
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(
|
||||
argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(),
|
||||
StreamConfig{nullptr,
|
||||
time_kernel,
|
||||
0,
|
||||
n_warmup,
|
||||
n_iter,
|
||||
rotating_count > 1,
|
||||
rotating_count});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ enum struct GemmDataType
|
||||
|
||||
int profile_gemm_universal(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 15 && argc != 17)
|
||||
if(argc != 15 && argc != 18)
|
||||
{
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, "
|
||||
@@ -51,6 +51,7 @@ int profile_gemm_universal(int argc, char* argv[])
|
||||
printf("optional:\n");
|
||||
printf("arg15: number of warm-up cycles (default 1)\n");
|
||||
printf("arg16: number of iterations (default 10)\n");
|
||||
printf("arg17: memory for rotating buffer (default 0, size in MB)\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -70,12 +71,14 @@ int profile_gemm_universal(int argc, char* argv[])
|
||||
const int StrideC = std::stoi(argv[13]);
|
||||
const int KBatch = std::stoi(argv[14]);
|
||||
|
||||
int n_warmup = 1;
|
||||
int n_iter = 10;
|
||||
if(argc == 17)
|
||||
int n_warmup = 1;
|
||||
int n_iter = 10;
|
||||
uint64_t rotating = 0;
|
||||
if(argc == 18)
|
||||
{
|
||||
n_warmup = std::stoi(argv[15]);
|
||||
n_iter = std::stoi(argv[16]);
|
||||
rotating = std::stoull(argv[17]) * 1024 * 1024;
|
||||
}
|
||||
|
||||
using F32 = float;
|
||||
@@ -124,7 +127,8 @@ int profile_gemm_universal(int argc, char* argv[])
|
||||
(StrideC < 0) ? DefaultStrideC : StrideC,
|
||||
KBatch,
|
||||
n_warmup,
|
||||
n_iter);
|
||||
n_iter,
|
||||
rotating);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user