mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
* initial stream-k implementation with example
* fix unexpected change in err
* improve performance a little by reorganizing the pipeline.
* improve perf a little bit by swizzle block idx
* add profiler
* update example
* fix spelling
* shrink karg for streamk
* support dynamic buffer using memory coherence glc_slc bit from template
* control memory coherence while constructing dynamic buffer
* update reduction for streamk(not ready yet)
* Add template parameter to make_dynamic_buffer to support amd_buffer coherence setting
* fix build issue
* fix several bugs
* now result is correct, everything works (but has scratch)
* remove scratch by manually reset coordinate
* update device code
* fix a bug in final reduce
* fix something in example
* update async memset
* fix enum as camel case
* modify coherence enum name
* clean code and use atomic streamk by default
* remove unused var
* throw an exception if a pointer is empty
* fix format
* fix CI warning
* fix type in init
* modify CI error
* filter out on gfx10+
* restore changed example code
---------
Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
[ROCm/composable_kernel commit: e7dca79d27]
145 lines
4.1 KiB
C++
145 lines
4.1 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <hip/hip_runtime.h>
|
|
|
|
#include "ck/ck.hpp"
|
|
#include "ck/stream_config.hpp"
|
|
#include "ck/host_utility/hip_check_error.hpp"
|
|
|
|
template <typename... Args, typename F>
|
|
float launch_and_time_kernel(const StreamConfig& stream_config,
|
|
F kernel,
|
|
dim3 grid_dim,
|
|
dim3 block_dim,
|
|
std::size_t lds_byte,
|
|
Args... args)
|
|
{
|
|
#if CK_TIME_KERNEL
|
|
if(stream_config.time_kernel_)
|
|
{
|
|
#if DEBUG_LOG
|
|
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
|
__func__,
|
|
grid_dim.x,
|
|
grid_dim.y,
|
|
grid_dim.z,
|
|
block_dim.x,
|
|
block_dim.y,
|
|
block_dim.z);
|
|
|
|
printf("Warm up 1 time\n");
|
|
#endif
|
|
// warm up
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
|
|
const int nrepeat = 10;
|
|
#if DEBUG_LOG
|
|
printf("Start running %d times...\n", nrepeat);
|
|
#endif
|
|
hipEvent_t start, stop;
|
|
|
|
hip_check_error(hipEventCreate(&start));
|
|
hip_check_error(hipEventCreate(&stop));
|
|
|
|
hip_check_error(hipDeviceSynchronize());
|
|
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
}
|
|
|
|
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
|
hip_check_error(hipEventSynchronize(stop));
|
|
|
|
float total_time = 0;
|
|
|
|
hip_check_error(hipEventElapsedTime(&total_time, start, stop));
|
|
|
|
return total_time / nrepeat;
|
|
}
|
|
else
|
|
{
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
|
|
return 0;
|
|
}
|
|
#else
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
template <typename... Args, typename F, typename PreProcessFunc>
|
|
float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
|
|
PreProcessFunc preprocess,
|
|
F kernel,
|
|
dim3 grid_dim,
|
|
dim3 block_dim,
|
|
std::size_t lds_byte,
|
|
Args... args)
|
|
{
|
|
#if CK_TIME_KERNEL
|
|
if(stream_config.time_kernel_)
|
|
{
|
|
#if DEBUG_LOG
|
|
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
|
__func__,
|
|
grid_dim.x,
|
|
grid_dim.y,
|
|
grid_dim.z,
|
|
block_dim.x,
|
|
block_dim.y,
|
|
block_dim.z);
|
|
|
|
printf("Warm up 1 time\n");
|
|
#endif
|
|
// warm up
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
|
|
const int nrepeat = 10;
|
|
#if DEBUG_LOG
|
|
printf("Start running %d times...\n", nrepeat);
|
|
#endif
|
|
hipEvent_t start, stop;
|
|
|
|
hip_check_error(hipEventCreate(&start));
|
|
hip_check_error(hipEventCreate(&stop));
|
|
|
|
hip_check_error(hipDeviceSynchronize());
|
|
hip_check_error(hipEventRecord(start, stream_config.stream_id_));
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
}
|
|
|
|
hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
|
|
hip_check_error(hipEventSynchronize(stop));
|
|
|
|
float total_time = 0;
|
|
|
|
hip_check_error(hipEventElapsedTime(&total_time, start, stop));
|
|
|
|
return total_time / nrepeat;
|
|
}
|
|
else
|
|
{
|
|
preprocess();
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
|
|
return 0;
|
|
}
|
|
#else
|
|
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
|
|
|
|
return 0;
|
|
#endif
|
|
}
|