mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
* add add new algorithm from v4r4r2
* program once issue
* add split k function
* redefine code
* add a matrix unmerge
* add b matrix unmerge k0
* trans a and b to gridegemm
* nhwc init
* no hacks and vector load
* add hacks
* modify some parameter
* fix tuning parameter for fp32
* fix tuning parameter for fp16
* start change gridwise k split
* init ok
* remove a b matrix k0mk1 desc in grid
* rewrite calculate gridsize
* add kbatch to CalculateBottomIndex
* remove some unused function
* add clear data function before call kernel
* out hacks
* in hacks
* rename device convolution file and function name
* modify kBatch value
* fix some tuning code
* start from v4r4 nhwc
* nhwc atomic is able to run
* just for fp32
* enable nchw atomic
* tweak
* tweak
* re-arrange gridwise gemm hot loop for wrw
* add wrw v4r5
* v4r4r5 fp16
* v4r4r4 fp16
* v4r4r2 fp16
* V4R4R4XDLNHWC fp16
* V4R4R2XDLATOMICNCHW fp16
* adjust for fp16
* input gridsize
* change kbatch to gridsize
* testing wrw
* clean up
* k_batch to gridsize
* fix bug
* wrw v4r4r4 kbatch change to gride size
* wrw v4r4r2 kbatch change to gride size
* after merge , change gridwise gemm v2r4
* change MakeCBlockClusterAdaptor
* other method use new gridwise gemm
* clean up
* change pad method to make_right_pad_transform
* kbatch out from transform function
* clean up and fix bug
* fix bug
* using function type reduce template parameters
* using auto replace define fuction type
* clean up
Co-authored-by: ltqin <letaoqin@amd.com>
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: Jing Zhang <jizhan@amd.com>
[ROCm/composable_kernel commit: fd49ff8080]
85 lines
1.8 KiB
C++
#ifndef DEVICE_HPP
|
|
#define DEVICE_HPP
|
|
|
|
#include <memory>
|
|
#include <functional>
|
|
#include <thread>
|
|
#include <chrono>
|
|
#include "hip/hip_runtime.h"
|
|
#include "hip/hip_fp16.h"
|
|
|
|
// Owning wrapper around a raw device (GPU) memory allocation.
// Only declarations live here; definitions are in a separate .cpp
// (not visible in this header), so the exact allocation/copy calls
// cannot be confirmed from this file alone.
struct DeviceMem
{
    // A size must always be supplied — no default construction.
    DeviceMem() = delete;

    // Constructs the wrapper for an allocation of mem_size bytes.
    DeviceMem(std::size_t mem_size);

    // Raw pointer to the underlying device buffer.
    void* GetDeviceBuffer();

    // Copies host data at p into the device buffer.
    // NOTE(review): presumably copies mMemSize bytes — confirm in the .cpp.
    void ToDevice(const void* p);

    // Copies the device buffer back out to host memory at p.
    // NOTE(review): presumably copies mMemSize bytes — confirm in the .cpp.
    void FromDevice(void* p);

    // Releases the device allocation (implementation elsewhere).
    ~DeviceMem();

    void* mpDeviceBuf;    // pointer to the device-side allocation
    std::size_t mMemSize; // requested size in bytes
};
|
|
|
|
// Opaque implementation type (pimpl) — defined in a .cpp not visible here.
struct KernelTimerImpl;

// Timer for measuring kernel execution time. The pimpl idiom keeps the
// timing mechanism (presumably HIP events — confirm in the implementation)
// out of this header.
struct KernelTimer
{
    KernelTimer();
    ~KernelTimer();

    // Marks the beginning of the timed region.
    void Start();

    // Marks the end of the timed region.
    void End();

    // Elapsed time between Start() and End().
    // NOTE(review): unit is presumably milliseconds (the usual GPU-event
    // convention) — verify against KernelTimerImpl.
    float GetElapsedTime() const;

    std::unique_ptr<KernelTimerImpl> impl; // owned implementation object
};
|
|
|
|
using device_stream_t = hipStream_t;
|
|
|
|
template <typename... Args, typename F>
|
|
void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
|
{
|
|
hipStream_t stream_id = nullptr;
|
|
|
|
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
|
}
|
|
|
|
template <typename... Args, typename F>
|
|
float launch_and_time_kernel(
|
|
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
|
|
{
|
|
KernelTimer timer;
|
|
|
|
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
|
|
__func__,
|
|
grid_dim.x,
|
|
grid_dim.y,
|
|
grid_dim.z,
|
|
block_dim.x,
|
|
block_dim.y,
|
|
block_dim.z);
|
|
|
|
printf("Warm up\n");
|
|
|
|
hipStream_t stream_id = nullptr;
|
|
|
|
// warm up
|
|
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
|
|
|
printf("Start running %d times...\n", nrepeat);
|
|
|
|
timer.Start();
|
|
|
|
for(int i = 0; i < nrepeat; ++i)
|
|
{
|
|
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
|
|
}
|
|
|
|
timer.End();
|
|
|
|
// std::this_thread::sleep_for (std::chrono::microseconds(10));
|
|
|
|
return timer.GetElapsedTime() / nrepeat;
|
|
}
|
|
#endif
|