initial stream-k implementation with example (#699)

* initial stream-k implementation with example

* fix unexpected change in err

* improve a little bit performance by reorganize pipeline.

* improve perf a little bit by swizzle block idx

* add profiler

* update example

* fix spelling

* shrink karg for streamk

* support dynamic buffer using memory coherence glc_slc bit from template

* control memory coherence while construct dynamic buffer

* update reduction for streamk(not ready yet)

* Add template parameter to make_dynamic_buffer to support amd_buffer coherence setting

* fix build issue

* fix several bug

* now result is correct, everything works (but has scratch)

* remove scratch by manually reset coordinate

* update device code

* fix a bug in final reduce

* fix something in example

* update async memset

* fix enum as camel case

* modify coherence enum name

* clean code and use atomic streamk by default

* remove unused var

* throw exception if have empty pointer

* fix format

* fix CI warning

* fix type in init

* modify CI error

* filter out on gfx10+

* restore changed example code

---------

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
This commit is contained in:
carlushuang
2023-07-27 03:18:15 +08:00
committed by GitHub
parent 9195435c77
commit e7dca79d27
28 changed files with 4234 additions and 36 deletions

View File

@@ -10,20 +10,57 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void DeviceMem::Realloc(std::size_t mem_size)
{
if(mpDeviceBuf)
{
hip_check_error(hipFree(mpDeviceBuf));
}
mMemSize = mem_size;
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
std::size_t DeviceMem::GetBufferSize() const { return mMemSize; }
void DeviceMem::ToDevice(const void* p) const
{
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
if(mpDeviceBuf)
{
hip_check_error(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
else
{
throw std::runtime_error("ToDevice with an empty pointer");
}
}
void DeviceMem::FromDevice(void* p) const
{
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
if(mpDeviceBuf)
{
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
else
{
throw std::runtime_error("FromDevice with an empty pointer");
}
}
void DeviceMem::SetZero() const { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
void DeviceMem::SetZero() const
{
if(mpDeviceBuf)
{
hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize));
}
}
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
DeviceMem::~DeviceMem()
{
if(mpDeviceBuf)
{
hip_check_error(hipFree(mpDeviceBuf));
}
}