mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 20:51:23 +00:00
Post-merge changes for fully async args copy in ck grouped gemm (#1991)
* Post-merge changes for fully async args copy in ck grouped gemm * Post-merge documentation and naming changes * Build fix and updated changelog * Revised comments
This commit is contained in:
@@ -607,6 +607,9 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
|
||||
}
|
||||
}
|
||||
|
||||
// If the user provides copy stream and copy event, we assume that they're also
|
||||
// responsible for providing allocated host memory (eg. pinned) which
|
||||
// would be used to copy kernel arguments to the device.
|
||||
if(cpy_stream && cpy_event)
|
||||
{
|
||||
if(arg.gemm_kernel_host_args_ == nullptr)
|
||||
@@ -625,7 +628,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
|
||||
hipGetErrorString(hipEventRecord(cpy_event, cpy_stream));
|
||||
hipGetErrorString(hipEventSynchronize(cpy_event));
|
||||
}
|
||||
else
|
||||
else // In this case CK owns memory allocated on host.
|
||||
{
|
||||
hipGetErrorString(
|
||||
hipMemcpyAsync(arg.p_workspace_,
|
||||
@@ -801,7 +804,15 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
|
||||
return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
|
||||
}
|
||||
|
||||
void SetHostKernelArgs(BaseArgument* p_arg, void* p_host_kernel_args) const
|
||||
//----------------------------------------------------------------------------------------------
|
||||
/// @brief Sets the host kernel arguments pointer and copies that data on the host side.
|
||||
/// This function can be utilised to use pinned memory for the host args and
|
||||
/// achieve fully async data copy.
|
||||
///
|
||||
/// @param p_arg The pointer to the Argument we're going to update.
|
||||
/// @param[in] p_host_kernel_args The pointer to the host memory where the kernel
|
||||
/// arguments will be copied
|
||||
void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const
|
||||
{
|
||||
Argument* pArg_ = dynamic_cast<Argument*>(p_arg);
|
||||
if(!pArg_)
|
||||
|
||||
@@ -560,6 +560,9 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
|
||||
}
|
||||
}
|
||||
|
||||
// If the user provides copy stream and copy event, we assume that they're also
|
||||
// responsible for providing allocated host memory (eg. pinned) which
|
||||
// would be used to copy kernel arguments to the device.
|
||||
if(cpy_stream && cpy_event)
|
||||
{
|
||||
if(arg.gemm_kernel_host_args_ == nullptr)
|
||||
@@ -578,7 +581,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
|
||||
hipGetErrorString(hipEventRecord(cpy_event, cpy_stream));
|
||||
hipGetErrorString(hipEventSynchronize(cpy_event));
|
||||
}
|
||||
else
|
||||
else // In this case CK owns memory allocated on host.
|
||||
{
|
||||
hipGetErrorString(hipMemcpyAsync(arg.p_workspace_,
|
||||
arg.gemm_desc_kernel_arg_.data(),
|
||||
@@ -763,7 +766,16 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
|
||||
|
||||
size_t GetHostKernelArgSize(const BaseArgument* p_arg) const { return GetWorkSpaceSize(p_arg); }
|
||||
|
||||
void SetHostKernelArgs(BaseArgument* p_arg, void* p_host_kernel_args) const
|
||||
//----------------------------------------------------------------------------------------------
|
||||
/// @brief Sets the host kernel arguments pointer and copies that data on the host side.
|
||||
/// This function can be utilised to use pinned memory for the host args and
|
||||
/// achieve fully async data copy.
|
||||
///
|
||||
/// @param p_arg The pointer to the Argument we're going to update.
|
||||
/// @param[in] p_host_kernel_args The pointer to the host memory where the kernel
|
||||
/// arguments will be copied
|
||||
///
|
||||
void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const
|
||||
{
|
||||
Argument* pArg_ = dynamic_cast<Argument*>(p_arg);
|
||||
if(!pArg_)
|
||||
|
||||
@@ -423,6 +423,9 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
|
||||
}
|
||||
}
|
||||
|
||||
// If the user provides copy stream and copy event, we assume that they're also
|
||||
// responsible for providing allocated host memory (eg. pinned) which
|
||||
// would be used to copy kernel arguments to the device.
|
||||
if(cpy_stream && cpy_event)
|
||||
{
|
||||
if(arg.gemm_kernel_host_args_ == nullptr)
|
||||
@@ -441,7 +444,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
|
||||
hip_check_error(hipEventRecord(cpy_event, cpy_stream));
|
||||
hip_check_error(hipEventSynchronize(cpy_event));
|
||||
}
|
||||
else
|
||||
else // In this case CK owns memory allocated on host.
|
||||
{
|
||||
|
||||
hip_check_error(
|
||||
@@ -702,7 +705,16 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
|
||||
return this->SetWorkSpacePointer(p_arg, p_dev_kernel_args);
|
||||
}
|
||||
|
||||
void SetHostKernelArgs(BaseArgument* p_arg, void* p_host_kernel_args) const
|
||||
//----------------------------------------------------------------------------------------------
|
||||
/// @brief Sets the host kernel arguments pointer and copies that data on the host side.
|
||||
/// This function can be utilised to use pinned memory for the host args and
|
||||
/// achieve fully async data copy.
|
||||
///
|
||||
/// @param p_arg The pointer to the Argument we're going to update.
|
||||
/// @param[in] p_host_kernel_args The pointer to the host memory where the kernel
|
||||
/// arguments will be copied
|
||||
///
|
||||
void SetHostKernelArgsPointer(BaseArgument* p_arg, void* p_host_kernel_args) const
|
||||
{
|
||||
Argument* pArg_ = dynamic_cast<Argument*>(p_arg);
|
||||
if(!pArg_)
|
||||
|
||||
Reference in New Issue
Block a user