mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 12:59:49 +00:00
Fused GEMM+GEMM (#351)
* initial stub for gemm_gemm_xdl_cshuffle
* set up example code
* compiles
* prevent integer overflow
* harmonize interface between ref_gemm and ref_batched_gemm
* batched_gemm_gemm
* fix example
* host tensor gen: diagonal pattern in lowest two-dimensions only
* make c descriptors containing only integral constants
* clean up
* add BlockwiseGemmXdlops_v2 while exploring an unified approach
* implement proper interface
* tidy up example
* fix compilation warnings
* coarsely controlled 2nd gemm padding
* remove rocm-cmake's hard requirement for certain revision
* clang-format
* resolve merge conflict
* fix compilation error on gfx10
* adds acc0 elementwise op to interface
* add gemm_gemm instances and tests
* avoid LDS data hazard
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: c20a75b07d]
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
#ifndef CK_BLOCKWISE_GEMM_XDLOPS_B_REGISTER_HPP
|
||||
#define CK_BLOCKWISE_GEMM_XDLOPS_B_REGISTER_HPP
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
|
||||
@@ -317,4 +319,3 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user