mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 14:11:29 +00:00
* Add GEMM examples for int4 Currently the source files are just copied from int8 examples * Re-use pre-defined alias in int4 exmples * Distinguish user-side type from kernel-side type * Add int4_t support for check_err() * Allow conversion between Tensor<> specializations * Re-format source files * Use different type for host tensors * Re-use CopyAsType<>() to implement copy ctor * Re-use element-wise operation type alias * Fix typo in alias names * Complete the int4 examples * Add constraint to Tensor<> templated methods * Add type traits 'is_signed_integral<>' * Add type constraints for integer version check_err<>() * Allow comparing different-sized integral types in check_err() * Check converted Tensor<int4_t> with golden Tensor<int8_t> * Remove constraint of Tensor<>::CopyAsType() * Avoid compilation error while disabling ck::int4_t support * Remove debug messages * Add #error directive to prevent compile sources with wrong setting * Simplify tensor usages in examples * Add constraint to check_err() input reference type * Align design with other PR * Use ""_uz to simplify example code * Avoid too much generalizing check_err() * Re-format GEMM instance template arguments * Extract int4 example common codes * Sort include directives * Move #include directives into new header * Move common codes together * Re-format template argument in example code * Reuse same implementation code for most of GEMM examples * Re-format common.hpp * Unify structured comment in examples * Use reinterpret_cast<>() for cross-type pointer conversion * Revert "Add type traits 'is_signed_integral<>'" This reverts commitf2c148efae. * Allow unsigned integer arguments for check_err() * Fix compilation error in check_err() * Remove unnecessary copy ctor for Tensor<> * Mark Tensor<> special member functions as 'default' * Use more strict condition to add code in examples * Fix wrong program return value of GEMM examples * Handle the case while user specify all the strides * Fix never-ran examples * Exit successfully if GEMM instance does not support given problem * Add missing 'else' keyword * Re-format CMakeLists.txt * Add wrapper function to hide value conversion while copying memory * Add new DeviceMem API to copy memory * Use new DeviceMem API to implement examples * Revert "Add new DeviceMem API to copy memory" This reverts commit3f190b0779. * Add conversion ctor for Tensor<> * Write Tensor<> conversion logics explicitly in example code * Convert Tensor<> values after transfer data to host
47 lines
3.9 KiB
C++
47 lines
3.9 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
|
#error Should compile this file with ck::int4_t support
|
|
#endif
|
|
|
|
#include "common.hpp"
|
|
|
|
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
|
|
|
|
using ADataType = ck::int4_t;
|
|
using BDataType = ck::int4_t;
|
|
using CDataType = ck::int4_t;
|
|
using KernelADataType = int8_t;
|
|
using KernelBDataType = int8_t;
|
|
using KernelCDataType = int8_t;
|
|
using AccDataType = int32_t;
|
|
using CShuffleDataType = int8_t;
|
|
|
|
using ALayout = Row;
|
|
using BLayout = Col;
|
|
using CLayout = Row;
|
|
|
|
using AElementOp = PassThrough;
|
|
using BElementOp = PassThrough;
|
|
using CElementOp = PassThrough;
|
|
|
|
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
|
|
|
// clang-format off
|
|
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
|
|
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
|
|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
|
|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
|
|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
|
< ALayout, BLayout, CLayout, KernelADataType, KernelBDataType, KernelCDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
|
|
// clang-format on
|
|
|
|
using ReferenceGemmInstance = ck::tensor_operation::host::
|
|
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
|
|
|
|
#define BUILD_INT4_EXAMPLE
|
|
#include "run_gemm_example.inc"
|
|
|
|
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
|