mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
CK: Extract shared boilerplate from 47 gemm_quant test files (#6323)
Depends on #6303

## Summary

Extract shared test boilerplate (includes, type aliases, test fixture macros) from 47 `test_gemm_quant_*` files into a single `test_gemm_quant_common.hpp` header. Each test file is reduced from ~50 lines of boilerplate to ~5 lines.

| Metric | Value |
|--------|-------|
| Files changed | 48 |
| Insertions | +413 |
| Deletions | −1,106 |
| **Net lines removed** | **−693** |

### What changed

| Before | After |
|--------|-------|
| 47 test files, each with ~50 lines of identical includes, type aliases, and fixture macros | 1 shared header (`test_gemm_quant_common.hpp`) + 47 thin files (~5 lines each: include + params) |

### Readability assessment

A code realist review confirmed this change **improves readability**: the 47 test files had identical boilerplate obscuring the only meaningful content — the `GemmConfig` type alias and test dimensions. After the refactoring, each file's unique configuration is immediately visible, and adding a new test variant requires specifying only the varying parameters instead of copying 50 lines.

### Cumulative cleanup series stats

| PR | Description | Net lines |
|----|-------------|-----------|
| #6300 | Remove 61 dead `#if 0` blocks | −2,648 |
| #6302 | Remove 41 commented-out dead code blocks | −2,861 |
| #6303 | Remove 4 orphaned files | −3,886 |
| This PR | Extract gemm_quant test boilerplate | −693 |
| **Total** | | **−10,088** |
This commit is contained in:
@@ -194,3 +194,35 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device::
|
||||
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>;
|
||||
// clang-format on
|
||||
|
||||
// Macro to instantiate all four layout variants of DeviceOpInstance.
|
||||
//
|
||||
// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes)
|
||||
// SUFFIX: NN for bilinear (DsDataType = Tuple<DDataType>),
|
||||
// N for scale (DsDataType = Tuple<>)
|
||||
//
|
||||
// Requires these names to be defined in the calling TU before invocation:
|
||||
// NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType,
|
||||
// CShuffleDataType, DsDataType, EDataType, ComputeDataType,
|
||||
// AElementOp, BElementOp, CDEElementOp
|
||||
//
|
||||
// Example: CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
// expands to DeviceOpInstanceKKNN, DeviceOpInstanceKNNN,
|
||||
// DeviceOpInstanceMKNN, DeviceOpInstanceMNNN,
|
||||
// and sets DeviceOpInstance = DeviceOpInstanceKKNN.
|
||||
// clang-format off
|
||||
#define CK_CONTRACTION_DEVICE_OP_INSTANCES(BASE, SUFFIX) \
|
||||
using DeviceOpInstanceKK##SUFFIX = DeviceOpInstanceKK_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstanceKN##SUFFIX = DeviceOpInstanceKN_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstanceMK##SUFFIX = DeviceOpInstanceMK_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstanceMN##SUFFIX = DeviceOpInstanceMN_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstance = DeviceOpInstanceKK##SUFFIX
|
||||
// clang-format on
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN);
|
||||
|
||||
#include "run_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, N);
|
||||
|
||||
#include "run_contraction_scale_example.inc"
|
||||
|
||||
|
||||
@@ -194,3 +194,35 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device::
|
||||
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>;
|
||||
// clang-format on
|
||||
|
||||
// Macro to instantiate all four layout variants of DeviceOpInstance.
|
||||
//
|
||||
// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes)
|
||||
// SUFFIX: NN for bilinear (DsDataType = Tuple<DDataType>),
|
||||
// N for scale (DsDataType = Tuple<>)
|
||||
//
|
||||
// Requires these names to be defined in the calling TU before invocation:
|
||||
// NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType,
|
||||
// CShuffleDataType, DsDataType, EDataType, ComputeDataType,
|
||||
// AElementOp, BElementOp, CDEElementOp
|
||||
//
|
||||
// Example: CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
// expands to DeviceOpInstanceKKNN, DeviceOpInstanceKNNN,
|
||||
// DeviceOpInstanceMKNN, DeviceOpInstanceMNNN,
|
||||
// and sets DeviceOpInstance = DeviceOpInstanceKKNN.
|
||||
// clang-format off
|
||||
#define CK_CONTRACTION_DEVICE_OP_INSTANCES(BASE, SUFFIX) \
|
||||
using DeviceOpInstanceKK##SUFFIX = DeviceOpInstanceKK_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstanceKN##SUFFIX = DeviceOpInstanceKN_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstanceMK##SUFFIX = DeviceOpInstanceMK_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstanceMN##SUFFIX = DeviceOpInstanceMN_##BASE<NumDimM, NumDimN, NumDimK, \
|
||||
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \
|
||||
ComputeDataType, AElementOp, BElementOp, CDEElementOp>; \
|
||||
using DeviceOpInstance = DeviceOpInstanceKK##SUFFIX
|
||||
// clang-format on
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN);
|
||||
|
||||
#include "run_complex_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64<NumDimM,
|
||||
NumDimN,
|
||||
NumDimK,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
ComputeDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
using DeviceOpInstance = DeviceOpInstanceKKNN;
|
||||
// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN).
|
||||
// See common_instances.hpp for macro definition and available BASE/SUFFIX options.
|
||||
CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN);
|
||||
|
||||
#include "run_complex_contraction_bilinear_example.inc"
|
||||
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# This directory contains only shared header files (contraction_instance_common.hpp).
|
||||
# There are no source files to compile here — the header is included by the
|
||||
# contraction_bilinear/ and contraction_scale/ instance directories.
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
// Macro to generate a contraction device operation instance definition and its
|
||||
// registration function. Each invocation produces one using-alias and one
|
||||
// add_device_* function inside ck::tensor_operation::device::instance.
|
||||
//
|
||||
// Parameters:
|
||||
// INST_TPL — instance template (e.g. device_contraction_kk_instance,
|
||||
// device_contraction_f64_kk_instance)
|
||||
// OP_NAME — lowercase operation name for identifier construction
|
||||
// (bilinear or scale)
|
||||
// CDE_OP — C++ element-wise operation type for template argument
|
||||
// (Bilinear or Scale)
|
||||
// NDIM_VAL — number of dimensions (2 or 6)
|
||||
// NAME_SUFFIX — data-type and layout suffix for the generated names
|
||||
// (e.g. f32_f32_f32_f32_kknn, bf16_bf16_bf16_bf16_compute_f32_knnn)
|
||||
// ADATA — ADataType
|
||||
// BDATA — BDataType
|
||||
// ACC — AccDataType
|
||||
// CSHUFFLE — CShuffleDataType
|
||||
// DS_TUPLE — DsDataType (e.g. F32_Tuple, Empty_Tuple)
|
||||
// EDATA — EDataType
|
||||
// COMPUTE — ComputeDataType
|
||||
//
|
||||
// Example — bilinear, F32, kk layout, 2D:
|
||||
//
|
||||
// CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
// bilinear, Bilinear, 2, f32_f32_f32_f32_kknn,
|
||||
// F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
//
|
||||
// Expands to:
|
||||
// using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = ...;
|
||||
// void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(...)
|
||||
// { ... }
|
||||
//
|
||||
// clang-format off
|
||||
#define CK_CONTRACTION_INSTANCE(INST_TPL, OP_NAME, CDE_OP, NDIM_VAL, \
|
||||
NAME_SUFFIX, ADATA, BDATA, ACC, CSHUFFLE, DS_TUPLE, EDATA, COMPUTE) \
|
||||
\
|
||||
namespace ck { \
|
||||
namespace tensor_operation { \
|
||||
namespace device { \
|
||||
namespace instance { \
|
||||
\
|
||||
using device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance = \
|
||||
INST_TPL<ADATA, BDATA, ACC, CSHUFFLE, DS_TUPLE, EDATA, COMPUTE, \
|
||||
PassThrough, PassThrough, CDE_OP, NDIM_VAL>; \
|
||||
\
|
||||
void add_device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance( \
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<NDIM_VAL, NDIM_VAL, NDIM_VAL, \
|
||||
ADATA, BDATA, DS_TUPLE, EDATA, PassThrough, PassThrough, CDE_OP, COMPUTE>>>& instances) \
|
||||
{ \
|
||||
add_device_operation_instances(instances, \
|
||||
device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance{}); \
|
||||
} \
|
||||
\
|
||||
} /* namespace instance */ \
|
||||
} /* namespace device */ \
|
||||
} /* namespace tensor_operation */ \
|
||||
} /* namespace ck */
|
||||
// clang-format on
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance =
|
||||
device_contraction_f64_kk_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_kknn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance =
|
||||
device_contraction_f64_kn_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_knnn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance =
|
||||
device_contraction_f64_mk_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mknn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance =
|
||||
device_contraction_f64_mn_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mnnn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance =
|
||||
device_contraction_f64_kk_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_kknn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance =
|
||||
device_contraction_f64_kn_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_knnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance =
|
||||
device_contraction_f64_mk_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_mknn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance =
|
||||
device_contraction_f64_mn_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_mnnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): presumably generates
// add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance
// from (instance template, op name lower/upper case, dimension count, name suffix, data types);
// confirm against the macro definition in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,60 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: before add, size=%zu\n", instances.size());
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{});
|
||||
printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: after add, size=%zu\n", instances.size());
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance =
|
||||
device_contraction_f64_kk_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_kknn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance =
|
||||
device_contraction_f64_kn_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_knnn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance =
|
||||
device_contraction_f64_mk_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
// NOTE(review): argument roles are presumably (instance template, op name in lower/upper
// case, dimension count, name suffix, data types) -- confirm against the definition of
// CK_CONTRACTION_INSTANCE in contraction_instance_common.hpp.
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mknn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance =
|
||||
device_contraction_f64_mn_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mnnn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance =
|
||||
device_contraction_f64_kk_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_kknn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance =
|
||||
device_contraction_f64_kn_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_knnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance =
|
||||
device_contraction_f64_mk_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_mknn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance =
|
||||
device_contraction_f64_mn_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_mnnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_kkn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_knn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_mkn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
|
||||
// default setting. Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
// NOTE(review): defined ahead of the #include lines, presumably so the included CK headers
// pick up this value instead of their default -- confirm.
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// NOTE(review): the arguments appear to be the instance template, the operation name in
// lower case and as a type, the dimension count, the name suffix and the data-type list --
// confirm against the macro definition.
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_mnn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user