From bfe574a43071fc2679152822911fa5751955de7a Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Sat, 11 Apr 2026 06:00:26 -0400 Subject: [PATCH] CK: Extract shared boilerplate from 47 gemm_quant test files (#6323) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depends on #6303 ## Summary Extract shared test boilerplate (includes, type aliases, test fixture macros) from 47 `test_gemm_quant_*` files into a single `test_gemm_quant_common.hpp` header. Each test file is reduced from ~50 lines of boilerplate to ~5 lines. | Metric | Value | |--------|-------| | Files changed | 48 | | Insertions | +413 | | Deletions | −1,106 | | **Net lines removed** | **−693** | ### What changed | Before | After | |--------|-------| | 47 test files, each with ~50 lines of identical includes, type aliases, and fixture macros | 1 shared header (`test_gemm_quant_common.hpp`) + 47 thin files (~5 lines each: include + params) | ### Readability assessment A code realist review confirmed this change **improves readability**: the 47 test files had identical boilerplate obscuring the only meaningful content — the `GemmConfig` type alias and test dimensions. After the refactoring, each file's unique configuration is immediately visible, and adding a new test variant requires specifying only the varying parameters instead of copying 50 lines. 
### Cumulative cleanup series stats | PR | Description | Net lines | |----|-------------|-----------| | #6300 | Remove 61 dead `#if 0` blocks | −2,648 | | #6302 | Remove 41 commented-out dead code blocks | −2,861 | | #6303 | Remove 4 orphaned files | −3,886 | | This PR | Extract gemm_quant test boilerplate | −693 | | **Total** | | **−10,088** | --- example/26_contraction/common_instances.hpp | 32 +++++ .../contraction_bilinear_xdl_bf16.cpp | 60 +--------- ...raction_bilinear_xdl_bf16_compute_fp32.cpp | 60 +--------- .../contraction_bilinear_xdl_fp16.cpp | 60 +--------- ...raction_bilinear_xdl_fp16_compute_fp32.cpp | 60 +--------- .../contraction_bilinear_xdl_fp32.cpp | 60 +--------- ...raction_bilinear_xdl_fp32_compute_bf16.cpp | 60 +--------- ...raction_bilinear_xdl_fp32_compute_fp16.cpp | 60 +--------- .../contraction_bilinear_xdl_fp64.cpp | 60 +--------- ...raction_bilinear_xdl_fp64_compute_fp32.cpp | 60 +--------- .../contraction_scale_xdl_bf16.cpp | 60 +--------- ...ontraction_scale_xdl_bf16_compute_fp32.cpp | 60 +--------- .../contraction_scale_xdl_fp16.cpp | 60 +--------- ...ontraction_scale_xdl_fp16_compute_fp32.cpp | 60 +--------- .../contraction_scale_xdl_fp32.cpp | 60 +--------- ...ontraction_scale_xdl_fp32_compute_bf16.cpp | 60 +--------- ...ontraction_scale_xdl_fp32_compute_fp16.cpp | 60 +--------- .../contraction_scale_xdl_fp64.cpp | 60 +--------- ...ontraction_scale_xdl_fp64_compute_fp32.cpp | 60 +--------- .../common_instances.hpp | 32 +++++ .../complex_contraction_bilinear_xdl_fp32.cpp | 60 +--------- .../complex_contraction_bilinear_xdl_fp64.cpp | 60 +--------- .../gpu/contraction/CMakeLists.txt | 6 + .../contraction_instance_common.hpp | 77 ++++++++++++ ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 62 ++-------- ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 62 ++-------- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 62 ++-------- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_kknn_instance.cpp | 62 
++-------- ...ffle_bf16_bf16_bf16_bf16_knnn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_mknn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_mknn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_kknn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_knnn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_mknn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_mnnn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 62 ++-------- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 62 ++-------- 
...16_bf16_bf16_compute_f32_knnn_instance.cpp | 62 ++-------- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 62 ++-------- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_kknn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_knnn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_mknn_instance.cpp | 62 ++-------- ...ffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_mknn_instance.cpp | 62 ++-------- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_kknn_instance.cpp | 64 ++-------- ..._shuffle_f16_f16_f16_f16_knnn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_mknn_instance.cpp | 62 ++-------- ..._shuffle_f16_f16_f16_f16_mnnn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 62 ++-------- ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 62 ++-------- ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 62 ++-------- ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 62 ++-------- 
..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 62 ++-------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 62 ++-------- ..._c_shuffle_bf16_bf16_bf16_kkn_instance.cpp | 61 ++-------- ..._c_shuffle_bf16_bf16_bf16_knn_instance.cpp | 61 ++-------- ..._c_shuffle_bf16_bf16_bf16_mkn_instance.cpp | 61 ++-------- ..._c_shuffle_bf16_bf16_bf16_mnn_instance.cpp | 61 ++-------- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 62 ++-------- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 62 ++-------- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 62 ++-------- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 62 ++-------- ...xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f16_f16_f16_knn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp | 61 ++-------- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 62 ++-------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 61 ++-------- ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 62 ++-------- 
...e_f64_f64_f64_compute_f32_knn_instance.cpp | 62 ++-------- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 62 ++-------- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 62 ++-------- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 61 ++-------- ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 62 ++-------- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 62 ++-------- ..._c_shuffle_bf16_bf16_bf16_kkn_instance.cpp | 61 ++-------- ..._c_shuffle_bf16_bf16_bf16_knn_instance.cpp | 61 ++-------- ..._c_shuffle_bf16_bf16_bf16_mkn_instance.cpp | 61 ++-------- ..._c_shuffle_bf16_bf16_bf16_mnn_instance.cpp | 61 ++-------- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 62 ++-------- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 62 ++-------- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 62 ++-------- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 62 ++-------- ...xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f16_f16_f16_knn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp | 61 ++-------- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 62 ++-------- ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 62 ++-------- ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 62 ++-------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 61 ++-------- 
...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 61 ++-------- ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 62 ++-------- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 62 ++-------- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 62 ++-------- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 62 ++-------- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 61 ++-------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 61 ++-------- .../test_gemm_quant_abquant_a4w4_base.cpp | 68 ++++------- .../test_gemm_quant_abquant_a4w4_padding.cpp | 110 +++++++----------- ...est_gemm_quant_abquant_a4w4_preshuffle.cpp | 68 ++++------- .../test_gemm_quant_abquant_base.cpp | 94 ++++++--------- .../test_gemm_quant_abquant_eightwaves.cpp | 72 +++++------- .../test_gemm_quant_abquant_padding.cpp | 61 ++++------ ...est_gemm_quant_abquant_preshuffleQuant.cpp | 68 ++++------- .../test_gemm_quant_abquant_preshuffle_2d.cpp | 76 +++++------- ...ant_abquant_preshuffle_preshuffleQuant.cpp | 68 ++++------- .../test_gemm_quant_abquant_splitk_decode.cpp | 16 +-- ...test_gemm_quant_abquant_splitk_prefill.cpp | 16 +-- .../test_gemm_quant_aquant_base_ccr.cpp | 24 +--- .../test_gemm_quant_aquant_base_rcr.cpp | 24 +--- .../test_gemm_quant_aquant_base_rrr_crr.cpp | 28 ++--- ...gemm_quant_aquant_mem_decode_interwave.cpp | 24 +--- ...gemm_quant_aquant_mem_decode_intrawave.cpp | 24 +--- ...emm_quant_aquant_mem_prefill_interwave.cpp | 24 +--- .../test_gemm_quant_aquant_prefill.cpp | 22 +--- .../test_gemm_quant_aquant_preshuffle.cpp | 32 ++--- .../test_gemm_quant_aquant_transpose_c.cpp | 20 +--- .../test_gemm_quant_bquant_1d_128.cpp | 24 +--- .../test_gemm_quant_bquant_1d_64.cpp | 18 +-- .../test_gemm_quant_bquant_2d_large_n.cpp | 16 +-- 
.../test_gemm_quant_bquant_2d_medium_n.cpp | 18 +-- .../test_gemm_quant_bquant_2d_small_n.cpp | 18 +-- ...emm_quant_bquant_microscale_ccr_1d_128.cpp | 18 +-- ...gemm_quant_bquant_microscale_ccr_1d_64.cpp | 22 +--- ...emm_quant_bquant_microscale_crr_1d_128.cpp | 18 +-- ...gemm_quant_bquant_microscale_crr_1d_64.cpp | 18 +-- ...emm_quant_bquant_microscale_rcr_1d_128.cpp | 20 +--- ...gemm_quant_bquant_microscale_rcr_1d_64.cpp | 20 +--- ...emm_quant_bquant_microscale_rrr_1d_128.cpp | 18 +-- ...gemm_quant_bquant_microscale_rrr_1d_64.cpp | 18 +-- ...quant_bquant_preshuffleQuant_decode_1d.cpp | 22 +--- ...quant_bquant_preshuffleQuant_decode_2d.cpp | 18 +-- ...uant_bquant_preshuffleQuant_prefill_1d.cpp | 26 +---- ...uant_bquant_preshuffleQuant_prefill_2d.cpp | 18 +-- ...gemm_quant_bquant_preshuffle_decode_1d.cpp | 22 +--- ...gemm_quant_bquant_preshuffle_decode_2d.cpp | 18 +-- ...emm_quant_bquant_preshuffle_prefill_1d.cpp | 26 +---- ...emm_quant_bquant_preshuffle_prefill_2d.cpp | 18 +-- ..._quant_bquant_preshuffle_tiled_permute.cpp | 24 +--- .../test_gemm_quant_bquant_splitk_decode.cpp | 18 +-- .../test_gemm_quant_bquant_splitk_prefill.cpp | 18 +-- .../test_gemm_quant_bquant_transpose.cpp | 18 +-- .../test_gemm_quant_common.hpp | 40 +++++++ .../test_gemm_quant_rowcol.cpp | 21 +--- .../test_gemm_quant_tensor.cpp | 21 +--- 216 files changed, 1769 insertions(+), 9989 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp create mode 100644 test/ck_tile/gemm_block_scale/test_gemm_quant_common.hpp diff --git a/example/26_contraction/common_instances.hpp b/example/26_contraction/common_instances.hpp index 457bae21aa..808c548042 100644 --- a/example/26_contraction/common_instances.hpp +++ b/example/26_contraction/common_instances.hpp @@ -194,3 +194,35 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device:: 
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>; // clang-format on + +// Macro to instantiate all four layout variants of DeviceOpInstance. +// +// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes) +// SUFFIX: NN for bilinear (DsDataType = Tuple), +// N for scale (DsDataType = Tuple<>) +// +// Requires these names to be defined in the calling TU before invocation: +// NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, +// CShuffleDataType, DsDataType, EDataType, ComputeDataType, +// AElementOp, BElementOp, CDEElementOp +// +// Example: CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); +// expands to DeviceOpInstanceKKNN, DeviceOpInstanceKNNN, +// DeviceOpInstanceMKNN, DeviceOpInstanceMNNN, +// and sets DeviceOpInstance = DeviceOpInstanceKKNN. 
+// clang-format off +#define CK_CONTRACTION_DEVICE_OP_INSTANCES(BASE, SUFFIX) \ + using DeviceOpInstanceKK##SUFFIX = DeviceOpInstanceKK_##BASE; \ + using DeviceOpInstanceKN##SUFFIX = DeviceOpInstanceKN_##BASE; \ + using DeviceOpInstanceMK##SUFFIX = DeviceOpInstanceMK_##BASE; \ + using DeviceOpInstanceMN##SUFFIX = DeviceOpInstanceMN_##BASE; \ + using DeviceOpInstance = DeviceOpInstanceKK##SUFFIX +// clang-format on diff --git a/example/26_contraction/contraction_bilinear_xdl_bf16.cpp b/example/26_contraction/contraction_bilinear_xdl_bf16.cpp index 8899b54fbf..b5758ed428 100644 --- a/example/26_contraction/contraction_bilinear_xdl_bf16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_bf16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp index 2dac449e99..be03613bd1 100644 --- a/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_bf16_compute_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp16.cpp b/example/26_contraction/contraction_bilinear_xdl_fp16.cpp index 16e33e0886..5d6d401836 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp index 494670bcca..ded63dec25 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp16_compute_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp index e960199fc3..8779e1fab9 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp index 2963152eb1..467672986e 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_bf16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp index 01966960cc..dff5a0446a 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp32_compute_fp16.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp index 1ea9bcedfd..2d697f3e07 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp64.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp64.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp index 9e40e28485..341dad6d5b 100644 --- a/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp +++ b/example/26_contraction/contraction_bilinear_xdl_fp64_compute_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN); #include "run_contraction_bilinear_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_bf16.cpp b/example/26_contraction/contraction_scale_xdl_bf16.cpp index 586b022397..003bc0274a 100644 --- a/example/26_contraction/contraction_scale_xdl_bf16.cpp +++ b/example/26_contraction/contraction_scale_xdl_bf16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp index 9e4a02967a..bada39204e 100644 --- a/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_bf16_compute_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp16.cpp b/example/26_contraction/contraction_scale_xdl_fp16.cpp index 1f29e16223..4f3adef47a 100644 --- a/example/26_contraction/contraction_scale_xdl_fp16.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp index 878011afd1..9be3b616f6 100644 --- a/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp16_compute_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp index 5d8aa7b9c5..d7754ef546 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp index 57b1052a83..deaf7e7bdc 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_bf16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp index ae23986bc9..de52096712 100644 --- a/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp32_compute_fp16.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp64.cpp b/example/26_contraction/contraction_scale_xdl_fp64.cpp index 66f22ce63c..3d5d23968f 100644 --- a/example/26_contraction/contraction_scale_xdl_fp64.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp64.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, N); #include "run_contraction_scale_example.inc" diff --git a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp index 2d72be8157..ee2533ca0a 100644 --- a/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp +++ b/example/26_contraction/contraction_scale_xdl_fp64_compute_fp32.cpp @@ -22,63 +22,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Scale; -using DeviceOpInstanceKKN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, N); #include "run_contraction_scale_example.inc" diff --git a/example/66_complex_contraction_bilinear/common_instances.hpp b/example/66_complex_contraction_bilinear/common_instances.hpp index cb6157b29b..3ae168cb72 100644 --- a/example/66_complex_contraction_bilinear/common_instances.hpp +++ b/example/66_complex_contraction_bilinear/common_instances.hpp @@ -194,3 +194,35 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device:: //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, 0, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>; // clang-format on + +// Macro to instantiate all four layout variants of DeviceOpInstance. +// +// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes) +// SUFFIX: NN for bilinear (DsDataType = Tuple), +// N for scale (DsDataType = Tuple<>) +// +// Requires these names to be defined in the calling TU before invocation: +// NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, +// CShuffleDataType, DsDataType, EDataType, ComputeDataType, +// AElementOp, BElementOp, CDEElementOp +// +// Example: CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); +// expands to DeviceOpInstanceKKNN, DeviceOpInstanceKNNN, +// DeviceOpInstanceMKNN, DeviceOpInstanceMNNN, +// and sets DeviceOpInstance = DeviceOpInstanceKKNN. 
+// clang-format off +#define CK_CONTRACTION_DEVICE_OP_INSTANCES(BASE, SUFFIX) \ + using DeviceOpInstanceKK##SUFFIX = DeviceOpInstanceKK_##BASE; \ + using DeviceOpInstanceKN##SUFFIX = DeviceOpInstanceKN_##BASE; \ + using DeviceOpInstanceMK##SUFFIX = DeviceOpInstanceMK_##BASE; \ + using DeviceOpInstanceMN##SUFFIX = DeviceOpInstanceMN_##BASE; \ + using DeviceOpInstance = DeviceOpInstanceKK##SUFFIX +// clang-format on diff --git a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp index e2cae7a1f8..7533281f1a 100644 --- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp +++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp32.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_Generic; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_Generic; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_Generic; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_Generic; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. 
+CK_CONTRACTION_DEVICE_OP_INSTANCES(Generic, NN); #include "run_complex_contraction_bilinear_example.inc" diff --git a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp index a2021b5eaa..a41e1f1785 100644 --- a/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp +++ b/example/66_complex_contraction_bilinear/complex_contraction_bilinear_xdl_fp64.cpp @@ -23,63 +23,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; -using DeviceOpInstanceKKNN = DeviceOpInstanceKK_FP64; - -using DeviceOpInstanceKNNN = DeviceOpInstanceKN_FP64; - -using DeviceOpInstanceMKNN = DeviceOpInstanceMK_FP64; - -using DeviceOpInstanceMNNN = DeviceOpInstanceMN_FP64; - -using DeviceOpInstance = DeviceOpInstanceKKNN; +// Instantiate DeviceOpInstance for all four layout variants (KK, KN, MK, MN). +// See common_instances.hpp for macro definition and available BASE/SUFFIX options. +CK_CONTRACTION_DEVICE_OP_INSTANCES(FP64, NN); #include "run_complex_contraction_bilinear_example.inc" diff --git a/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt new file mode 100644 index 0000000000..cd0d93c5e9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt @@ -0,0 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# This directory contains only shared header files (contraction_instance_common.hpp). +# There are no source files to compile here — the header is included by the +# contraction_bilinear/ and contraction_scale/ instance directories. 
diff --git a/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp b/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp new file mode 100644 index 0000000000..e9f838107e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp @@ -0,0 +1,77 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +// Macro to generate a contraction device operation instance definition and its +// registration function. Each invocation produces one using-alias and one +// add_device_* function inside ck::tensor_operation::device::instance. +// +// Parameters: +// INST_TPL — instance template (e.g. device_contraction_kk_instance, +// device_contraction_f64_kk_instance) +// OP_NAME — lowercase operation name for identifier construction +// (bilinear or scale) +// CDE_OP — C++ element-wise operation type for template argument +// (Bilinear or Scale) +// NDIM_VAL — number of dimensions (2 or 6) +// NAME_SUFFIX — data-type and layout suffix for the generated names +// (e.g. 
f32_f32_f32_f32_kknn, bf16_bf16_bf16_bf16_compute_f32_knnn) +// ADATA — ADataType +// BDATA — BDataType +// ACC — AccDataType +// CSHUFFLE — CShuffleDataType +// DS_TUPLE — DsDataType (e.g. F32_Tuple, Empty_Tuple) +// EDATA — EDataType +// COMPUTE — ComputeDataType +// +// Example — bilinear, F32, kk layout, 2D: +// +// CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, +// bilinear, Bilinear, 2, f32_f32_f32_f32_kknn, +// F32, F32, F32, F32, F32_Tuple, F32, F32) +// +// Expands to: +// using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = ...; +// void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(...) +// { ... } +// +// clang-format off +#define CK_CONTRACTION_INSTANCE(INST_TPL, OP_NAME, CDE_OP, NDIM_VAL, \ + NAME_SUFFIX, ADATA, BDATA, ACC, CSHUFFLE, DS_TUPLE, EDATA, COMPUTE) \ + \ +namespace ck { \ +namespace tensor_operation { \ +namespace device { \ +namespace instance { \ + \ +using device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance = \ + INST_TPL; \ + \ +void add_device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance( \ + std::vector>>& instances) \ +{ \ + add_device_operation_instances(instances, \ + device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance{}); \ +} \ + \ +} /* namespace instance */ \ +} /* namespace device */ \ +} /* namespace tensor_operation */ \ +} /* namespace ck */ +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index c8f6053c44..1a4ce88a39 100644 --- 
a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction 
device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. +// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index fb1002f1aa..cdfcab69af 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index 5918beb9ad..b1ca1603b4 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index fccd91e5be..bd7f73d2ed 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp index ce57ee2d07..964d2a0690 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp index e1e5dbb434..ac8ac661e3 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp index db98406390..281673f6a8 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp index 5c7032e854..3ac1cef7be 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index 89cb35495b..5b410c24a0 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index c25ebfb598..9982149b2e 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 9815d2f4e3..0b6f0a8589 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index c1735b1fe1..a2092c8c5c 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp index a0c8376980..188a674c3f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp index 0798f7a9b6..e083e27460 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp index 7da8371482..8986de8f82 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp index 49267e0867..7a80a9e6f0 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp index 008d5720af..ddb619c3f8 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp index 9b927385ef..e2abf1c057 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp index a398194f64..bc1965c900 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp index 3726f97709..4390179324 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp index 41fa523b5f..eae059b621 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp index 898c5a79cc..b3a72e5f99 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp index 64db3364a3..627489886d 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp index ad548f38e7..8442ea8fae 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index 3e36bfd30b..9344bb06de 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index b67121316b..72bec728d9 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index 94228aa307..7e4a69f634 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 28184344c3..9516290b23 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp index f2d107c37d..2f7ddf0a38 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_kknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp index dcf8c05eda..074035870f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_knnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp index fe2e1108e9..70e4a0ca80 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp index 420a1f07eb..03d36ce10c 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mnnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index 1c5917cbc6..a3e48e8fe0 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_kknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index 6b87fcf1d8..b6391d36ed 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_knnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index 03469cd96b..3a96d9c8a4 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_mknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index 5171a38dec..fc4f651f75 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_mnnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index 961b78427f..26e9a1801b 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index 5cd869249d..419b1ce339 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index aa8ad904a5..9b6490cfda 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index 80b4de6060..931820ecb8 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp index 77fae91ffe..35b76bb568 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp index 9b8cacc5e1..7a558ca4a8 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp index 50a7645256..020ac2ca39 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp index 78aa99fa6e..c213203927 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index 2342b0db67..0896074b15 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index 130d56c5ca..b9b7e22544 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 90222accc1..86affeec00 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index 9b731a95cf..2315f61168 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp index e738e54f06..dae7e5780a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp @@ -1,60 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance( - std::vector>>& instances) -{ - printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: before add, size=%zu\n", instances.size()); - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{}); - printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: after add, size=%zu\n", instances.size()); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp index 4bc5b1684a..319f5a87de 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp index e320fbe11a..03739391cd 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp index bbb90a6af4..d40fcae6ff 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp index b95aa0d5ba..36e8a19263 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp index e2f62c2342..8b3d2c6420 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp index 80b6b6ecf8..7c6a8b8d83 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp index 181ad86e1b..8b08570f6c 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp index 514da56a0f..881436f505 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp index 61dda90cbc..6b2d7b14c5 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp index 301bde04b8..bb91b6879b 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp index 09dbdff021..d35107af67 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index fe7b520219..f56045888a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index c99a1439e1..5a591fb479 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index 7ae0833b19..42010cb957 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index f0cd251985..ca015c306d 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp index a14b00a7f2..3254d2a5f1 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_kknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp index e719402251..a2831f0760 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_knnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp index d093671e25..cede3aa1a4 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp index 3e0ac565e2..bbee01fa58 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mnnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index c4c8cd13d5..c6fc9eecf3 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_kknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index 7e056c4824..4c0dabed1a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_knnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index dd11af63b4..7154fa8801 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_mknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index 990e862e77..bd24c620e3 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_mnnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp index a3acedbcc4..a0ff8391d2 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index c5c365ec26..bf5a255afd 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index 58ab346942..8c26b797a7 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index 8c9f6fc57b..c93b43da7b 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp index c85f8cc998..9d32d0eb45 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, bf16_bf16_bf16_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp index d4a25d40cb..8474e996c2 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, bf16_bf16_bf16_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp index 7be8a0a694..6c8c7ac837 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, bf16_bf16_bf16_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp index b2a4c020e6..e971273a2f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, bf16_bf16_bf16_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index 9a9d3e16fb..8026a5f3b9 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index d158d5eb99..6974749546 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index a263d0b8ca..fb80ab9df1 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index eb9fa3714e..87f337c67f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp index 52042dd045..e8de33728b 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f16_f16_f16_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp index 2b6aed8ed4..e87816b00f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f16_f16_f16_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp index 07cbbf87c6..2e13b536f2 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f16_f16_f16_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp index 2cc4bfb718..eccce81df9 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f16_f16_f16_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp index 50fe1a696f..6464ffeddc 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp index 6aab79f312..26bf607559 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp index e6f24424ab..e236ad71f4 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp index 60b760bfce..3ccd1820e0 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp index 19992c96fd..f60ef81681 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp index a13e315e38..da0ffaf8f0 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp index 3b4aaa7a5b..a1567d9c82 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp index 48e190574f..098602f203 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 1b8bceb65d..483b4eb869 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f32_f32_f32_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index a09ebae1dd..71b17712b3 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f32_f32_f32_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 4172958f2a..91b6b1d927 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f32_f32_f32_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index c8c9ce4348..cbba0786e2 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f32_f32_f32_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index bb44557ba8..dcd7cf50c4 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_kkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp index 91c96bd679..13ac1b4cbb 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_knn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp index 0fe142fc59..e012e157a7 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_mkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp index 28d337d246..5bda236856 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_mnn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp index 39e29cd3e8..8ab00c937c 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 2, f64_f64_f64_kkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index ef4dd284e5..fb33d7d761 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 2, f64_f64_f64_knn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 78effae8e2..571cea261e 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 2, f64_f64_f64_mkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 465a80b1b0..9847c021d5 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 2, f64_f64_f64_mnn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp index a472f793e4..134fca4936 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index c4bddd6c6e..062f8468f7 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index 3a1c9c3fb9..c6b7784f27 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index d23c005191..30f483036a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp index 9244f6a132..9118dba4f1 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, bf16_bf16_bf16_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp index 99e80e0e28..713eff33cb 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, bf16_bf16_bf16_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp index 77ca8c0d16..1b78e11f70 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, bf16_bf16_bf16_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp index 564fe537bb..2a70c27f20 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, bf16_bf16_bf16_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index 69f074caf0..80bc1cbe72 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index dbad11727c..5564fcb64f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index a53e7801ea..19c73e48b8 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index 977497d387..1acb62c960 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp index dfc187562a..28d2d84510 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f16_f16_f16_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp index 50d951a99c..ba247621ff 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f16_f16_f16_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp index 460c5c4b49..32d601c9b7 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f16_f16_f16_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp index bee17f3386..fb66208b93 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f16_f16_f16_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp index 5f737132af..c78f64bfca 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp index 1dbebe89f7..fde6062baa 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp index 4c609db46a..7d3ae3348e 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp index 9005335eaf..899ba7aac5 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp index 4623b2e5d8..afc0c0a588 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp index 952ad237a8..7d084a8b45 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp index 8273c319b8..821bc2798f 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp index cf22f7a729..3fe62bb117 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index a4659d4d90..a294533556 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f32_f32_f32_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 72adf0f03d..fa38bc2ef8 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f32_f32_f32_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index d70c2bb4c5..5752bc169a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f32_f32_f32_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index 7fa3458ab0..1cae73eb8a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f32_f32_f32_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index 877545e338..1f171a1413 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_kkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp index df51431b23..66a8eae427 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_knn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp index 3bbdf84865..9c5e9fd1bb 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_mkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp index 127c47c5a3..579e955973 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_mnn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp index f05a685d17..c3357a6f91 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 6, f64_f64_f64_kkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 34bc800fcf..447db7fab4 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 6, f64_f64_f64_knn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 180d1b5273..059689ff5e 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 6, f64_f64_f64_mkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index bb6f5c6685..393b7ac6f3 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 +#include "../../contraction/contraction_instance_common.hpp" -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 6, f64_f64_f64_mnn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_base.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_base.cpp index 5e2403f7d1..78dcf1d325 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_base.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_base.cpp @@ -1,44 +1,24 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using Half = ck_tile::half_t; -using PkFP4 = ck_tile::pk_fp4_t; -using ABQuantGrouped = - std::integral_constant; - -// 1d block sizes for AQuant -using GroupSize1D = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantTypes = ::testing::Types< - // PreshuffleQuant = false && TransposeC = false - // RCR layout with RowMajor AQ, ColumnMajor BQ - std::tuple ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantTypes = ::testing::Types< + // PreshuffleQuant = false && TransposeC = false + // RCR layout with RowMajor AQ, ColumnMajor BQ + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_padding.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_padding.cpp index 1e496d5b64..0c39d9ed2a 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_padding.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_padding.cpp @@ -1,65 +1,45 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using Half = ck_tile::half_t; -using PkFP4 = ck_tile::pk_fp4_t; -using ABQuantGrouped = - std::integral_constant; - -// 1d block sizes for AQuant -using GroupSize1D = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantTypes = ::testing::Types< - // PreshuffleQuant = false && TransposeC = false - // RCR layout with RowMajor AQ, ColumnMajor BQ - std::tuple ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); - -// AQuant tests - -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadK) 
-{ - this->run_test_with_validation(1024, 1024, 832); -} - -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadN) -{ - this->run_test_with_validation(1024, 832, 1024); -} - -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadM) -{ - this->run_test_with_validation(832, 1024, 1024); -} - -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadMNK) -{ - this->run_test_with_validation(832, 832, 832); -} - -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadNK) -{ - this->run_test_with_validation(1024, 832, 832); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantTypes = ::testing::Types< + // PreshuffleQuant = false && TransposeC = false + // RCR layout with RowMajor AQ, ColumnMajor BQ + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); + +// AQuant tests + +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadK) +{ + this->run_test_with_validation(1024, 1024, 832); +} + +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadN) +{ + this->run_test_with_validation(1024, 832, 1024); +} + +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadM) +{ + this->run_test_with_validation(832, 1024, 1024); +} + +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadMNK) +{ + this->run_test_with_validation(832, 832, 832); +} + +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest_PadNK) +{ + this->run_test_with_validation(1024, 832, 832); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_preshuffle.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_preshuffle.cpp index 43051c8d08..3df77fc4fb 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_preshuffle.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_a4w4_preshuffle.cpp @@ -1,44 
+1,24 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using Half = ck_tile::half_t; -using PkFP4 = ck_tile::pk_fp4_t; -using ABQuantGrouped = - std::integral_constant; - -// 1d block sizes for AQuant -using GroupSize1D = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantTypes = ::testing::Types< - // RCR layout with RowMajor AQ, ColumnMajor BQ - // PreshuffleB = true && TransposeC = false - std::tuple ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantTypes = ::testing::Types< + // RCR layout with RowMajor AQ, ColumnMajor BQ + // PreshuffleB = true && TransposeC = false + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_base.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_base.cpp index 2524f7887f..e97459b892 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_base.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_base.cpp @@ -1,56 +1,38 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using ABQuantGrouped = - std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D128N = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantTypes = ::testing::Types< - // 1D BScales; PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - - // 2D B-scales; PreshuffleQuant = false && TransposeC = true (RCR layout with RowMajor AQ) - std::tuple, - std::tuple, - 
std::tuple, - std::tuple, - std::tuple, - std::tuple ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantTypes = ::testing::Types< + // 1D BScales; PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + + // 2D B-scales; PreshuffleQuant = false && TransposeC = true (RCR layout with RowMajor AQ) + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves.cpp index baeb93ac0a..746570f30d 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves.cpp @@ -1,45 +1,27 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using ABQuantGrouped = - std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D128N = ck_tile::QuantGroupShape>; -#ifdef CK_GFX950_SUPPORT -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantEightWavesTypes = ::testing::Types< - // PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) - std::tuple, - std::tuple ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantEightWavesTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} -#endif +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; +#ifdef CK_GFX950_SUPPORT +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantEightWavesTypes = ::testing::Types< + // PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) + std::tuple, + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantEightWavesTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} +#endif diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_padding.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_padding.cpp index 5247a4405d..fe4ec0a428 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_padding.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_padding.cpp @@ -1,39 +1,22 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using ABQuantGrouped = - std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant padding padding tests -// Tuple format: -// clang-format off -using ABQuantPaddingTypes = ::testing::Types< - std::tuple ->; -// clang-format on - -// Test suite for ABQuant Padding -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPaddingTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 832, 832); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +// Type combinations for ABQuant padding tests +// Tuple format: +// clang-format off +using ABQuantPaddingTypes = ::testing::Types< + std::tuple +>; +// clang-format on + +// Test suite for ABQuant Padding +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPaddingTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 832, 832); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffleQuant.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffleQuant.cpp index 1b554cc12a..f949fd4e47 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffleQuant.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffleQuant.cpp @@ -1,43 +1,25 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using ABQuantGrouped = - std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D128N = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantPreshuffleQuantTypes = ::testing::Types< - std::tuple, - std::tuple ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantPreshuffleQuantTypes = ::testing::Types< + std::tuple, + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_2d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_2d.cpp index 7d8b62616e..a940c2fd02 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_2d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_2d.cpp @@ -1,47 +1,29 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using ABQuantGrouped = - std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D128N = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantPreshuffleBTypes = ::testing::Types< - // 1D B-scales; PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) - std::tuple, - /// 2D B-scales; PreshuffleQuant = false && TransposeC = true (RCR layout with RowMajor AQ) - std::tuple, - std::tuple, - std::tuple ->; -// 
clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleBTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantPreshuffleBTypes = ::testing::Types< + // 1D B-scales; PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) + std::tuple, + /// 2D B-scales; PreshuffleQuant = false && TransposeC = true (RCR layout with RowMajor AQ) + std::tuple, + std::tuple, + std::tuple +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleBTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp index 0b845ac16d..51e555479d 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_preshuffle_preshuffleQuant.cpp @@ -1,43 +1,25 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using ABQuantGrouped = - std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; - -// 2d block sizes for BQuant -using GroupSize2D128N = ck_tile::QuantGroupShape>; - -// Type combinations for ABQuant tests -// Tuple format: -// clang-format off -using ABQuantPreshuffleQuantTypes = ::testing::Types< - std::tuple, GroupSize, GroupSize, ColumnMajor>, - std::tuple, GroupSize, GroupSize2D128N, ColumnMajor> ->; -// clang-format on - -// Test suite for ABQuant -TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; + +// Type combinations for ABQuant tests +// Tuple format: +// clang-format off +using ABQuantPreshuffleQuantTypes = ::testing::Types< + std::tuple, GroupSize1D_128, GroupSize1D_128, ColumnMajor>, + std::tuple, GroupSize1D_128, GroupSize2D128N, ColumnMajor> +>; +// clang-format on + +// Test suite for ABQuant +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantPreshuffleQuantTypes); + +// AQuant tests +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedTest) +{ + this->run_test_with_validation(1024, 1024, 1024); +} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_decode.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_decode.cpp index 7732779d7a..7f8fb70f99 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_decode.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_decode.cpp @@ -1,22 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using ABQuantGrouped = - std::integral_constant; using GroupSize1x1x128 = ck_tile::QuantGroupShape>; using GroupSize1x128x128 = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_prefill.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_prefill.cpp index f746983d06..8f58ef7c7f 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_prefill.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_splitk_prefill.cpp @@ -1,22 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using ABQuantGrouped = - std::integral_constant; using GroupSize1x1x128 = ck_tile::QuantGroupShape>; using GroupSize1x128x128 = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_ccr.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_ccr.cpp index 0e04f9fc9e..e66cf10ca8 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_ccr.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_ccr.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - CCR layout // Tuple format: >; // clang-format off using AQuantBaseCCRTypes = ::testing::Types< // CCR layout (ColumnMajor A, ColumnMajor B, RowMajor C with ColumnMajor AQ) - NEW layout support - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rcr.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rcr.cpp index da32c06304..671c878957 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rcr.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rcr.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - RCR layout base configuration // Tuple format: >; // clang-format off using AQuantBaseRCRTypes = ::testing::Types< // PreshuffleQuant = false && TransposeC = false (RCR layout with RowMajor AQ) - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rrr_crr.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rrr_crr.cpp index 6e90c44764..e3b3c0953a 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rrr_crr.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_base_rrr_crr.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - RRR and CRR layouts // Tuple format: >; // clang-format off using AQuantBaseRRRCRRTypes = ::testing::Types< // RRR layout (RowMajor A, RowMajor B, RowMajor C with RowMajor AQ) - std::tuple, - std::tuple, - std::tuple, - std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, // CRR layout (ColumnMajor A, RowMajor B, RowMajor C with RowMajor AQ) - std::tuple, - std::tuple + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp index a7ab4120a1..1ef57716c9 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp @@ -1,33 +1,19 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - Mem Decode Interwave Configuration // Tuple format: // clang-format off using AQuantMemDecodeInterwaveTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_intrawave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_intrawave.cpp index 483138d711..0c908a9d21 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_intrawave.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_intrawave.cpp @@ -1,33 +1,19 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - Mem Decode Intrawave Configuration // Tuple format: // clang-format off using AQuantMemDecodeIntrawaveTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp index 7e851d9bd3..fde3ec977b 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp @@ -1,33 +1,19 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - Mem Prefill Interwave Configuration // Tuple format: // clang-format off using AQuantMemPrefillInterwaveTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_prefill.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_prefill.cpp index 911af678df..50e882a1d1 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_prefill.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_prefill.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - Prefill Configuration // Tuple format: >; // clang-format off using AQuantPrefillTypes = ::testing::Types< // RCR layout - with the Prefill BlockTile Config. - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_preshuffle.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_preshuffle.cpp index 35d15f9354..2a0876ea82 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_preshuffle.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_preshuffle.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - PreshuffleQuant Configurations // Tuple format: >; // clang-format off using AQuantPreshuffleTypes = ::testing::Types< // PreshuffleQuant = true && TransposeC = false (with RowMajor AQ - PreshuffleQuant only supports RowMajor) - std::tuple, - std::tuple, - std::tuple, - std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, // PreshuffleQuant = true && TransposeC = true (with RowMajor AQ - PreshuffleQuant only supports RowMajor) - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_transpose_c.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_transpose_c.cpp index a2a4c2c38b..5481419a44 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_transpose_c.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_transpose_c.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; using AQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; // Type combinations for AQuant tests - TransposeC Configuration // Tuple format: >; // clang-format off using AQuantTransposeCTypes = ::testing::Types< // PreshuffleQuant = false && TransposeC = true (with RowMajor AQ) - std::tuple, - std::tuple + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_128.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_128.cpp index 0e6e40b788..aa4006ec23 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_128.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_128.cpp @@ -1,23 +1,7 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for BQuant tests - 1D GroupSize 128 // Tuple format: >; // clang-format off using BQuant1D128Types = ::testing::Types< // 1d cases with grouping only on k axis - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_64.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_64.cpp index 1019caf1bc..9f266b37be 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_64.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_1d_64.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize64 = ck_tile::QuantGroupShape>; +using GroupSize64 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 64 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; using GroupSize2D128N = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 2D Large N (128N) diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_medium_n.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_medium_n.cpp index 67d52ef874..409e044d41 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_medium_n.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_medium_n.cpp @@ -1,24 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; - -// 2d block sizes for BQuant using GroupSize2D32N = ck_tile::QuantGroupShape>; using GroupSize2D64N = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_small_n.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_small_n.cpp index 865713992d..024c185012 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_small_n.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_2d_small_n.cpp @@ -1,24 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; - -// 2d block sizes for BQuant using GroupSize2D8N = ck_tile::QuantGroupShape>; using GroupSize2D16N = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_microscale_ccr_1d_128.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_microscale_ccr_1d_128.cpp index 94572a80dc..819eb0dafd 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_microscale_ccr_1d_128.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_microscale_ccr_1d_128.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using BF8 = ck_tile::bf8_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize128 = ck_tile::QuantGroupShape>; +using GroupSize128 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 128 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using FP16 = ck_tile::fp16_t; -using BF16 = ck_tile::bf16_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize64 = ck_tile::QuantGroupShape>; +using GroupSize64 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 64 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using BF8 = ck_tile::bf8_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize128 = ck_tile::QuantGroupShape>; +using GroupSize128 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 128 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type 
aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using BF8 = ck_tile::bf8_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize64 = ck_tile::QuantGroupShape>; +using GroupSize64 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 64 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using FP16 = ck_tile::fp16_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize128 = ck_tile::QuantGroupShape>; +using GroupSize128 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 128 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using FP16 = ck_tile::fp16_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize64 = ck_tile::QuantGroupShape>; +using GroupSize64 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 64 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using BF8 = ck_tile::bf8_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = 
ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize128 = ck_tile::QuantGroupShape>; +using GroupSize128 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 128 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using BF8 = ck_tile::bf8_t; -using BF16 = ck_tile::bf16_t; -using PkFP4 = ck_tile::pk_fp4_t; -using E8M0 = ck_tile::e8m0_t; -using BQuantGrouped = std::integral_constant; -using GroupSize64 = ck_tile::QuantGroupShape>; +using GroupSize64 = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - 1D GroupSize 64 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for BQuant Preshuffle tests - Decode Config 1D // Tuple format: // clang-format off using BPreshuffleDecode1DTypes = ::testing::Types< - std::tuple, - std::tuple + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_decode_2d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_decode_2d.cpp index fb4020bcd7..54f71f7c49 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_decode_2d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_decode_2d.cpp @@ -1,24 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; - -// 2d block sizes for BQuant using GroupSize2D8N = ck_tile::QuantGroupShape>; using GroupSize2D16N = ck_tile::QuantGroupShape>; using GroupSize2D32N = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_1d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_1d.cpp index 0d4e4d5f03..a65c3ab1f0 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_1d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_1d.cpp @@ -1,33 +1,17 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for BQuant Preshuffle tests - Prefill Config 1D // Tuple format: // clang-format off using BPreshufflePrefill1DTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_2d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_2d.cpp index edc7bcaa09..93da8003ee 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_2d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffleQuant_prefill_2d.cpp @@ -1,24 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; - -// 2d block sizes for BQuant using GroupSize2D8N = ck_tile::QuantGroupShape>; using GroupSize2D16N = ck_tile::QuantGroupShape>; using GroupSize2D32N = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_1d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_1d.cpp index cf599ebbfd..f23c2f8c41 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_1d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_1d.cpp @@ -1,31 +1,15 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for BQuant Preshuffle tests - Decode Config 1D // Tuple format: // clang-format off using BPreshuffleDecode1DTypes = ::testing::Types< - std::tuple, - std::tuple + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_2d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_2d.cpp index 66fb62e67e..cce9833480 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_2d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_decode_2d.cpp @@ -1,24 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; - -// 2d block sizes for BQuant using GroupSize2D8N = ck_tile::QuantGroupShape>; using GroupSize2D16N = ck_tile::QuantGroupShape>; using GroupSize2D32N = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_1d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_1d.cpp index 3f6dd225d7..1b3025df07 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_1d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_1d.cpp @@ -1,33 +1,17 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for BQuant Preshuffle tests - Prefill Config 1D // Tuple format: // clang-format off using BPreshufflePrefill1DTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_2d.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_2d.cpp index ace07a37ae..e4f11e587b 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_2d.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_prefill_2d.cpp @@ -1,24 +1,8 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; - -// 2d block sizes for BQuant using GroupSize2D8N = ck_tile::QuantGroupShape>; using GroupSize2D16N = ck_tile::QuantGroupShape>; using GroupSize2D32N = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_tiled_permute.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_tiled_permute.cpp index 8a05f5812a..8a54bf05f6 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_tiled_permute.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_preshuffle_tiled_permute.cpp @@ -1,32 +1,16 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for BQuant Preshuffle tests - TiledPermuteN Config // Tuple format: // clang-format off using BPreshuffleTiledPermuteTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple + std::tuple, + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp index ea1a8a1fbb..7ab7d22dc7 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp @@ -1,23 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" +#include "test_gemm_quant_common.hpp" -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize128 = ck_tile::QuantGroupShape>; +using GroupSize128 = ck_tile::QuantGroupShape>; // Type combinations for BQuant split-K tests - Decode shape, GroupSize 128 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize128 = ck_tile::QuantGroupShape>; +using GroupSize128 = ck_tile::QuantGroupShape>; // Type combinations for BQuant split-K tests - Prefill shape, GroupSize 128 // Tuple format: -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using PkInt4 = ck_tile::pk_int4_t; -using BQuantGrouped = std::integral_constant; -using GroupSize64 = ck_tile::QuantGroupShape>; +using GroupSize64 = ck_tile::QuantGroupShape>; using GroupSize2D64N = ck_tile::QuantGroupShape>; // Type combinations for BQuant tests - Transpose Layouts diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_common.hpp 
b/test/ck_tile/gemm_block_scale/test_gemm_quant_common.hpp new file mode 100644 index 0000000000..167e4afc8c --- /dev/null +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_common.hpp @@ -0,0 +1,40 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +// Common includes for all gemm quant tests +#include "ck_tile/host.hpp" +#include "ck_tile/ops/gemm.hpp" + +#include +#include + +#include "test_gemm_quant_fixtures.hpp" + +// Common layout aliases +using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; +using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; + +// Common data type aliases +using Half = ck_tile::half_t; +using FP16 = ck_tile::fp16_t; +using BF16 = ck_tile::bf16_t; +using FP8 = ck_tile::fp8_t; +using BF8 = ck_tile::bf8_t; +using E8M0 = ck_tile::e8m0_t; +using PkInt4 = ck_tile::pk_int4_t; +using PkFP4 = ck_tile::pk_fp4_t; + +// Common quant type aliases +using AQuantGrouped = std::integral_constant; +using BQuantGrouped = std::integral_constant; +using ABQuantGrouped = + std::integral_constant; +using RowColQuant = std::integral_constant; +using TensorQuant = std::integral_constant; + +// Common group size aliases +using GroupSize1D_128 = ck_tile::QuantGroupShape>; +using GroupSize1D_64 = ck_tile::QuantGroupShape>; +using GroupSize2D = ck_tile::QuantGroupShape>; diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_rowcol.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_rowcol.cpp index bb0fa21899..4e93bdf692 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_rowcol.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_rowcol.cpp @@ -1,30 +1,15 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
// SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using RowColQuant = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for RowColQuant tests // Tuple format: // clang-format off using RowColQuantTypes = ::testing::Types< - std::tuple, - std::tuple + std::tuple, + std::tuple >; // clang-format on diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_tensor.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_tensor.cpp index 8b4c90f8b9..ce7a2552d2 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_tensor.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_tensor.cpp @@ -1,30 +1,15 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include "ck_tile/host.hpp" -#include "ck_tile/ops/gemm.hpp" - -#include -#include - -#include "test_gemm_quant_fixtures.hpp" - -// Type aliases for readability -using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; -using ColumnMajor = ck_tile::tensor_layout::gemm::ColumnMajor; -using FP8 = ck_tile::fp8_t; -using BF8 = ck_tile::bf8_t; -using Half = ck_tile::half_t; -using TensorQuant = std::integral_constant; -using GroupSize = ck_tile::QuantGroupShape>; +#include "test_gemm_quant_common.hpp" // Type combinations for TensorQuant tests // Tuple format: // clang-format off using TensorQuantTypes = ::testing::Types< - std::tuple, - std::tuple + std::tuple, + std::tuple >; // clang-format on