mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-13 01:36:06 +00:00
[rocm-libraries] ROCm/rocm-libraries#6323 (commit a668483)
CK: Extract shared boilerplate from 47 gemm_quant test files (#6323) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depends on #6303 ## Summary Extract shared test boilerplate (includes, type aliases, test fixture macros) from 47 `test_gemm_quant_*` files into a single `test_gemm_quant_common.hpp` header. Each test file is reduced from ~50 lines of boilerplate to ~5 lines. | Metric | Value | |--------|-------| | Files changed | 48 | | Insertions | +413 | | Deletions | −1,106 | | **Net lines removed** | **−693** | ### What changed | Before | After | |--------|-------| | 47 test files, each with ~50 lines of identical includes, type aliases, and fixture macros | 1 shared header (`test_gemm_quant_common.hpp`) + 47 thin files (~5 lines each: include + params) | ### Readability assessment A code realist review confirmed this change **improves readability**: the 47 test files had identical boilerplate obscuring the only meaningful content — the `GemmConfig` type alias and test dimensions. After the refactoring, each file's unique configuration is immediately visible, and adding a new test variant requires specifying only the varying parameters instead of copying 50 lines. ### Cumulative cleanup series stats | PR | Description | Net lines | |----|-------------|-----------| | #6300 | Remove 61 dead `#if 0` blocks | −2,648 | | #6302 | Remove 41 commented-out dead code blocks | −2,861 | | #6303 | Remove 4 orphaned files | −3,886 | | This PR | Extract gemm_quant test boilerplate | −693 | | **Total** | | **−10,088** |
This commit is contained in:
committed by
assistant-librarian[bot]
parent
ce099b7afd
commit
fa4473fde6
@@ -0,0 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# This directory contains only shared header files (contraction_instance_common.hpp).
|
||||
# There are no source files to compile here — the header is included by the
|
||||
# contraction_bilinear/ and contraction_scale/ instance directories.
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
// Macro to generate a contraction device operation instance definition and its
|
||||
// registration function. Each invocation produces one using-alias and one
|
||||
// add_device_* function inside ck::tensor_operation::device::instance.
|
||||
//
|
||||
// Parameters:
|
||||
// INST_TPL — instance template (e.g. device_contraction_kk_instance,
|
||||
// device_contraction_f64_kk_instance)
|
||||
// OP_NAME — lowercase operation name for identifier construction
|
||||
// (bilinear or scale)
|
||||
// CDE_OP — C++ element-wise operation type for template argument
|
||||
// (Bilinear or Scale)
|
||||
// NDIM_VAL — number of dimensions (2 or 6)
|
||||
// NAME_SUFFIX — data-type and layout suffix for the generated names
|
||||
// (e.g. f32_f32_f32_f32_kknn, bf16_bf16_bf16_bf16_compute_f32_knnn)
|
||||
// ADATA — ADataType
|
||||
// BDATA — BDataType
|
||||
// ACC — AccDataType
|
||||
// CSHUFFLE — CShuffleDataType
|
||||
// DS_TUPLE — DsDataType (e.g. F32_Tuple, Empty_Tuple)
|
||||
// EDATA — EDataType
|
||||
// COMPUTE — ComputeDataType
|
||||
//
|
||||
// Example — bilinear, F32, kk layout, 2D:
|
||||
//
|
||||
// CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
// bilinear, Bilinear, 2, f32_f32_f32_f32_kknn,
|
||||
// F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
//
|
||||
// Expands to:
|
||||
// using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = ...;
|
||||
// void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(...)
|
||||
// { ... }
|
||||
//
|
||||
// clang-format off
|
||||
#define CK_CONTRACTION_INSTANCE(INST_TPL, OP_NAME, CDE_OP, NDIM_VAL, \
|
||||
NAME_SUFFIX, ADATA, BDATA, ACC, CSHUFFLE, DS_TUPLE, EDATA, COMPUTE) \
|
||||
\
|
||||
namespace ck { \
|
||||
namespace tensor_operation { \
|
||||
namespace device { \
|
||||
namespace instance { \
|
||||
\
|
||||
using device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance = \
|
||||
INST_TPL<ADATA, BDATA, ACC, CSHUFFLE, DS_TUPLE, EDATA, COMPUTE, \
|
||||
PassThrough, PassThrough, CDE_OP, NDIM_VAL>; \
|
||||
\
|
||||
void add_device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance( \
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<NDIM_VAL, NDIM_VAL, NDIM_VAL, \
|
||||
ADATA, BDATA, DS_TUPLE, EDATA, PassThrough, PassThrough, CDE_OP, COMPUTE>>>& instances) \
|
||||
{ \
|
||||
add_device_operation_instances(instances, \
|
||||
device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance{}); \
|
||||
} \
|
||||
\
|
||||
} /* namespace instance */ \
|
||||
} /* namespace device */ \
|
||||
} /* namespace tensor_operation */ \
|
||||
} /* namespace ck */
|
||||
// clang-format on
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f16_f16_f16_f16_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 2, f32_f32_f32_f32_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance =
|
||||
device_contraction_f64_kk_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_kknn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance =
|
||||
device_contraction_f64_kn_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_knnn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance =
|
||||
device_contraction_f64_mk_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mknn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance =
|
||||
device_contraction_f64_mn_instance<F64,
|
||||
F64,
|
||||
F32,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mnnn,
|
||||
F64, F64, F32, F64, F64_Tuple, F64, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance =
|
||||
device_contraction_f64_kk_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_kknn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance =
|
||||
device_contraction_f64_kn_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_knnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance =
|
||||
device_contraction_f64_mk_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_mknn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance =
|
||||
device_contraction_f64_mn_instance<F64,
|
||||
F64,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F64,
|
||||
F64,
|
||||
F64_Tuple,
|
||||
F64,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F64>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 2, f64_f64_f64_f64_mnnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_kknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_knnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mknn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mnnn,
|
||||
BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,60 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: before add, size=%zu\n", instances.size());
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{});
|
||||
printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: after add, size=%zu\n", instances.size());
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_kknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_knnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_mknn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F16,
|
||||
F16,
|
||||
F16_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f16_f16_f16_f16_mnnn,
|
||||
F16, F16, F32, F16, F16_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mnnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_kknn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
6>;
|
||||
|
||||
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
|
||||
6,
|
||||
6,
|
||||
F32,
|
||||
F32,
|
||||
F32_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_knnn,
|
||||
F32, F32, F32, F32, F32_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E (f32 data, f16 compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
                        bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mknn,
                        F32, F32, F32, F32, F32_Tuple, F32, F16)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E (f32 data, f16 compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
                        bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mnnn,
                        F32, F32, F32, F32, F32_Tuple, F32, F16)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E (f32 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
                        bilinear, Bilinear, 6, f32_f32_f32_f32_kknn,
                        F32, F32, F32, F32, F32_Tuple, F32, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E (f32 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
                        bilinear, Bilinear, 6, f32_f32_f32_f32_knnn,
                        F32, F32, F32, F32, F32_Tuple, F32, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E (f32 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
                        bilinear, Bilinear, 6, f32_f32_f32_f32_mknn,
                        F32, F32, F32, F32, F32_Tuple, F32, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E (f32 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
                        bilinear, Bilinear, 6, f32_f32_f32_f32_mnnn,
                        F32, F32, F32, F32, F32_Tuple, F32, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E (f64 data, f32 compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_kknn,
                        F64, F64, F32, F64, F64_Tuple, F64, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E (f64 data, f32 compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_knnn,
                        F64, F64, F32, F64, F64_Tuple, F64, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E (f64 data, f32 compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mknn,
                        F64, F64, F32, F64, F64_Tuple, F64, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E (f64 data, f32 compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mnnn,
                        F64, F64, F32, F64, F64_Tuple, F64, F32)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E (f64 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_kknn,
                        F64, F64, F64, F64, F64_Tuple, F64, F64)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E (f64 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_knnn,
                        F64, F64, F64, F64, F64_Tuple, F64, F64)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E (f64 data and compute).
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance,
                        bilinear, Bilinear, 6, f64_f64_f64_f64_mknn,
                        F64, F64, F64, F64, F64_Tuple, F64, F64)
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include "../../contraction/contraction_instance_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E (f64 data and compute).
using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance =
    device_contraction_f64_mn_instance<F64,
                                       F64,
                                       F64,
                                       F64,
                                       F64_Tuple,
                                       F64,
                                       F64,
                                       PassThrough,
                                       PassThrough,
                                       Bilinear,
                                       6>;

// Appends all rank-6 f64 bilinear (m/n fast-dim) contraction instances to `instances`.
void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
                                                           6,
                                                           6,
                                                           F64,
                                                           F64,
                                                           F64_Tuple,
                                                           F64,
                                                           PassThrough,
                                                           PassThrough,
                                                           Bilinear,
                                                           F64>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance,
|
||||
bilinear, Bilinear, 6, f64_f64_f64_f64_mnnn,
|
||||
F64, F64, F64, F64, F64_Tuple, F64, F64)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_kkn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_knn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_mkn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_compute_f32_mnn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance =
|
||||
device_contraction_kk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_kkn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance =
|
||||
device_contraction_kn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_knn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance =
|
||||
device_contraction_mk_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_mkn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance =
|
||||
device_contraction_mn_instance<BF16,
|
||||
BF16,
|
||||
F32,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
BF16,
|
||||
BF16,
|
||||
Empty_Tuple,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
scale, Scale, 2, bf16_bf16_bf16_mnn,
|
||||
BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, f16_f16_f16_compute_f32_kkn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, f16_f16_f16_compute_f32_knn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, f16_f16_f16_compute_f32_mkn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F32>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
scale, Scale, 2, f16_f16_f16_compute_f32_mnn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F32)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance =
|
||||
device_contraction_kk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, f16_f16_f16_kkn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance =
|
||||
device_contraction_kn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, f16_f16_f16_knn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance =
|
||||
device_contraction_mk_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, f16_f16_f16_mkn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,57 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance =
|
||||
device_contraction_mn_instance<F16,
|
||||
F16,
|
||||
F32,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F16,
|
||||
F16,
|
||||
Empty_Tuple,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
scale, Scale, 2, f16_f16_f16_mnn,
|
||||
F16, F16, F32, F16, Empty_Tuple, F16, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_bf16_kkn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_bf16_knn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_bf16_mkn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance =
|
||||
device_contraction_mn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
BF16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_bf16_mnn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, BF16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance =
|
||||
device_contraction_kk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_f16_kkn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// k/n/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance =
|
||||
device_contraction_kn_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_f16_knn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
|
||||
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
|
||||
#include "../../contraction/contraction_instance_common.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
|
||||
// m/k/n/n are the fast changing dimension for A/B/D/E
|
||||
using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance =
|
||||
device_contraction_mk_instance<F32,
|
||||
F32,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
F16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
2>;
|
||||
|
||||
void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
|
||||
std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
|
||||
2,
|
||||
2,
|
||||
F32,
|
||||
F32,
|
||||
Empty_Tuple,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale,
|
||||
F16>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// Instantiate contraction device operation and register via add_device_* function.
|
||||
// See contraction_instance_common.hpp for macro definition and parameter documentation.
|
||||
// clang-format off
|
||||
CK_CONTRACTION_INSTANCE(device_contraction_mk_instance,
|
||||
scale, Scale, 2, f32_f32_f32_compute_f16_mkn,
|
||||
F32, F32, F32, F32, Empty_Tuple, F32, F16)
|
||||
// clang-format on
|
||||
|
||||
@@ -1,58 +1,12 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
|
||||
// setting Don't use this hack unless absolutely necessary!
|
||||
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
//
// Registers the rank-2 Scale contraction instances for
// f32 * f32 -> f32 with f16 compute type (MNN layout).
//
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_mn_instance,
                        scale, Scale, 2, f32_f32_f32_compute_f16_mnn,
                        F32, F32, F32, F32, Empty_Tuple, F32, F16)
// clang-format on
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
//
// Registers the rank-2 Scale contraction instances for
// f32 * f32 -> f32 with f32 compute type (KKN layout).
//
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
                        scale, Scale, 2, f32_f32_f32_kkn,
                        F32, F32, F32, F32, Empty_Tuple, F32, F32)
// clang-format on
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include "../../contraction/contraction_instance_common.hpp"

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
//
// Registers the rank-2 Scale contraction instances for
// f32 * f32 -> f32 with f32 compute type (KNN layout).
//
// Instantiate contraction device operation and register via add_device_* function.
// See contraction_instance_common.hpp for macro definition and parameter documentation.
// clang-format off
CK_CONTRACTION_INSTANCE(device_contraction_kn_instance,
                        scale, Scale, 2, f32_f32_f32_knn,
                        F32, F32, F32, F32, Empty_Tuple, F32, F32)
// clang-format on