mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 04:49:54 +00:00
Absolute include path (#281)
* add gelu and fast_gelu
* added GeLU and fast GeLU
* clean up
* add gemm+fastgelu example
* add gemm+gelu instances
* update profiler
* clean up
* clean up
* adding gemm+bias+activation
* clean
* adding bias
* clean
* adding gemm multiple d
* debugging
* add gemm bias add fastgelu
* rename, clean
* refactoring; add readme
* refactor
* refactor
* refactor
* refactor
* refactor
* refactor
* fix
* fix
* update example
* update example
* rename
* update example
* add ckProfiler
* clean
* clean
* clean
* clean
* add client app example
* update readme
* delete obsolete files
* remove old client app
* delete old file
* cleaning
* clean
* remove half
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path for all examples
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path
* fix header path
* revert client app example
* clean build
* fix build
* temporarily disable client test on Jenkins
* clean
* clean
* clean
[ROCm/composable_kernel commit: d1db6a0c3e]
This commit is contained in:
@@ -1,26 +1,5 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/
|
||||
${PROJECT_SOURCE_DIR}/include/ck
|
||||
${PROJECT_SOURCE_DIR}/include/ck/utility
|
||||
${PROJECT_SOURCE_DIR}/include/ck/host_utility
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor
|
||||
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/grid
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/block
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/warp
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/thread
|
||||
${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
|
||||
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
|
||||
${PROJECT_SOURCE_DIR}/test/include
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
include(googletest)
|
||||
@@ -66,4 +45,3 @@ add_subdirectory(conv2d_bwd_weight)
|
||||
add_subdirectory(convnd_bwd_data)
|
||||
add_subdirectory(block_to_ctile_map)
|
||||
add_subdirectory(softmax)
|
||||
# Do NOT add client_app here; it is tested independently via CI
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "profile_batched_gemm_impl.hpp"
|
||||
#include "profiler/include/profile_batched_gemm_impl.hpp"
|
||||
|
||||
namespace {
|
||||
using ADataType = ck::half_t;
|
||||
|
||||
@@ -1,9 +1,3 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
${PROJECT_SOURCE_DIR}/test/include
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
|
||||
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE host_tensor)
|
||||
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "profile_batched_gemm_reduce_impl.hpp"
|
||||
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#include <ck/config.hpp>
|
||||
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
# Stand-alone client application that consumes an installed composable_kernel
# package (it is not built as part of the main project tree).
cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++14)

# REQUIRED makes configuration stop here with a clear message when the package
# (or one of the requested components) is missing, instead of failing later on
# the unresolved composable_kernel:: imported targets below.
find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS device_operations host_tensor)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")

add_executable(test_client_app client_app.cpp)

target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
|
||||
@@ -1,77 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "client_app_impl.hpp"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 25)
|
||||
{
|
||||
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
|
||||
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
|
||||
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
|
||||
printf("arg6: verification (0: no; 1: yes)\n");
|
||||
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
|
||||
printf("arg8: print tensor value (0: no; 1: yes)\n");
|
||||
printf("arg9: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
|
||||
"RightPx\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const ConvDataType data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
|
||||
const int in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
|
||||
const int wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
|
||||
const int out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
|
||||
const bool do_verification = std::stoi(argv[6]);
|
||||
const int init_method = std::stoi(argv[7]);
|
||||
const bool do_log = std::stoi(argv[8]);
|
||||
const bool time_kernel = std::stoi(argv[9]);
|
||||
|
||||
const ck::index_t N = std::stoi(argv[10]);
|
||||
const ck::index_t K = std::stoi(argv[11]);
|
||||
const ck::index_t C = std::stoi(argv[12]);
|
||||
const ck::index_t Y = std::stoi(argv[13]);
|
||||
const ck::index_t X = std::stoi(argv[14]);
|
||||
const ck::index_t Hi = std::stoi(argv[15]);
|
||||
const ck::index_t Wi = std::stoi(argv[16]);
|
||||
|
||||
const ck::index_t conv_stride_h = std::stoi(argv[17]);
|
||||
const ck::index_t conv_stride_w = std::stoi(argv[18]);
|
||||
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
|
||||
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
|
||||
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
|
||||
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
|
||||
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
|
||||
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
|
||||
|
||||
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
|
||||
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
|
||||
|
||||
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
|
||||
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
|
||||
ck::app::profile_conv_fwd_impl(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
data_type,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
std::vector<ck::index_t>{Hi, Wi},
|
||||
std::vector<ck::index_t>{Y, X},
|
||||
std::vector<ck::index_t>{Ho, Wo},
|
||||
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
|
||||
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
|
||||
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
|
||||
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
|
||||
return 1;
|
||||
}
|
||||
@@ -1,214 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "host_interface.hpp"
|
||||
|
||||
/// Element type shared by the input, weight and output tensors.
///
/// The enumerator values are the integers accepted on the command line. The
/// underlying type is fixed to int so that converting an arbitrary parsed
/// integer with static_cast is well-defined and can be rejected afterwards.
enum ConvDataType : int
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
};
|
||||
|
||||
/// Memory layout of the input tensor; values match the command-line codes.
/// The underlying type is fixed to int so that static_cast from any parsed
/// integer is well-defined.
enum ConvInputLayout : int
{
    NCHW, // 0
    NHWC, // 1
};
|
||||
|
||||
/// Memory layout of the weight tensor; values match the command-line codes.
/// The underlying type is fixed to int so that static_cast from any parsed
/// integer is well-defined.
enum ConvWeightLayout : int
{
    KCYX, // 0
    KYXC, // 1
};
|
||||
|
||||
/// Memory layout of the output tensor; values match the command-line codes.
/// The underlying type is fixed to int so that static_cast from any parsed
/// integer is well-defined.
enum ConvOutputLayout : int
{
    NKHW, // 0
    NHWK, // 1
};
|
||||
|
||||
void check_hip_error(void)
|
||||
{
|
||||
hipError_t err = hipGetLastError();
|
||||
if(err != hipSuccess)
|
||||
{
|
||||
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
|
||||
exit(err);
|
||||
}
|
||||
}
|
||||
std::string getDeviceName(int device)
|
||||
{
|
||||
struct hipDeviceProp_t prop;
|
||||
hipGetDeviceProperties(&prop, device);
|
||||
check_hip_error();
|
||||
return std::string(prop.name);
|
||||
}
|
||||
|
||||
int getDriver(void)
|
||||
{
|
||||
int driver;
|
||||
hipDriverGetVersion(&driver);
|
||||
check_hip_error();
|
||||
return driver;
|
||||
}
|
||||
|
||||
namespace ck {
|
||||
namespace app {
|
||||
/// Owner of a single device allocation of `mMemSize` bytes.
///
/// The constructor allocates the buffer and the destructor frees it, so the
/// type must not be copied: a copy would hold the same pointer and free it a
/// second time. Copy construction and copy assignment are therefore deleted,
/// which also suppresses the implicit move operations.
struct DeviceMem
{
    DeviceMem() = delete;
    /// Allocates `mem_size` bytes on the device.
    DeviceMem(std::size_t mem_size);
    DeviceMem(const DeviceMem&) = delete;
    DeviceMem& operator=(const DeviceMem&) = delete;
    /// Returns the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer();
    /// Copies `mMemSize` bytes from host memory `p` into the device buffer.
    void ToDevice(const void* p);
    /// Copies `mMemSize` bytes from the device buffer into host memory `p`.
    void FromDevice(void* p);
    /// Releases the device buffer.
    ~DeviceMem();

    void* mpDeviceBuf;    // device allocation owned by this object
    std::size_t mMemSize; // size of the allocation in bytes
};
|
||||
|
||||
/// Allocates `mem_size` bytes of device memory.
///
/// @throws std::runtime_error if hipMalloc fails. Previously the status was
///         only turned into a string and discarded, leaving `mpDeviceBuf`
///         indeterminate for every later copy and for the destructor.
///
/// Declared inline because the definition lives in a header.
inline DeviceMem::DeviceMem(std::size_t mem_size) : mpDeviceBuf(nullptr), mMemSize(mem_size)
{
    const hipError_t err = hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize);
    if(err != hipSuccess)
    {
        throw std::runtime_error(std::string("hipMalloc failed: ") + hipGetErrorString(err));
    }
}
|
||||
|
||||
/// Returns the raw device pointer held by this object (no ownership transfer).
/// Declared inline because the definition lives in a header.
inline void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
|
||||
|
||||
void DeviceMem::ToDevice(const void* p)
|
||||
{
|
||||
hipGetErrorString(
|
||||
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
void DeviceMem::FromDevice(void* p)
|
||||
{
|
||||
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
/// Releases the device buffer. A destructor must not throw, so the hipFree
/// status is deliberately discarded. Declared inline because the definition
/// lives in a header.
inline DeviceMem::~DeviceMem() { (void)hipFree(mpDeviceBuf); }
|
||||
|
||||
void profile_conv_fwd_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
ConvDataType data_type,
|
||||
ck::index_t N,
|
||||
ck::index_t K,
|
||||
ck::index_t C,
|
||||
std::vector<ck::index_t> input_spatial_lengths,
|
||||
std::vector<ck::index_t> filter_spatial_lengths,
|
||||
std::vector<ck::index_t> output_spatial_lengths,
|
||||
std::vector<ck::index_t> conv_filter_strides,
|
||||
std::vector<ck::index_t> conv_filter_dilations,
|
||||
std::vector<ck::index_t> input_left_pads,
|
||||
std::vector<ck::index_t> input_right_pads)
|
||||
{
|
||||
const ck::index_t Y = filter_spatial_lengths[0];
|
||||
const ck::index_t X = filter_spatial_lengths[1];
|
||||
|
||||
const ck::index_t Hi = input_spatial_lengths[0];
|
||||
const ck::index_t Wi = input_spatial_lengths[1];
|
||||
|
||||
const ck::index_t Ho = output_spatial_lengths[0];
|
||||
const ck::index_t Wo = output_spatial_lengths[1];
|
||||
|
||||
const auto in_sz = N * C * Hi * Wi;
|
||||
const auto wei_sz = K * C * Y * X;
|
||||
const auto out_sz = N * K * Ho * Wo;
|
||||
|
||||
using WeiDataType = float;
|
||||
using InDataType = float;
|
||||
using OutDataType = float;
|
||||
|
||||
app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz);
|
||||
app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz);
|
||||
app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz);
|
||||
// data is already on device!
|
||||
|
||||
// add device Conv instances
|
||||
std::vector<DeviceConvFwdPtr_t> conv_ptrs;
|
||||
if(data_type == F16_F16_F16)
|
||||
{
|
||||
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
|
||||
}
|
||||
else if(data_type == BF16_BF16_BF16)
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs);
|
||||
else if(data_type == F32_F32_F32)
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs);
|
||||
else if(data_type == INT8_INT8_INT8)
|
||||
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs);
|
||||
else
|
||||
throw std::runtime_error("wrong! Invalid data type");
|
||||
if(conv_ptrs.empty())
|
||||
{
|
||||
throw std::runtime_error("wrong! no device Conv instance found");
|
||||
}
|
||||
|
||||
std::string best_conv_name;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
int deviceIndex = 0;
|
||||
hipSetDevice(deviceIndex);
|
||||
check_hip_error();
|
||||
|
||||
StreamConfig stream_config{nullptr, time_kernel};
|
||||
hipStreamCreate(&stream_config.stream_id_);
|
||||
check_hip_error();
|
||||
|
||||
// profile device Conv instances
|
||||
for(auto& conv_ptr : conv_ptrs)
|
||||
{
|
||||
auto argument_ptr =
|
||||
conv_ptr.MakeArgumentPointer(static_cast<void*>(in_device_buf.GetDeviceBuffer()),
|
||||
static_cast<void*>(wei_device_buf.GetDeviceBuffer()),
|
||||
static_cast<void*>(out_device_buf.GetDeviceBuffer()),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
output_spatial_lengths,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads);
|
||||
|
||||
auto invoker_ptr = conv_ptr.MakeInvokerPointer();
|
||||
|
||||
if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
std::string conv_name = conv_ptr.GetTypeString();
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config);
|
||||
|
||||
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
|
||||
|
||||
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
|
||||
sizeof(WeiDataType) * (K * C * Y * X) +
|
||||
sizeof(OutDataType) * (N * K * Ho * Wo);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
|
||||
<< " GB/s, " << conv_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_conv_name = conv_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
|
||||
}
|
||||
|
||||
} // namespace app
|
||||
} // namespace ck
|
||||
@@ -1,7 +1,2 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
|
||||
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
|
||||
|
||||
@@ -2,12 +2,10 @@
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "conv_util.hpp"
|
||||
#include "profile_conv_bwd_weight_impl.hpp"
|
||||
#include "test/convnd_fwd/conv_util.hpp"
|
||||
#include "profiler/include/profile_conv_bwd_weight_impl.hpp"
|
||||
|
||||
int test_self()
|
||||
{
|
||||
|
||||
@@ -3,10 +3,11 @@
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "conv_util.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
#include "check_err.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/conv_util.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -1,7 +1,2 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
|
||||
target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_util)
|
||||
|
||||
@@ -2,11 +2,9 @@
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "profile_convnd_bwd_data_impl.hpp"
|
||||
#include "profiler/include/profile_convnd_bwd_data_impl.hpp"
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#include <iostream>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include "gtest/gtest.h"
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "data_type.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "library/include/ck/library/utility/conv_util.hpp"
|
||||
#include "conv_util.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/conv_util.hpp"
|
||||
#include "test/convnd_fwd/conv_util.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include "gtest/gtest.h"
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/conv_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "conv_util.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "fill.hpp"
|
||||
#include "test/convnd_fwd/conv_util.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include "gtest/gtest.h"
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "data_type.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "library/include/ck/library/utility/conv_util.hpp"
|
||||
#include "conv_util.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/conv_util.hpp"
|
||||
|
||||
#include "test/convnd_fwd/conv_util.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "data_type.hpp"
|
||||
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "sequence.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
|
||||
@@ -1,23 +1,22 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "../gemm/gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_dl.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,23 +1,22 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "../gemm/gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_dl.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,23 +1,22 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "../gemm/gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_dl.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
#ifndef GEMM_UTILS_HPP
|
||||
#define GEMM_UTILS_HPP
|
||||
#pragma once
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace gemm_util {
|
||||
@@ -350,4 +349,3 @@ struct TestGemmBF16
|
||||
|
||||
} // namespace gemm_util
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
@@ -1,24 +1,22 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_xdl.hpp"
|
||||
#include "device_gemm_xdl_cshuffle.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,21 +1,23 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_xdl.hpp"
|
||||
#include "device_gemm_xdl_cshuffle.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,24 +1,23 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_xdl.hpp"
|
||||
#include "device_gemm_xdl_cshuffle.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_xdl.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,24 +1,23 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "gemm_util.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_gemm_xdl.hpp"
|
||||
#include "device_gemm_xdl_cshuffle.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "test/gemm/gemm_util.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -1,9 +1,3 @@
|
||||
include_directories(BEFORE
|
||||
${PROJECT_SOURCE_DIR}/profiler/include
|
||||
${PROJECT_SOURCE_DIR}/test/include
|
||||
${PROJECT_SOURCE_DIR}/external/include/half
|
||||
)
|
||||
|
||||
add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
|
||||
target_link_libraries(test_gemm_reduce_fp16 PRIVATE host_tensor)
|
||||
target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "profile_gemm_reduce_impl.hpp"
|
||||
#include "profiler/include/profile_gemm_reduce_impl.hpp"
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
@@ -1,16 +1,21 @@
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
#include "device_gemm_xdl_splitk.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "ck/library/host_tensor/host_gemm.hpp"
|
||||
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
|
||||
@@ -2,21 +2,18 @@
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "print.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "host_gemm.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "device_grouped_gemm_xdl.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "reference_gemm.hpp"
|
||||
#include "gemm_specialization.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
|
||||
@@ -2,16 +2,13 @@
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#include <half.hpp>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "magic_division.hpp"
|
||||
#include "device.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "host_tensor_generator.hpp"
|
||||
#include "device_tensor.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/magic_division.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor_generator.hpp"
|
||||
|
||||
__global__ void gpu_magic_number_division(uint32_t magic_multiplier,
|
||||
uint32_t magic_shift,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "getopt.h"
|
||||
#include <getopt.h>
|
||||
|
||||
#include "host_common_util.hpp"
|
||||
#include "profile_reduce_impl.hpp"
|
||||
#include "ck/library/host_tensor/host_common_util.hpp"
|
||||
#include "profiler/include/profile_reduce_impl.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "getopt.h"
|
||||
#include <getopt.h>
|
||||
|
||||
#include "host_common_util.hpp"
|
||||
#include "profile_reduce_impl.hpp"
|
||||
#include "ck/library/host_tensor/host_common_util.hpp"
|
||||
#include "profiler/include/profile_reduce_impl.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <half.hpp>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
#include "gtest/gtest.h"
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "check_err.hpp"
|
||||
#include "config.hpp"
|
||||
#include "conv_util.hpp"
|
||||
#include "element_wise_operation.hpp"
|
||||
#include "fill.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "reference_conv_fwd.hpp"
|
||||
#include "tensor_layout.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/conv_util.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
namespace {
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "gtest/gtest.h"
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "config.hpp"
|
||||
#include "host_tensor.hpp"
|
||||
#include "check_err.hpp"
|
||||
#include "number.hpp"
|
||||
#include "reference_softmax.hpp"
|
||||
#include "device_softmax.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
|
||||
#include "tensor_space_filling_curve.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
|
||||
|
||||
using namespace ck;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user