mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-24 14:54:47 +00:00
Extend permute scale support up to 6D (#1168)
* Extend permute scale support up to 6D
* Fixes
* Fixes
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
* Update profiler/README.md
Co-authored-by: Lisa <lisajdelaney@gmail.com>
---------
Co-authored-by: Lisa <lisajdelaney@gmail.com>
[ROCm/composable_kernel commit: 66736edb95]
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -322,6 +322,19 @@ struct DeviceElementwiseImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
{
|
||||
return std::make_unique<Invoker>();
|
||||
};
|
||||
|
||||
std::string GetTypeString() const override
|
||||
{
|
||||
auto str = std::stringstream();
|
||||
|
||||
// clang-format off
|
||||
str << "DeviceElementwiseNormalizationImpl<";
|
||||
str << NumDim << ", ";
|
||||
str << MPerThread << ">";
|
||||
// clang-format on
|
||||
|
||||
return str.str();
|
||||
}
|
||||
}; // namespace device
|
||||
|
||||
} // namespace device
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -17,7 +17,32 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_f16_instances(
|
||||
#ifdef CK_ENABLE_FP16
|
||||
void add_device_permute_scale_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>,
|
||||
ck::Tuple<F16>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
1>>>&);
|
||||
|
||||
void add_device_permute_scale_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>,
|
||||
ck::Tuple<F16>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
2>>>&);
|
||||
|
||||
void add_device_permute_scale_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>,
|
||||
ck::Tuple<F16>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
3>>>&);
|
||||
|
||||
void add_device_permute_scale_4d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>,
|
||||
ck::Tuple<F16>,
|
||||
PassThrough,
|
||||
@@ -25,7 +50,50 @@ void add_device_permute_scale_f16_instances(
|
||||
Scale,
|
||||
4>>>&);
|
||||
|
||||
void add_device_permute_scale_f32_instances(
|
||||
void add_device_permute_scale_5d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>,
|
||||
ck::Tuple<F16>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
5>>>&);
|
||||
|
||||
void add_device_permute_scale_6d_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F16>,
|
||||
ck::Tuple<F16>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
6>>>&);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef CK_ENABLE_FP32
|
||||
void add_device_permute_scale_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>,
|
||||
ck::Tuple<F32>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
1>>>&);
|
||||
|
||||
void add_device_permute_scale_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>,
|
||||
ck::Tuple<F32>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
2>>>&);
|
||||
|
||||
void add_device_permute_scale_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>,
|
||||
ck::Tuple<F32>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
3>>>&);
|
||||
|
||||
void add_device_permute_scale_4d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>,
|
||||
ck::Tuple<F32>,
|
||||
PassThrough,
|
||||
@@ -33,6 +101,23 @@ void add_device_permute_scale_f32_instances(
|
||||
Scale,
|
||||
4>>>&);
|
||||
|
||||
void add_device_permute_scale_5d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>,
|
||||
ck::Tuple<F32>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
5>>>&);
|
||||
|
||||
void add_device_permute_scale_6d_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>,
|
||||
ck::Tuple<F32>,
|
||||
PassThrough,
|
||||
element_wise::UnarySquare,
|
||||
Scale,
|
||||
6>>>&);
|
||||
#endif
|
||||
|
||||
template <typename InDataTypeTuple,
|
||||
typename OutDataTypeTuple,
|
||||
typename ElementwiseOperation,
|
||||
@@ -57,15 +142,107 @@ struct DeviceOperationInstanceFactory<
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
if constexpr(NumDim == 1)
|
||||
{
|
||||
add_device_permute_scale_f32_instances(op_ptrs);
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_permute_scale_1d_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_permute_scale_1d_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
else if constexpr(NumDim == 2)
|
||||
{
|
||||
add_device_permute_scale_f16_instances(op_ptrs);
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_permute_scale_2d_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_permute_scale_2d_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if constexpr(NumDim == 3)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_permute_scale_3d_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_permute_scale_3d_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if constexpr(NumDim == 4)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_permute_scale_4d_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_permute_scale_4d_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if constexpr(NumDim == 5)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_permute_scale_5d_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_permute_scale_5d_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if constexpr(NumDim == 6)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
|
||||
{
|
||||
add_device_permute_scale_6d_f32_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
#ifdef CK_ENABLE_FP16
|
||||
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
|
||||
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
|
||||
{
|
||||
add_device_permute_scale_6d_f16_instances(op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return op_ptrs;
|
||||
}
|
||||
|
||||
@@ -1,56 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Pass = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
// clang-format off
|
||||
using device_permute_scale_f16_instances =
|
||||
std::tuple <
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 1, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 2, ck::Sequence<1>, ck::Sequence<1>>
|
||||
>;
|
||||
|
||||
using device_permute_scale_f32_instances = std::tuple<
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 1, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 2, ck::Sequence<1>, ck::Sequence<1>>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
void add_device_permute_scale_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Pass = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
// clang-format off
|
||||
template <index_t NDims>
|
||||
using device_permute_scale_f16_instances =
|
||||
std::tuple <
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, NDims, 1, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, NDims, 8, ck::Sequence<8>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, NDims, 4, ck::Sequence<4>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, NDims, 2, ck::Sequence<2>, ck::Sequence<1>>
|
||||
>;
|
||||
|
||||
template <index_t NDims>
|
||||
using device_permute_scale_f32_instances = std::tuple<
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, NDims, 1, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, NDims, 8, ck::Sequence<8>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, NDims, 4, ck::Sequence<4>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, NDims, 2, ck::Sequence<2>, ck::Sequence<1>>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,2 +1,7 @@
|
||||
add_instance_library(device_permute_scale_instance
|
||||
device_permute_scale_instances.cpp)
|
||||
device_permute_scale_1d_instances.cpp
|
||||
device_permute_scale_2d_instances.cpp
|
||||
device_permute_scale_3d_instances.cpp
|
||||
device_permute_scale_4d_instances.cpp
|
||||
device_permute_scale_5d_instances.cpp
|
||||
device_permute_scale_6d_instances.cpp)
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_1d_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 1>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances<1>{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_1d_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 1>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances<1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_2d_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 2>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances<2>{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_2d_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 2>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances<2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_3d_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 3>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances<3>{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_3d_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 3>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances<3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_4d_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances<4>{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_4d_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances<4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_5d_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 5>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances<5>{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_5d_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 5>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances<5>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_permute_scale_6d_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 6>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f16_instances<6>{});
|
||||
}
|
||||
|
||||
void add_device_permute_scale_6d_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 6>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_permute_scale_f32_instances<6>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -37,9 +37,9 @@ Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
|
||||
################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
|
||||
./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
|
||||
```
|
||||
|
||||
```bash
|
||||
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
|
||||
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
|
||||
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
|
||||
@@ -104,6 +104,7 @@ arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
|
||||
arg.e_grid_desc_m_n_{ 4096, 4096}
|
||||
....
|
||||
Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
|
||||
```
|
||||
## Profile grouped convolution backward data kernels
|
||||
```bash
|
||||
# arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data)
|
||||
@@ -129,10 +130,11 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
|
||||
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
|
||||
./bin/ckProfiler grouped_conv_bwd_data 1 0 1 1 0 1 2 32 4 192 192 3 3 28 28 1 1 1 1 1 1 1 1
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
|
||||
```
|
||||
|
||||
```bash
|
||||
out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
|
||||
wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192}
|
||||
in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
|
||||
@@ -173,10 +175,11 @@ GB/s: 127.947
|
||||
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx SplitK
|
||||
./bin/ckProfiler grouped_conv_bwd_weight 1 1 0 1 0 1 2 32 256 256 512 3 3 28 28 1 1 1 1 1 0 0 0 1
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
|
||||
```
|
||||
|
||||
```bash
|
||||
input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024}
|
||||
weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024}
|
||||
output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512}
|
||||
@@ -190,8 +193,9 @@ GB/s: 69.2301
|
||||
Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
|
||||
|
||||
## Profile image to column/column to image kernels
|
||||
|
||||
```bash
|
||||
# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
|
||||
# arg1: tensor operation ( conv_tensor_rearrange : Conv Tensor Rearrange )
|
||||
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
|
||||
# 1: Input fp16, Weight fp16, Output fp16
|
||||
# 2: Input bf16, Weight bf16, Output bf16
|
||||
@@ -216,10 +220,11 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
|
||||
################ op datatype layout verify init log time opType Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
|
||||
./bin/ckProfiler conv_tensor_rearrange 0 0 0 1 0 1 0 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
Result (MI210, FP32, NHWC)
|
||||
```
|
||||
|
||||
```bash
|
||||
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
|
||||
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
|
||||
....
|
||||
@@ -229,3 +234,30 @@ avg_time: 3.12326
|
||||
GB/s: 2042.59
|
||||
```
|
||||
Note: Column to image kernel adds to the output memory, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
|
||||
|
||||
## Profile Permute scale kernels
|
||||
|
||||
```bash
|
||||
# arg1: tensor operation ( permute_scale : Permute Scale )
|
||||
# arg2: data type (0: Input fp32, Output fp32
|
||||
# 1: Input fp16, Output fp16
|
||||
# arg4: verification (0: no, 1: yes)
|
||||
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
|
||||
# arg6: print tensor value (0: no; 1: yes)
|
||||
# arg7: time kernel (0: no, 1: yes)
|
||||
# from arg8: tensor lengths
|
||||
# input strides
|
||||
# output strides
|
||||
|
||||
################ op datatype verify init log time dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
|
||||
./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096
|
||||
```
|
||||
|
||||
Result (MI100, FP32)
|
||||
|
||||
```bash
|
||||
A: dim 3, lengths {64, 64, 64}, strides {4096, 64, 1}
|
||||
B: dim 3, lengths {64, 64, 64}, strides {1, 64, 4096}
|
||||
....
|
||||
Best perf = 0.0146878 ms, 142.782 GB/s, DeviceElementwiseNormalizationImpl<3, 2>
|
||||
```
|
||||
|
||||
@@ -1,212 +1,188 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
|
||||
namespace ck {
|
||||
template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
|
||||
void host_elementwise4D(HostTensorB& B_nhwc,
|
||||
const HostTensorA& A_nchw,
|
||||
FunctorA functor_a,
|
||||
FunctorB functor_b,
|
||||
float scale)
|
||||
{
|
||||
std::size_t N = A_nchw.mDesc.GetLengths()[0];
|
||||
std::size_t C = A_nchw.mDesc.GetLengths()[1];
|
||||
std::size_t H = A_nchw.mDesc.GetLengths()[2];
|
||||
std::size_t W = A_nchw.mDesc.GetLengths()[3];
|
||||
for(std::size_t w = 0; w < W; ++w)
|
||||
for(std::size_t h = 0; h < H; ++h)
|
||||
for(std::size_t c = 0; c < C; ++c)
|
||||
for(std::size_t n = 0; n < N; ++n)
|
||||
{
|
||||
using tmp_type = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
|
||||
tmp_type tmp_val = 0;
|
||||
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
|
||||
functor_b(tmp_val, a_val);
|
||||
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
|
||||
scale * tmp_val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ADataType, typename BDataType, index_t NumDim>
|
||||
bool test_permute_scale_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
std::vector<index_t> lengths)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
float scale = 2.f;
|
||||
|
||||
index_t N = lengths[0];
|
||||
index_t C = lengths[1];
|
||||
index_t H = lengths[2];
|
||||
index_t W = lengths[3];
|
||||
|
||||
std::vector<ck::index_t> nchw = {N, C, H, W};
|
||||
std::vector<ck::index_t> nhwc = {N, H, W, C};
|
||||
Tensor<ADataType> a(nchw);
|
||||
Tensor<BDataType> b(nhwc);
|
||||
Tensor<BDataType> host_b(nhwc);
|
||||
|
||||
std::array<ck::index_t, 4> ab_lengths;
|
||||
|
||||
std::array<ck::index_t, 4> a_strides = {1,
|
||||
static_cast<int>(nchw[0]),
|
||||
static_cast<int>(nchw[0] * nchw[1]),
|
||||
static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
|
||||
|
||||
std::array<ck::index_t, 4> b_strides = {1,
|
||||
static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
|
||||
static_cast<int>(nhwc[0]),
|
||||
static_cast<int>(nhwc[0] * nhwc[1])};
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::cout << "A: " << a.mDesc << std::endl;
|
||||
std::cout << "B: " << b.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
|
||||
default: // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}
|
||||
std::mt19937 gen(11939);
|
||||
std::uniform_int_distribution<int> dis(0, 1);
|
||||
auto i = 0;
|
||||
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
|
||||
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
|
||||
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
|
||||
for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
|
||||
{
|
||||
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
|
||||
(h * nchw[3]) + w] = i;
|
||||
i = dis(gen);
|
||||
}
|
||||
}
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
ElementOp,
|
||||
UnaryOp,
|
||||
Scale,
|
||||
NumDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
|
||||
}
|
||||
|
||||
for(auto& op_ptr : op_ptrs)
|
||||
{
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
|
||||
{a_strides},
|
||||
{b_strides},
|
||||
input,
|
||||
output,
|
||||
ElementOp{},
|
||||
UnaryOp{},
|
||||
Scale{scale});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
b_device_buf.SetZero();
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
|
||||
|
||||
std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
|
||||
sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_instance_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
if(time_kernel)
|
||||
{
|
||||
LogRange(std::cout << "length = ", lengths, ",") << ", ";
|
||||
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_instance_name << std::endl;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
|
||||
namespace ck {
|
||||
template <typename HostTensorA,
|
||||
typename HostTensorB,
|
||||
typename AElementOp,
|
||||
typename BElementOp,
|
||||
typename ScaleElementOp>
|
||||
void reference_permute_scale(HostTensorB& b_tensor,
|
||||
const HostTensorA& a_tensor,
|
||||
AElementOp a_tensor_op,
|
||||
BElementOp b_tensor_op,
|
||||
ScaleElementOp scale_op)
|
||||
{
|
||||
b_tensor.ForEach([&](auto& self, auto idx) {
|
||||
auto tmp_val = a_tensor(idx);
|
||||
b_tensor_op(tmp_val, tmp_val);
|
||||
scale_op(tmp_val, tmp_val);
|
||||
a_tensor_op(self(idx), tmp_val);
|
||||
});
|
||||
}
|
||||
|
||||
namespace profiler {
|
||||
|
||||
template <typename ADataType, typename BDataType, index_t NumDim>
|
||||
bool profile_permute_scale_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
std::vector<index_t> lengths_vector,
|
||||
std::vector<index_t> input_strides_vector,
|
||||
std::vector<index_t> output_strides_vector)
|
||||
{
|
||||
bool pass = true;
|
||||
bool instance_found = false;
|
||||
|
||||
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
float scale = 2.f;
|
||||
|
||||
Tensor<ADataType> a(lengths_vector, input_strides_vector);
|
||||
Tensor<BDataType> b(lengths_vector, output_strides_vector);
|
||||
Tensor<BDataType> host_b(lengths_vector, output_strides_vector);
|
||||
|
||||
std::cout << "A: " << a.mDesc << std::endl;
|
||||
std::cout << "B: " << b.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
|
||||
default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); break;
|
||||
}
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
ElementOp,
|
||||
UnaryOp,
|
||||
Scale,
|
||||
NumDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
reference_permute_scale(host_b, a, ElementOp{}, UnaryOp{}, Scale{scale});
|
||||
}
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
|
||||
std::array<ck::index_t, NumDim> lengths{};
|
||||
std::array<ck::index_t, NumDim> input_strides{};
|
||||
std::array<ck::index_t, NumDim> output_strides{};
|
||||
copy(lengths_vector, lengths);
|
||||
copy(input_strides_vector, input_strides);
|
||||
copy(output_strides_vector, output_strides);
|
||||
|
||||
for(auto& op_ptr : op_ptrs)
|
||||
{
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(lengths,
|
||||
{input_strides},
|
||||
{output_strides},
|
||||
input,
|
||||
output,
|
||||
ElementOp{},
|
||||
UnaryOp{},
|
||||
Scale{scale});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
instance_found = true;
|
||||
|
||||
b_device_buf.SetZero();
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * a.mDesc.GetElementSpaceSize() / sizeof(ADataType);
|
||||
|
||||
std::size_t num_btype = sizeof(ADataType) * a.mDesc.GetElementSpaceSize() +
|
||||
sizeof(BDataType) * b.mDesc.GetElementSpaceSize();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_instance_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
if(time_kernel)
|
||||
{
|
||||
std::cout << "Best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_instance_name << std::endl;
|
||||
}
|
||||
|
||||
return pass && instance_found;
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace ck
|
||||
@@ -32,6 +32,7 @@ set(PROFILER_SOURCES
|
||||
profile_grouped_conv_bwd_data.cpp
|
||||
profile_conv_tensor_rearrange.cpp
|
||||
profile_transpose.cpp
|
||||
profile_permute_scale.cpp
|
||||
)
|
||||
|
||||
if(DL_KERNELS)
|
||||
@@ -99,6 +100,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_d
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
|
||||
|
||||
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
|
||||
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
|
||||
|
||||
170
profiler/src/profile_permute_scale.cpp
Normal file
170
profiler/src/profile_permute_scale.cpp
Normal file
@@ -0,0 +1,170 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/profile_permute_scale_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
enum struct DataType
|
||||
{
|
||||
F32_F32, // 0
|
||||
F16_F16 // 1
|
||||
};
|
||||
|
||||
#define OP_NAME "permute_scale"
|
||||
#define OP_DESC "Permute Scale"
|
||||
|
||||
static void print_helper_msg()
|
||||
{
|
||||
std::cout
|
||||
// clang-format off
|
||||
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Output fp32\n"
|
||||
<< " 1: Input fp16, Output fp16\n"
|
||||
<< "arg4: verification (0: no, 1: yes)\n"
|
||||
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
|
||||
<< "arg6: print tensor value (0: no; 1: yes)\n"
|
||||
<< "arg7: time kernel (0: no, 1: yes)\n"
|
||||
<< "from arg8: tensor lengths\n"
|
||||
<< " input strides\n"
|
||||
<< " output strides\n" << std::endl;
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int profile_permute_scale(int argc, char* argv[])
|
||||
{
|
||||
constexpr int control_argc = 7;
|
||||
const int dims_argc = argc - control_argc;
|
||||
// Number of lenghs, input strides and outputs strides must be equal
|
||||
if(argc < control_argc && dims_argc % 3 != 0)
|
||||
{
|
||||
print_helper_msg();
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
|
||||
const bool do_verification = std::stoi(argv[3]);
|
||||
const int init_method = std::stoi(argv[4]);
|
||||
const bool do_log = std::stoi(argv[5]);
|
||||
const bool time_kernel = std::stoi(argv[6]);
|
||||
const int num_dims = dims_argc / 3;
|
||||
|
||||
std::vector<ck::index_t> lengths(num_dims);
|
||||
std::vector<ck::index_t> input_strides(num_dims);
|
||||
std::vector<ck::index_t> output_strides(num_dims);
|
||||
|
||||
for(int i = 0; i < num_dims; i++)
|
||||
{
|
||||
lengths[i] = std::stoi(argv[control_argc + i]);
|
||||
input_strides[i] = std::stoi(argv[control_argc + num_dims + i]);
|
||||
output_strides[i] = std::stoi(argv[control_argc + 2 * num_dims + i]);
|
||||
}
|
||||
|
||||
using F32 = float;
|
||||
using F16 = ck::half_t;
|
||||
|
||||
constexpr auto I1 = ck::Number<1>{};
|
||||
constexpr auto I2 = ck::Number<2>{};
|
||||
constexpr auto I3 = ck::Number<3>{};
|
||||
constexpr auto I4 = ck::Number<4>{};
|
||||
constexpr auto I5 = ck::Number<5>{};
|
||||
constexpr auto I6 = ck::Number<6>{};
|
||||
|
||||
auto profile = [&](auto num_dim_tmp, auto in_type, auto out_type) {
|
||||
constexpr ck::index_t NDim = num_dim_tmp.value;
|
||||
|
||||
using InDataType = decltype(in_type);
|
||||
using OutDataType = decltype(out_type);
|
||||
|
||||
bool pass =
|
||||
ck::profiler::profile_permute_scale_impl<InDataType, OutDataType, NDim>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
lengths,
|
||||
input_strides,
|
||||
output_strides);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
};
|
||||
|
||||
if(num_dims == 1)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I1, F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I1, F16{}, F16{});
|
||||
}
|
||||
}
|
||||
else if(num_dims == 2)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I2, F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I2, F16{}, F16{});
|
||||
}
|
||||
}
|
||||
else if(num_dims == 3)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I3, F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I3, F16{}, F16{});
|
||||
}
|
||||
}
|
||||
else if(num_dims == 4)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I4, F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I4, F16{}, F16{});
|
||||
}
|
||||
}
|
||||
else if(num_dims == 5)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I5, F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I5, F16{}, F16{});
|
||||
}
|
||||
}
|
||||
else if(num_dims == 6)
|
||||
{
|
||||
if(data_type == DataType::F32_F32)
|
||||
{
|
||||
return profile(I6, F32{}, F32{});
|
||||
}
|
||||
else if(data_type == DataType::F16_F16)
|
||||
{
|
||||
return profile(I6, F16{}, F16{});
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "this data_type & layout is not implemented" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_permute_scale);
|
||||
@@ -1,8 +1,8 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test_permute_scale_impl.hpp"
|
||||
#include "profiler/profile_permute_scale_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
@@ -15,15 +15,32 @@ class TestPermute : public ::testing::Test
|
||||
using ADataType = std::tuple_element_t<0, Tuple>;
|
||||
using BDataType = std::tuple_element_t<1, Tuple>;
|
||||
|
||||
void Run()
|
||||
constexpr bool skip_case()
|
||||
{
|
||||
std::vector<std::vector<ck::index_t>> lengths = {
|
||||
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 64}, {32, 64, 128, 128}};
|
||||
|
||||
for(auto length : lengths)
|
||||
#ifndef CK_ENABLE_FP16
|
||||
if constexpr(ck::is_same_v<ADataType, F16> || ck::is_same_v<BDataType, F16>)
|
||||
{
|
||||
bool success =
|
||||
ck::test_permute_scale_impl<ADataType, BDataType, 4>(true, 2, false, false, length);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#ifndef CK_ENABLE_FP32
|
||||
if constexpr(ck::is_same_v<ADataType, F32> || ck::is_same_v<BDataType, F32>)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
template <ck::index_t NDims>
|
||||
void Run(std::vector<ck::index_t> lengths,
|
||||
std::vector<ck::index_t> input_strides,
|
||||
std::vector<ck::index_t> output_strides)
|
||||
{
|
||||
if(!skip_case())
|
||||
{
|
||||
bool success = ck::profiler::profile_permute_scale_impl<ADataType, BDataType, NDims>(
|
||||
true, 2, false, false, lengths, input_strides, output_strides);
|
||||
EXPECT_TRUE(success);
|
||||
}
|
||||
}
|
||||
@@ -32,5 +49,52 @@ class TestPermute : public ::testing::Test
|
||||
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
|
||||
|
||||
TYPED_TEST_SUITE(TestPermute, KernelTypes);
|
||||
TYPED_TEST(TestPermute, Test_FP16) { this->Run(); }
|
||||
TYPED_TEST(TestPermute, Test_FP32) { this->Run(); }
|
||||
TYPED_TEST(TestPermute, Test1D)
|
||||
{
|
||||
constexpr ck::index_t NumDims = 1;
|
||||
this->template Run<NumDims>({8}, {1}, {2});
|
||||
this->template Run<NumDims>({8}, {2}, {1});
|
||||
this->template Run<NumDims>({1}, {1}, {1});
|
||||
}
|
||||
|
||||
TYPED_TEST(TestPermute, Test2D)
|
||||
{
|
||||
constexpr ck::index_t NumDims = 2;
|
||||
this->template Run<NumDims>({8, 4}, {4, 1}, {1, 8});
|
||||
this->template Run<NumDims>({8, 4}, {1, 8}, {4, 1});
|
||||
this->template Run<NumDims>({1, 1}, {1, 1}, {1, 1});
|
||||
}
|
||||
|
||||
TYPED_TEST(TestPermute, Test3D)
|
||||
{
|
||||
constexpr ck::index_t NumDims = 3;
|
||||
this->template Run<NumDims>({2, 4, 4}, {16, 4, 1}, {1, 2, 8});
|
||||
this->template Run<NumDims>({2, 4, 4}, {1, 2, 8}, {16, 4, 1});
|
||||
this->template Run<NumDims>({1, 1, 1}, {1, 1, 1}, {1, 1, 1});
|
||||
}
|
||||
|
||||
TYPED_TEST(TestPermute, Test4D)
|
||||
{
|
||||
constexpr ck::index_t NumDims = 4;
|
||||
this->template Run<NumDims>({2, 4, 4, 4}, {64, 16, 4, 1}, {1, 2, 8, 32});
|
||||
this->template Run<NumDims>({2, 4, 4, 4}, {1, 2, 8, 32}, {64, 16, 4, 1});
|
||||
this->template Run<NumDims>({1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1});
|
||||
}
|
||||
|
||||
TYPED_TEST(TestPermute, Test5D)
|
||||
{
|
||||
constexpr ck::index_t NumDims = 5;
|
||||
this->template Run<NumDims>({2, 4, 4, 4, 4}, {256, 64, 16, 4, 1}, {1, 2, 8, 32, 128});
|
||||
this->template Run<NumDims>({2, 4, 4, 4, 4}, {1, 2, 8, 32, 128}, {256, 64, 16, 4, 1});
|
||||
this->template Run<NumDims>({1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1});
|
||||
}
|
||||
|
||||
TYPED_TEST(TestPermute, Test6D)
|
||||
{
|
||||
constexpr ck::index_t NumDims = 6;
|
||||
this->template Run<NumDims>(
|
||||
{2, 4, 4, 4, 4, 4}, {1024, 256, 64, 16, 4, 1}, {1, 2, 8, 32, 128, 512});
|
||||
this->template Run<NumDims>(
|
||||
{2, 4, 4, 4, 4, 4}, {1, 2, 8, 32, 128, 512}, {1024, 256, 64, 16, 4, 1});
|
||||
this->template Run<NumDims>({1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user