From 7db609fe32101dd3c3f69e99827af4ff8b8780c8 Mon Sep 17 00:00:00 2001 From: dummycoderfe Date: Sun, 27 Oct 2024 15:05:52 +0000 Subject: [PATCH] build ok --- CMakeLists.txt | 4 - cmake/EnableCompilerWarnings.cmake | 1 - .../ck_tile/06_layernorm2d_bwd/CMakeLists.txt | 21 ++ example/ck_tile/06_layernorm2d_bwd/README.md | 22 +++ .../instances/layernorm2d_bwd_api.cpp | 44 +++++ ...layernorm2d_bwd_bf16_n64_n128_instance.cpp | 11 ++ .../layernorm2d_bwd_instance_common.hpp | 55 ++++++ .../06_layernorm2d_bwd/layernorm2d_bwd.cpp | 179 ++++++++++++++++++ .../06_layernorm2d_bwd/layernorm2d_bwd.hpp | 113 +++++++++++ .../06_layernorm2d_bwd/script/perf_test.sh | 38 ++++ .../06_layernorm2d_bwd/script/smoke_test.sh | 31 +++ example/ck_tile/CMakeLists.txt | 1 + include/ck_tile/ops/layernorm2d.hpp | 7 + .../layernorm2d_bwd_gamma_beta_kernel.hpp | 6 +- ...ayernorm2d_bwd_pipeline_default_policy.hpp | 16 +- ...> layernorm2d_bwd_pipeline_gamma_beta.hpp} | 44 ++++- .../layernorm2d_bwd_pipeline_problem.hpp | 4 +- .../layernorm2d_fwd_pipeline_problem.hpp | 29 --- 18 files changed, 579 insertions(+), 47 deletions(-) create mode 100644 example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt create mode 100644 example/ck_tile/06_layernorm2d_bwd/README.md create mode 100644 example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp create mode 100644 example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp create mode 100644 example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp create mode 100644 example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp create mode 100644 example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp create mode 100755 example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh create mode 100755 example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh rename include/ck_tile/ops/layernorm2d/pipeline/{layernorm2d_bwd_gamma_beta.hpp => layernorm2d_bwd_pipeline_gamma_beta.hpp} (64%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a51803639..5d98c5bdd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -495,10 +495,6 @@ include_directories(BEFORE ) SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") -if(BUILD_DEV) - add_compile_options(-Werror) - add_compile_options(-Weverything) -endif() message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake index 93fd306e98..d5bcd6f978 100644 --- a/cmake/EnableCompilerWarnings.cmake +++ b/cmake/EnableCompilerWarnings.cmake @@ -66,7 +66,6 @@ else() -Wunreachable-code -Wunused -Wno-reserved-identifier - -Werror -Wno-option-ignored -Wsign-compare -Wno-extra-semi-stmt diff --git a/example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt b/example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt new file mode 100644 index 0000000000..0281f7a4a9 --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt @@ -0,0 +1,21 @@ +set(EXAMPLE_LAYERNORM2D_BWD "tile_example_layernorm2d_bwd") +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +message("adding example ${EXAMPLE_LAYERNORM2D_BWD}") +file(GLOB INSTANCE_SRCS instances/*.cpp) +add_executable(${EXAMPLE_LAYERNORM2D_BWD} EXCLUDE_FROM_ALL layernorm2d_bwd.cpp) +target_include_directories(${EXAMPLE_LAYERNORM2D_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) +target_sources(${EXAMPLE_LAYERNORM2D_BWD} PRIVATE ${INSTANCE_SRCS}) + +set(EXAMPLE_layernorm2d_bwd_COMPILE_OPTIONS) + +# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations +list(APPEND EXAMPLE_layernorm2d_bwd_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + +target_compile_options(${EXAMPLE_LAYERNORM2D_BWD} PRIVATE ${EXAMPLE_layernorm2d_bwd_COMPILE_OPTIONS}) + +# TODO: we have to turn off this global prop, otherwise the progress bar generated +# by cmake will print too many files, execvp: /bin/sh: Argument list too long +# however, this property may affect global +# TODO: consider codegen a makefile by us +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/06_layernorm2d_bwd/README.md b/example/ck_tile/06_layernorm2d_bwd/README.md new file mode 100644 index 0000000000..8969fc297a --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/README.md @@ -0,0 +1,22 @@ +# Layernorm2D forward + +This folder contains example for Layernorm2D forward using ck_tile tile-programming implementation. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +make tile_example_layernorm2d_bwd -j +``` +This will result in an executable `build/bin/tile_example_layernorm2d_bwd` + +## example +``` +args: + -m m dimension (default:3328) + -n m dimension (default:4096) + -e epsilon (default:1e-5) + -v cpu validation or not (default:1) + -prec precision (default:fp16) +``` diff --git a/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp b/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp new file mode 100644 index 0000000000..387379fa3a --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "layernorm2d_bwd.hpp" + +template +using trait_ = layernorm2d_bwd_traits_; + +template +float layernorm2d_bwd_b16_(layernorm2d_bwd_traits /*t*/, + layernorm2d_bwd_args a, + const ck_tile::stream_config& s) +{ + return layernorm2d_bwd_>(s, a); +} + +float layernorm2d_bwd(layernorm2d_bwd_traits t, + layernorm2d_bwd_args a, + const ck_tile::stream_config& s) +{ + + float r = -1; + if(t.data_type.compare("fp16") == 0) + { + return layernorm2d_bwd_b16_(t, a, s); + } + else if(t.data_type.compare("bf16") == 0) + { + return layernorm2d_bwd_b16_(t, a, s); + } + if(r < 0) + throw std::runtime_error("Without supported instances!"); + + return r; +} diff --git a/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp b/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp new file mode 100644 index 0000000000..3965838c8c --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp @@ -0,0 +1,11 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "layernorm2d_bwd_instance_common.hpp" + +// clang-format off +// rm tm tn pd +template float layernorm2d_bwd_>(const S&, A); +template float layernorm2d_bwd_>(const S&, A); +// clang-format on diff --git a/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp b/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp new file mode 100644 index 0000000000..1ac58c5be2 --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp @@ -0,0 +1,55 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include "layernorm2d_bwd.hpp" +#include + +#pragma once + +using S = ck_tile::stream_config; +using A = layernorm2d_bwd_args; + +template +using trait_ = layernorm2d_bwd_traits_; + +template +float layernorm2d_bwd_(const S& s, A a) +{ + using DataType = typename Traits_::DataType; + + using PipelineProblem = ck_tile::Layernorm2dBwdGammaBetaPipelineProblem< + typename LayerNormTypeConfig::XDataType, + typename LayerNormTypeConfig::GammaDataType, + typename LayerNormTypeConfig::BetaDataType, + typename LayerNormTypeConfig::ComputeDataType, + typename LayerNormTypeConfig::YDataType, + typename LayerNormTypeConfig::MeanDataType, + typename LayerNormTypeConfig::InvStdDataType, + typename Traits_::Shape, + Traits_::kPadN>; + + using Pipeline = ck_tile::Layernorm2dBwdGammaBetaPipeline; + + using Kernel = ck_tile::Layernorm2dBwdGammaBeta; + + const dim3 grids = Kernel::GridSize(a); + constexpr dim3 blocks = Kernel::BlockSize(); + constexpr ck_tile::index_t kBlockPerCu = 1; + + auto kargs = Kernel::MakeKargs(a); + if(s.log_level_ > 0) + std::cout << ", " << Kernel::GetName() << std::flush; + + return ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp b/example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp new file mode 100644 index 0000000000..72c41025ac --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp @@ -0,0 +1,179 @@ +#include "ck_tile/host.hpp" +#include "layernorm2d_bwd.hpp" +#include + +// different threshold for different dtype +template +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +template <> +auto get_elimit() +{ + double rtol = 1e-2; + double atol = 1e-2; + return ck_tile::make_tuple(rtol, atol); +} + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3328", "m dimension") + .insert("n", "4096", "n dimension") + .insert("stride", "-1", "stride per row, if -1 then equal to n") + .insert("v", "1", "cpu validation or not") + .insert("kname", "1", "print kernel name or not") + .insert("prec", "fp16", "precision") + .insert("warmup", "5", "cold iter") + .insert("repeat", "20", "hot iter"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +template +bool run(const ck_tile::ArgParser& arg_parser) +{ + ck_tile::index_t m = arg_parser.get_int("m"); + ck_tile::index_t n = arg_parser.get_int("n"); + ck_tile::index_t stride = arg_parser.get_int("stride"); + if(stride < 0) + stride = n; + std::string data_type = arg_parser.get_str("prec"); + int kname = arg_parser.get_int("kname"); + int do_validation = arg_parser.get_int("v"); + int warmup = arg_parser.get_int("warmup"); + int repeat = arg_parser.get_int("repeat"); + + assert(stride >= n); + + using TypeConfig = LayerNormTypeConfig; + + using XDataType = typename TypeConfig::XDataType; + using YDataType = typename TypeConfig::YDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using BetaDataType = typename TypeConfig::BetaDataType; + + using MeanDataType = typename TypeConfig::MeanDataType; + using InvStdDataType = typename TypeConfig::InvStdDataType; + + using ComputeDataType = typename TypeConfig::ComputeDataType; + + // host verify + ck_tile::HostTensor dy_host({m, n}, {stride, 1}); + ck_tile::HostTensor mean_host({m}); + ck_tile::HostTensor invStd_host({m}); + + ck_tile::HostTensor dgamma_host_dev({n}); + ck_tile::HostTensor dbeta_host_dev({n}); + ck_tile::HostTensor dgamma_host_ref({n}); + ck_tile::HostTensor dbeta_host_ref({n}); + + + ck_tile::FillUniformDistribution{-.5f, .5f}(dy_host); + // ck_tile::FillUniformDistribution{-.5f, .5f}(mean_host); + ck_tile::FillMonotonicSeq{}(mean_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(invStd_host); + + ck_tile::DeviceMem dy_buf(dy_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem mean_buf(mean_host.get_element_space_size_in_bytes()); + ck_tile::DeviceMem invStd_buf(invStd_host.get_element_space_size_in_bytes()); + + ck_tile::DeviceMem dgamma_buf(dgamma_host_dev.get_element_space_size_in_bytes()); + ck_tile::DeviceMem dbeta_buf(dbeta_host_dev.get_element_space_size_in_bytes()); + + dy_buf.ToDevice(dy_host.data()); + mean_buf.ToDevice(mean_host.data()); + invStd_buf.ToDevice(invStd_host.data()); + + std::cout << "[" << data_type << "]" + << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush; + + layernorm2d_bwd_traits traits{data_type}; + + layernorm2d_bwd_args args{dy_buf.GetDeviceBuffer(), + mean_buf.GetDeviceBuffer(), + invStd_buf.GetDeviceBuffer(), + dgamma_buf.GetDeviceBuffer(), + dbeta_buf.GetDeviceBuffer(), + nullptr, + m, + n, + stride}; + + float ave_time = layernorm2d_bwd( + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat}); + + std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n + + sizeof(BetaDataType) * n + sizeof(YDataType) * m * n; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + std::cout << sizeof(ComputeDataType) << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush; + + bool pass = true; + + if(do_validation) + { + // // reference + // ck_tile::reference_layernorm2d_bwd( + // x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon); + + // y_buf.FromDevice(y_host_dev.data()); + + // auto [rtol, atol] = get_elimit(); + // if(stride == n) + // { + // pass = ck_tile::check_err( + // y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); + // } + // else + // { + // for(int i_r = 0; i_r < m; i_r++) + // { + // std::vector y_host_dev_row(y_host_dev.begin() + i_r * stride, + // y_host_dev.begin() + i_r * stride + n); + // std::vector y_host_ref_row(y_host_ref.begin() + i_r * stride, + // y_host_ref.begin() + i_r * stride + n); + // pass &= ck_tile::check_err(y_host_dev_row, + // y_host_ref_row, + // std::string("OUT[") + std::to_string(i_r) + + // std::string("] Error: Incorrect results!"), + // rtol, + // atol); + // } + // } + + std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + const std::string data_type = arg_parser.get_str("prec"); + if(data_type == "fp16") + { + return run(arg_parser) ? 0 : -2; + } + else if(data_type == "bf16") + { + return run(arg_parser) ? 0 : -2; + } + + return -3; +} diff --git a/example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp b/example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp new file mode 100644 index 0000000000..8108ae5fc9 --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/layernorm2d.hpp" +#include + +template +struct LayerNormTypeConfig; + +template <> +struct LayerNormTypeConfig +{ + using XDataType = ck_tile::half_t; + using YDataType = ck_tile::half_t; + using GammaDataType = ck_tile::half_t; + using BetaDataType = ck_tile::half_t; + using MeanDataType = ck_tile::half_t; + using InvStdDataType = ck_tile::half_t; + using ComputeDataType = float; +}; + +template <> +struct LayerNormTypeConfig +{ + using XDataType = ck_tile::bf16_t; + using YDataType = ck_tile::bf16_t; + using GammaDataType = ck_tile::bf16_t; + using BetaDataType = ck_tile::bf16_t; + using MeanDataType = ck_tile::bf16_t; + using InvStdDataType = ck_tile::bf16_t; + using ComputeDataType = float; +}; + +// runtime args +struct layernorm2d_bwd_args : public ck_tile::Layernorm2dBwdGammaBetaHostArgs +{ +}; + +// this is used to pattern-match internl kernel implementation, not to instantiate kernel +template +struct layernorm2d_bwd_traits_ +{ + using DataType = ck_tile::remove_cvref_t; + + static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize; + static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0); + static constexpr ck_tile::index_t total_warps = + (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize; + + // num of warps along m + static constexpr ck_tile::index_t BlockWarps_M = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return total_warps * (warpSize / ThreadPerBlock_N_); + } + else + { + // static_assert(warpSize % ThreadPerBlock_M_ == 0); + return total_warps / (ThreadPerBlock_N_ / warpSize); + } + }(); + + // num of warps along n + static constexpr ck_tile::index_t BlockWarps_N = []() { + if constexpr(is_warp_per_row) + { + static_assert(warpSize % ThreadPerBlock_N_ == 0); + return 1; + } + else + { + static_assert(ThreadPerBlock_N_ % warpSize == 0); + return ThreadPerBlock_N_ / warpSize; + } + }(); + + static constexpr ck_tile::index_t Repeat_M = Repeat_M_; + + static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_; + static constexpr ck_tile::index_t Block_N = ThreadPerBlock_N_; + + static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M; + static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N; + + using BlockTile = ck_tile::sequence; + using BlockWarps = ck_tile::sequence; + using WarpTile = ck_tile::sequence; + using Vector = ck_tile::sequence<1, 1>; + + using Shape = ck_tile::Layernorm2dShape; + + static constexpr bool kPadN = kPadN_; +}; + +template +float layernorm2d_bwd_(const ck_tile::stream_config& s, layernorm2d_bwd_args a); + +// This is the public API, will be generated by script +struct layernorm2d_bwd_traits +{ + std::string data_type; +}; + +float layernorm2d_bwd(layernorm2d_bwd_traits, layernorm2d_bwd_args, const ck_tile::stream_config&); diff --git a/example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh b/example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh new file mode 100755 index 0000000000..663052aa5d --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh @@ -0,0 +1,38 @@ + +# run from top of ck folder +EXE=build/bin/tile_example_layernorm2d_bwd + +$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000 + +$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 +$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000 \ No newline at end of file diff --git a/example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh b/example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh new file mode 100755 index 0000000000..811f34cd1f --- /dev/null +++ b/example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# call from top of CK folder +EXE=./build/bin/tile_example_layernorm2d_bwd + +for pr_i in "fp16" "bf16" ; do +$EXE -prec=$pr_i -m=99 -n=13 +$EXE -prec=$pr_i -m=17 -n=16 +$EXE -prec=$pr_i -m=1 -n=100 +$EXE -prec=$pr_i -m=4 -n=128 +$EXE -prec=$pr_i -m=80 -n=127 +$EXE -prec=$pr_i -m=22 -n=255 -stride=256 +$EXE -prec=$pr_i -m=7 -n=599 +$EXE -prec=$pr_i -m=19 -n=512 +$EXE -prec=$pr_i -m=33 -n=313 -stride=1000 +$EXE -prec=$pr_i -m=11 -n=510 +$EXE -prec=$pr_i -m=171 -n=676 -stride=818 +$EXE -prec=$pr_i -m=91 -n=636 +$EXE -prec=$pr_i -m=12 -n=768 -stride=800 +$EXE -prec=$pr_i -m=100 -n=766 -stride=812 +$EXE -prec=$pr_i -m=31 -n=1024 +$EXE -prec=$pr_i -m=64 -n=1000 -stride=1004 +$EXE -prec=$pr_i -m=8 -n=1501 +$EXE -prec=$pr_i -m=3 -n=1826 +$EXE -prec=$pr_i -m=5 -n=2040 +$EXE -prec=$pr_i -m=7 -n=2734 +$EXE -prec=$pr_i -m=1 -n=3182 +$EXE -prec=$pr_i -m=9 -n=4096 +$EXE -prec=$pr_i -m=3 -n=8192 +$EXE -prec=$pr_i -m=1 -n=10547 +$EXE -prec=$pr_i -m=3 -n=17134 +done diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index ec4a175d35..a2bcd7ecd0 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -7,3 +7,4 @@ add_subdirectory(02_layernorm2d) add_subdirectory(03_gemm) add_subdirectory(04_img2col) add_subdirectory(05_reduce) +add_subdirectory(06_layernorm2d_bwd) diff --git a/include/ck_tile/ops/layernorm2d.hpp b/include/ck_tile/ops/layernorm2d.hpp index 2a403b0f49..f7f3d5a4b2 100644 --- a/include/ck_tile/ops/layernorm2d.hpp +++ b/include/ck_tile/ops/layernorm2d.hpp @@ -9,4 +9,11 @@ #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp" #include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp" + +#include "ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp" + #include "ck_tile/ops/common/tensor_layout.hpp" + diff --git a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp index 49410bffb8..d73c0f3f00 100644 --- a/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp +++ b/include/ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp @@ -104,15 +104,13 @@ struct Layernorm2dBwdGammaBeta auto surfix = [&] () { std::string n; if (kPadN) n += "_pn"; - if (kSaveMeanInvStd) n += "_mv"; - if (kTwoPass) n += "_2p"; return n; }(); #define _SS_ std::string #define _TS_ std::to_string - return _SS_("layernorm2d_fwd_") + _SS_(t2s::name) + "_" + + return _SS_("layernorm2d_bwd_") + _SS_(t2s::name) + "_" + _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" + _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" + - _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" + + _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" + _TS_(S_::Vector_M) + "x" + _TS_(1) + "_" + _SS_(Pipeline::name) + surfix; #undef _SS_ #undef _TS_ diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp index 0cc023ba45..910bc14238 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp @@ -4,8 +4,6 @@ #pragma once #include "ck_tile/core.hpp" -#include "ck_tile/ops/welford/block/block_welford_problem.hpp" -#include "ck_tile/ops/welford/block/block_welford.hpp" namespace ck_tile { @@ -41,6 +39,20 @@ struct Layernorm2dBwdGammaBetaPipelineDefaultPolicy sequence<0>>{}); } + // template + // CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution() + // { + // using S = typename Problem::BlockShape; + + // return make_static_tile_distribution( + // tile_distribution_encoding< + // sequence, + // tuple>, + // tuple, sequence<0, 1>>, + // tuple, sequence<2, 1>>, + // sequence<0>, + // sequence<0>>{}); + // } template CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_gamma_beta.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp similarity index 64% rename from include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_gamma_beta.hpp rename to include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp index 009536514c..4832de5dd2 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_gamma_beta.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp @@ -4,7 +4,7 @@ #pragma once #include "ck_tile/core.hpp" -#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_default_policy.hpp" +#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp" #include #include @@ -28,14 +28,32 @@ struct Layernorm2dBwdGammaBetaPipeline static constexpr bool kPadN = Problem::kPadN; static constexpr const char* name = []() { - return "bwd_gamma_beta" + return "bwd_gamma_beta"; }(); CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Policy::template GetSmemSize(); } + // template + // CK_TILE_DEVICE void dump(const DumpTensor_& x) const + // { + // constexpr auto I0 = number<0>{}; + // constexpr auto I1 = number<1>{}; + // constexpr auto spans = DumpTensor_::get_distributed_spans(); + + // sweep_tile_span(spans[I1], [&](auto i1) { + // sweep_tile_span(spans[I0], [&](auto i0) { + // constexpr auto in_dstr_idx = make_tuple(i0, i1); + // auto v = ck_tile::type_convert(x[in_dstr_idx]); + // index_t tid = + // (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; + // printf("%d %f\n", tid, v); + + // }); + // }); + // } template ()); + const auto dy_window = make_tile_window(dy_window_, + Policy::template MakeDyBlockTileDistribution()); const auto mean_window = make_tile_window( mean_window_, Policy::template MakeMeanBlockTileDistribution()); + const auto inv_std_window = make_tile_window( + inv_std_window_, Policy::template MakeMeanBlockTileDistribution()); // const auto gamma_window = make_tile_window( // gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution()); // const auto beta_window = make_tile_window( @@ -60,7 +80,23 @@ struct Layernorm2dBwdGammaBetaPipeline const auto dy = load_tile(dy_window); const auto mean = load_tile(mean_window); + const auto inv_std = load_tile(inv_std_window); + // auto y = make_static_distributed_tensor(dy.get_tile_distribution()); + sweep_tile(mean, [&](auto idx) { + constexpr auto i_idx = make_tuple(idx[number<0>{}]); + // constexpr auto j_idx = make_tuple(idx[number<1>{}]); + + index_t tid = (threadIdx.y * blockDim.x) + threadIdx.x; + const auto m = type_convert(mean[i_idx]); + if(blockIdx.x == 0 && blockIdx.y == 0) + printf("%d %f\n", tid, m); + + }); + // dump(dy); + // dump(mean); + // dump(inv_std); + *reinterpret_cast(smem) = row_size; // layernorm computation // auto y = make_static_distributed_tensor(x.get_tile_distribution()); diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp index 40fd8e80ba..2895b95137 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp @@ -15,9 +15,7 @@ template + bool kPadN_> struct Layernorm2dBwdGammaBetaPipelineProblem { using XDataType = remove_cvref_t; diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp index d8108f65a8..8e9f8e81e4 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp @@ -37,33 +37,4 @@ struct Layernorm2dFwdPipelineProblem static constexpr bool kTwoPass = kTwoPass_; }; -template -struct Layernorm2dFwdPipelineProblem -{ - using XDataType = remove_cvref_t; - using GammaDataType = remove_cvref_t; - using BetaDataType = remove_cvref_t; - using ComputeDataType = remove_cvref_t; - using YDataType = remove_cvref_t; - using MeanDataType = remove_cvref_t; - using InvStdDataType = remove_cvref_t; - using BlockShape = remove_cvref_t; - - static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1; - static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1; - - static constexpr bool kPadN = kPadN_; - static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; - static constexpr bool kTwoPass = kTwoPass_; -}; } // namespace ck_tile