mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-18 20:09:25 +00:00
* 50ms -> 28ms
* Fix bug in non fuse_add_store cases
* Fine tuned setting for 2 pass pipeline
* adjust workload
* remove unnecessary change
* add layernorm
* Adding output quant and unquant results at the same time.
* fix test
* fix format
* tune for cases 128x640 and 128x1024
* bug fix
[ROCm/composable_kernel commit: d49abdaa87]
13 lines
513 B
C++
13 lines
513 B
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp"
|
|
#include "ck_tile/ops/epilogue/default_2d_epilogue.hpp"
|
|
#include "ck_tile/ops/epilogue/dynamic_quant_epilogue.hpp"
|
|
#include "ck_tile/ops/epilogue/default_2d_and_dynamic_quant_epilogue.hpp"
|
|
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
|
|
#include "ck_tile/ops/common/tensor_layout.hpp"
|
|
#include "ck_tile/ops/common/utils.hpp"
|