mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-15 10:37:44 +00:00
* hack for cap logits
* fix bug
* Re-format files
* Allow specifying logits_soft_cap through APIs
* Support turn on/off logits_soft_cap in async pipeline
* Do not generate non-verified kernels
* Align receipt used in Aiter
* Sync logits soft-capping across pipelines
* Re-enable some hdim pipelines
* fix perf
* Add attention variant for logits_soft_cap
* Add newline at end-of-file
* Fix performance
* Add comment to explain logits_soft_cap pre-processing
* Unify code
* Unify floating-point literal style
* Use class data member to slience the compilation error
* [CK_TILE] Update attention customizaton interface: add LogitsMask() (#2133)
* Send 'mask' along with variant params to the LogitsMask()
* Send block indices to the variant
* Add indices parameters in variant interface
* Fix fmha bwd codegen error
* Allow switch logits_soft_cap impl
* Eliminate register spills
* Fix compilation errors
* Fix wrong LSE
* Fix LSE for splitkv kernel
* Sync splitkv pipeline changes
* Add batch_prefill kernel/pipeline
* Fix codegen error
* Undo changes in CMakeLists.txt
* Merge pipeline filtering check
* Use different code path if kHasLogitsSoftCap=false
* Remove [[maybe_unused]] attribute
* Use pre-existing compile-time flag to instantiate templates
* Sync pipeline changes
* Update CHANGELOG.md
---------
Co-authored-by: Bernard <bernaliu@amd.com>
Co-authored-by: coderfeli <coderfeli@163.com>
[ROCm/composable_kernel commit: 2920604786]
78 lines
3.6 KiB
C++
78 lines
3.6 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck_tile/core/algorithm/cluster_descriptor.hpp"
|
|
#include "ck_tile/core/algorithm/coordinate_transform.hpp"
|
|
#include "ck_tile/core/algorithm/indexing_adaptor.hpp"
|
|
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
|
|
#include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
|
|
#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
|
|
#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
|
|
#include "ck_tile/core/arch/arch.hpp"
|
|
#include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
|
|
#include "ck_tile/core/arch/utility.hpp"
|
|
#include "ck_tile/core/arch/workgroup_barrier.hpp"
|
|
#include "ck_tile/core/config.hpp"
|
|
#include "ck_tile/core/container/array.hpp"
|
|
#include "ck_tile/core/container/container_helper.hpp"
|
|
#include "ck_tile/core/container/map.hpp"
|
|
#include "ck_tile/core/container/meta_data_buffer.hpp"
|
|
#include "ck_tile/core/container/multi_index.hpp"
|
|
#include "ck_tile/core/container/sequence.hpp"
|
|
#include "ck_tile/core/container/span.hpp"
|
|
#include "ck_tile/core/container/statically_indexed_array.hpp"
|
|
#include "ck_tile/core/container/thread_buffer.hpp"
|
|
#include "ck_tile/core/container/tuple.hpp"
|
|
#include "ck_tile/core/numeric/bfloat16.hpp"
|
|
#include "ck_tile/core/numeric/float8.hpp"
|
|
#include "ck_tile/core/numeric/half.hpp"
|
|
#include "ck_tile/core/numeric/int8.hpp"
|
|
#include "ck_tile/core/numeric/integer.hpp"
|
|
#include "ck_tile/core/numeric/integral_constant.hpp"
|
|
#include "ck_tile/core/numeric/math.hpp"
|
|
#include "ck_tile/core/numeric/null_type.hpp"
|
|
#include "ck_tile/core/numeric/numeric.hpp"
|
|
#include "ck_tile/core/numeric/pk_int4.hpp"
|
|
#include "ck_tile/core/numeric/type_convert.hpp"
|
|
#include "ck_tile/core/numeric/vector_type.hpp"
|
|
#include "ck_tile/core/tensor/buffer_view.hpp"
|
|
#include "ck_tile/core/tensor/load_tile.hpp"
|
|
#include "ck_tile/core/tensor/null_tensor.hpp"
|
|
#include "ck_tile/core/tensor/null_tile_window.hpp"
|
|
#include "ck_tile/core/tensor/shuffle_tile.hpp"
|
|
#include "ck_tile/core/tensor/slice_tile.hpp"
|
|
#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
|
|
#include "ck_tile/core/tensor/store_tile.hpp"
|
|
#include "ck_tile/core/tensor/sweep_tile.hpp"
|
|
#include "ck_tile/core/tensor/tensor_adaptor.hpp"
|
|
#include "ck_tile/core/tensor/tensor_adaptor_coordinate.hpp"
|
|
#include "ck_tile/core/tensor/tensor_coordinate.hpp"
|
|
#include "ck_tile/core/tensor/tensor_descriptor.hpp"
|
|
#include "ck_tile/core/tensor/tensor_view.hpp"
|
|
#include "ck_tile/core/tensor/tile_distribution.hpp"
|
|
#include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
|
|
#include "ck_tile/core/tensor/tile_elementwise.hpp"
|
|
#include "ck_tile/core/tensor/tile_scatter_gather.hpp"
|
|
#include "ck_tile/core/tensor/tile_window.hpp"
|
|
#include "ck_tile/core/tensor/tile_window_linear.hpp"
|
|
#include "ck_tile/core/tensor/tile_window_utils.hpp"
|
|
#include "ck_tile/core/tensor/transpose_tile.hpp"
|
|
#include "ck_tile/core/tensor/update_tile.hpp"
|
|
#include "ck_tile/core/utility/bit_cast.hpp"
|
|
#include "ck_tile/core/utility/env.hpp"
|
|
#include "ck_tile/core/utility/functional.hpp"
|
|
#include "ck_tile/core/utility/functional_with_tuple.hpp"
|
|
#include "ck_tile/core/utility/ignore.hpp"
|
|
#include "ck_tile/core/utility/literals.hpp"
|
|
#include "ck_tile/core/utility/magic_div.hpp"
|
|
#include "ck_tile/core/utility/philox_rand.hpp"
|
|
#include "ck_tile/core/utility/random.hpp"
|
|
#include "ck_tile/core/utility/reduce_operator.hpp"
|
|
#include "ck_tile/core/utility/static_counter.hpp"
|
|
#include "ck_tile/core/utility/to_sequence.hpp"
|
|
#include "ck_tile/core/utility/transpose_vectors.hpp"
|
|
#include "ck_tile/core/utility/type_traits.hpp"
|
|
#include "ck_tile/core/utility/unary_element_function.hpp"
|