From 1e4eebfba8da4fac3c0fa4a714cad1c445b23769 Mon Sep 17 00:00:00 2001 From: Sami Remes Date: Wed, 22 Apr 2026 13:52:02 +0300 Subject: [PATCH 1/3] [CK_TILE] Preserve input strides in EightWaves async-load descriptor (#6611) `MakeAsyncLoadADramWindow` in `GemmPipelineAgBgCrCompAsyncEightWavesPolicy` was rebuilding the 6D view descriptor with `make_naive_tensor_descriptor_packed`, which synthesizes strides from lengths and assumes a dense layout. When the input view's leading-dim stride is larger than its inner length (non-packed memory layout), the resulting tile window stepped through memory at the wrong stride. Compose the unmerge transforms on top of the input view's existing descriptor instead, so the actual runtime strides are preserved and the correct `element_space_size` is inherited for bounds checking. ## Test Plan Added a unit test showing the problem. ## Test Result The new test fails before the fix and passes after it. ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. 
--- ...ag_bg_cr_comp_async_eight_waves_policy.hpp | 13 +++++--- test/ck_tile/gemm_block_scale/CMakeLists.txt | 6 ++++ ...quant_abquant_eightwaves_padded_stride.cpp | 31 +++++++++++++++++++ .../test_gemm_quant_fixtures.hpp | 9 ++++-- 4 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves_padded_stride.cpp diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async_eight_waves_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async_eight_waves_policy.hpp index 29991197cd..1e1f525c3b 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async_eight_waves_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async_eight_waves_policy.hpp @@ -176,10 +176,15 @@ struct GemmPipelineAgBgCrCompAsyncEightWavesPolicy const index_t M0 = integer_divide_ceil(rows, M1); const auto row_lens = make_tuple(M0, number{}); - const auto d0 = make_naive_tensor_descriptor_packed(container_concat(row_lens, col_lens)); - const auto desc_0 = decltype(d0)( // set correct size (without padding) - d0.get_transforms(), - tensor_view_tmp.get_tensor_descriptor().get_element_space_size()); + // Build the 6D view by composing unmerge transforms on top of the + // input view's existing descriptor. This preserves the input's actual + // strides (so a non-packed leading-dim stride is honored) and inherits + // its element_space_size for bounds checking. 
+ const auto desc_0 = transform_tensor_descriptor( + tensor_view_tmp.get_tensor_descriptor(), + make_tuple(make_unmerge_transform(row_lens), make_unmerge_transform(col_lens)), + make_tuple(sequence<0>{}, sequence<1>{}), + make_tuple(sequence<0, 1>{}, sequence<2, 3, 4, 5>{})); const auto desc_1 = transform_tensor_descriptor( desc_0, make_tuple(make_pass_through_transform(M0), diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt index 9f77cf01d7..21d34f7b34 100644 --- a/test/ck_tile/gemm_block_scale/CMakeLists.txt +++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt @@ -86,6 +86,11 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12") ) target_compile_options(test_tile_gemm_quant_abquant_eightwaves PRIVATE ${TEST_GEMM_COMPILE_OPTIONS}) + add_gtest_executable(test_tile_gemm_quant_abquant_eightwaves_padded_stride + test_gemm_quant_abquant_eightwaves_padded_stride.cpp + ) + target_compile_options(test_tile_gemm_quant_abquant_eightwaves_padded_stride PRIVATE ${TEST_GEMM_COMPILE_OPTIONS}) + # ABQuant split-K tests add_gtest_executable(test_tile_gemm_quant_abquant_splitk_decode test_gemm_quant_abquant_splitk_decode.cpp @@ -281,6 +286,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12") test_tile_gemm_quant_abquant_a4w4_padding test_tile_gemm_quant_abquant_a4w4_preshuffle test_tile_gemm_quant_abquant_eightwaves + test_tile_gemm_quant_abquant_eightwaves_padded_stride # ABQuant split-K tests test_tile_gemm_quant_abquant_splitk_decode test_tile_gemm_quant_abquant_splitk_prefill diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves_padded_stride.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves_padded_stride.cpp new file mode 100644 index 0000000000..28b7811af3 --- /dev/null +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_abquant_eightwaves_padded_stride.cpp @@ -0,0 +1,31 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +// Regression test for the EightWaves ABQuant pipeline on a B tensor whose +// leading-dim stride is larger than the packed value. The async B-load +// descriptor in the EightWaves policy must be built from the input view's +// real strides so that the kernel addresses B correctly when stride_B is +// larger than the inner length (e.g. row-aligned weight padding). + +#include "test_gemm_quant_common.hpp" + +using GroupSize2D128N = ck_tile::QuantGroupShape>; +#ifdef CK_GFX950_SUPPORT +// Tuple format: +// clang-format off +using ABQuantEightWavesPaddedStrideTypes = ::testing::Types< + std::tuple +>; +// clang-format on + +TYPED_TEST_SUITE(TestCkTileGemmABQuant, ABQuantEightWavesPaddedStrideTypes); + +TYPED_TEST(TestCkTileGemmABQuant, ABQuantGroupedPaddedBStrideTest) +{ + // 256-byte row alignment for FP8 -> 256 elements of leading-dim padding. + constexpr ck_tile::index_t k_batch = 1; + constexpr ck_tile::index_t stride_B_pad = 256; + this->run_test_with_validation(1024, 1024, 1024, k_batch, stride_B_pad); +} +#endif diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp index 8fbda4a3ce..e5731c5caa 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp @@ -1038,12 +1038,17 @@ class TestCkTileGemmABQuant : public TestCkTileGemmQuantBaseis_row_major(ALayout{})); + // stride_B_pad lets a test exercise a B tensor whose leading-dim stride is + // larger than the packed value (e.g. row-aligned padding). The host tensor, + // device buffer, and kernel args are all built with this padded stride so + // the kernel must honor the runtime stride to address B correctly. 
const ck_tile::index_t stride_B = - ck_tile::get_default_stride(K, N, 0, this->is_row_major(BLayout{})); + ck_tile::get_default_stride(K, N, 0, this->is_row_major(BLayout{})) + stride_B_pad; const ck_tile::index_t stride_C = ck_tile::get_default_stride(M, N, 0, this->is_row_major(CLayout{})); From cfb09d76a509baf9665e965909bc8fbc9cc9d37b Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 22 Apr 2026 08:47:47 -0700 Subject: [PATCH 2/3] [CK] Fix/suppress clang lifetimebound warnings with staging compiler. (#6550) ## Motivation New changes from upstream llvm-project cause an avalanche of warnings in CK. Gonna disable them by ignoring the lifetime-safety-intra-tu-suggestions flag until a better permanent solution is found. ## Technical Details ## Test Plan ## Test Result ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --- .../grouped_gemm_multiple_d_xdl_fp16.cpp | 5 +++++ .../grouped_gemm_bias_e_permute_xdl_fp16.cpp | 4 ++++ .../run_batched_gemm_bias_e_permute_example.inc | 3 +++ .../sparse_embedding3_forward_layernorm.cpp | 4 ++++ example/39_permute/common.hpp | 4 ++++ .../splitk_gemm_bias_e_permute_xdl_fp16.cpp | 4 ++++ .../splitk_gemm_bias_e_permute_xdl_fp32.cpp | 4 ++++ .../03_gemm/gemm_splitk_two_stage_reduce.cpp | 5 +++++ .../gpu/device/device_grouped_gemm.hpp | 5 +++++ ...batched_gemm_gemm_wmma_cshuffle_v3_common.hpp | 4 ++++ .../device_batched_gemm_gemm_xdl_cshuffle.hpp | 5 +++++ ...vice_batched_gemm_reduce_wmma_cshuffle_v3.hpp | 4 ++++ .../device_batched_gemm_reduce_xdl_cshuffle.hpp | 5 +++++ ...d_gemm_softmax_gemm_permute_wmma_cshuffle.hpp | 4 ++++ ...ed_gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 5 +++++ ...ce_batched_gemm_softmax_gemm_xdl_cshuffle.hpp | 5 +++++ .../gpu/device/impl/device_batched_gemm_xdl.hpp | 5 +++++ .../impl/device_batchnorm_backward_impl.hpp | 5 +++++ .../impl/device_batchnorm_forward_impl.hpp | 
5 +++++ .../device_batchnorm_forward_impl_obsolete.hpp | 5 +++++ .../impl/device_cgemm_4gemm_xdl_cshuffle.hpp | 5 +++++ ...huffle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 5 +++++ ..._c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 5 +++++ ...e_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 5 +++++ .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 5 +++++ ...device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp | 5 +++++ .../device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp | 4 ++++ .../device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp | 6 ++++++ ...ice_gemm_bias_add_reduce_wmma_cshuffle_v3.hpp | 4 ++++ .../device_gemm_bias_add_reduce_xdl_cshuffle.hpp | 5 +++++ .../impl/device_gemm_reduce_wmma_cshuffle_v3.hpp | 4 ++++ .../impl/device_gemm_reduce_xdl_cshuffle.hpp | 5 +++++ .../impl/device_gemm_wmma_cshuffle_v3r1.hpp | 4 ++++ .../impl/device_gemm_xdl_layernorm_cshuffle.hpp | 5 +++++ .../device/impl/device_gemm_xdl_skip_b_lds.hpp | 5 +++++ .../device_gemm_xdl_waveletmodel_cshuffle.hpp | 5 +++++ .../impl/device_grouped_conv_bwd_weight_dl.hpp | 4 ++++ ...vice_grouped_query_attention_forward_wmma.hpp | 4 ++++ .../device/impl/device_image_to_column_impl.hpp | 5 +++++ ...device_multi_query_attention_forward_wmma.hpp | 4 ++++ .../impl/device_normalization_bwd_data_impl.hpp | 5 +++++ .../device_normalization_bwd_gamma_beta_impl.hpp | 5 +++++ .../impl/device_normalization_fwd_impl.hpp | 5 +++++ .../device/impl/device_pool2d_fwd_nhwc_nhwc.hpp | 5 +++++ .../impl/device_pool3d_fwd_ndhwc_ndhwc.hpp | 5 +++++ .../gpu/device/impl/device_put_element_impl.hpp | 5 +++++ .../gpu/device/impl/device_reduce_threadwise.hpp | 5 +++++ .../impl/device_reduce_threadwise_multi_d.hpp | 5 +++++ ...evice_sparse_embeddings_forward_layernorm.hpp | 5 +++++ .../grid/epilogue_cshuffle_v3_welford_wmma.hpp | 4 ++++ .../gpu/grid/gridwise_gemm_dpp.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp | 4 ++++ .../gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp | 4 ++++ .../grid/gridwise_gemm_xdl_cshuffle_conv_v3.hpp | 5 +++++ 
.../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 6 ++++++ ...ridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp | 4 ++++ .../gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp | 5 +++++ .../gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp | 5 +++++ ...ise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp | 4 ++++ ...gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp | 5 +++++ ...huffle_v3_multi_d_blockscale_b_preshuffle.hpp | 4 ++++ .../grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp | 4 ++++ ...dwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp | 4 ++++ ...idwise_gemm_xdlops_splitk_lds_direct_load.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_xdlops_streamk.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp | 5 +++++ .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp | 5 +++++ .../gpu/grid/gridwise_moe_gemm.hpp | 4 ++++ .../gpu/grid/gridwise_moe_gemm_blockscale.hpp | 4 ++++ .../gpu/grid/gridwise_moe_mx_gemm.hpp | 4 ++++ .../gpu/grid/gridwise_moe_mx_gemm_bns.hpp | 4 ++++ .../grid/gridwise_moe_mx_gemm_bpreshuffle.hpp | 4 ++++ .../thread/threadwise_tensor_slice_transfer.hpp | 5 +++++ include/ck/utility/c_style_pointer_cast.hpp | 4 ++-- include/ck/utility/dynamic_buffer.hpp | 8 +++++++- include/ck/utility/span.hpp | 5 +++++ include/ck/utility/tuple.hpp | 8 +++++++- include/ck/utility/workgroup_barrier.hpp | 5 +++++ include/ck_tile/core/arch/mma/mma_pipeline.hpp | 5 +++++ include/ck_tile/core/arch/mma/mma_transforms.hpp | 2 +- include/ck_tile/core/arch/workgroup_barrier.hpp | 2 +- include/ck_tile/core/container/span.hpp | 4 ++++ include/ck_tile/core/tensor/buffer_view.hpp | 16 ++++++++++++---- include/ck_tile/core/utility/type_traits.hpp | 2 +- .../kernel/batched_contraction_kernel.hpp | 5 +++++ .../ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp | 5 +++++ .../ops/flatmm/kernel/grouped_flatmm_kernel.hpp | 5 +++++ 
.../ops/flatmm/kernel/moe_flatmm_kernel.hpp | 5 +++++ .../ops/fmha/block/page_block_navigator.hpp | 5 +++++ include/ck_tile/ops/fmha/block/variants.hpp | 5 +++++ .../ops/fused_moe/kernel/moe_sorting_kernel.hpp | 5 +++++ include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp | 5 +++++ .../ops/gemm/kernel/gemm_multi_abd_kernel.hpp | 5 +++++ .../ops/gemm/kernel/gemm_multi_d_kernel.hpp | 5 +++++ .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 5 +++++ .../ops/gemm/kernel/universal_gemm_kernel.hpp | 2 +- .../ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp | 4 ++++ .../gemm_quant/block/block_gemm_quant_common.hpp | 4 ++++ .../ops/gemm_quant/kernel/gemm_quant_kernel.hpp | 4 ++++ .../kernel/grouped_gemm_quant_kernel.hpp | 4 ++++ .../utils/grouped_convolution_utils.hpp | 5 +++++ .../ck_tile/ops/pooling/kernel/pool_kernel.hpp | 5 +++++ .../cpu/reference_avgpool_bwd.hpp | 5 +++++ .../cpu/reference_batched_gemm.hpp | 4 ++++ .../cpu/reference_cgemm.hpp | 5 +++++ .../cpu/reference_column_to_image.hpp | 5 +++++ .../cpu/reference_contraction.hpp | 5 +++++ .../cpu/reference_conv_bwd_data.hpp | 5 +++++ .../cpu/reference_conv_bwd_weight.hpp | 5 +++++ .../cpu/reference_conv_fwd.hpp | 5 +++++ .../cpu/reference_conv_fwd_bias_activation.hpp | 5 +++++ .../reference_conv_fwd_bias_activation_add.hpp | 5 +++++ .../cpu/reference_elementwise.hpp | 5 +++++ .../cpu/reference_fpAintB_gemm.hpp | 4 ++++ .../cpu/reference_gemm.hpp | 5 +++++ .../cpu/reference_gemm_layernorm.hpp | 5 +++++ .../cpu/reference_gemm_multi_abd.hpp | 5 +++++ .../cpu/reference_gemm_multiple_d.hpp | 5 +++++ .../cpu/reference_groupnorm.hpp | 5 +++++ .../cpu/reference_groupnorm_bwd.hpp | 5 +++++ .../cpu/reference_image_to_column.hpp | 5 +++++ .../cpu/reference_layernorm.hpp | 5 +++++ .../cpu/reference_layernorm_bwd.hpp | 5 +++++ .../cpu/reference_maxpool_bwd.hpp | 5 +++++ .../cpu/reference_moe_gemm.hpp | 4 ++++ .../cpu/reference_moe_gemm1_blockscale.hpp | 4 ++++ .../reference_moe_gemm1_blockscale_splitk.hpp | 4 ++++ 
.../cpu/reference_moe_gemm2.hpp | 4 ++++ .../cpu/reference_moe_gemm2_blockscale.hpp | 4 ++++ .../cpu/reference_moe_mx_gemm1.hpp | 4 ++++ .../cpu/reference_moe_mx_gemm2.hpp | 4 ++++ .../cpu/reference_mx_gemm.hpp | 4 ++++ .../cpu/reference_pool_fwd.hpp | 5 +++++ ...rence_sparse_embedding3_forward_layernorm.hpp | 5 +++++ tile_engine/ops/common/utils.hpp | 5 +++-- tile_engine/ops/gemm/gemm_benchmark.hpp | 3 ++- 139 files changed, 639 insertions(+), 15 deletions(-) diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index 9fdcf4aaad..3dde25776e 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -23,6 +23,9 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + using ::ck::DeviceMem; using ::ck::hip_check_error; using ::ck::HostTensorDescriptor; @@ -74,3 +77,5 @@ using DeviceGemmInstance = #include "run_grouped_gemm_multiple_d_example.inc" int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } + +#pragma clang diagnostic pop diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp index 82d02fc399..40d61d7174 100644 --- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp @@ -18,6 +18,9 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/numeric.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + using ::ck::DeviceMem; using ::ck::HostTensorDescriptor; using 
::ck::make_ParallelTensorFunctor; @@ -478,3 +481,4 @@ int main(int argc, char* argv[]) return pass ? 0 : 1; } +#pragma clang diagnostic pop diff --git a/example/29_batched_gemm_bias_e_permute/run_batched_gemm_bias_e_permute_example.inc b/example/29_batched_gemm_bias_e_permute/run_batched_gemm_bias_e_permute_example.inc index 803c1eb0bf..0f49ea0979 100644 --- a/example/29_batched_gemm_bias_e_permute/run_batched_gemm_bias_e_permute_example.inc +++ b/example/29_batched_gemm_bias_e_permute/run_batched_gemm_bias_e_permute_example.inc @@ -1,3 +1,5 @@ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" // hardcoded for NumDimM == NumDimN == NumDimK == 2 template & src, const Axes& axes, Functor functor, Ten return true; } +#pragma clang diagnostic pop diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp index ad40f84201..ca9ac02648 100644 --- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp @@ -16,6 +16,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + using ::ck::DeviceMem; using ::ck::HostTensorDescriptor; using ::ck::make_ParallelTensorFunctor; @@ -419,3 +422,4 @@ int main(int argc, char* argv[]) return 0; } +#pragma clang diagnostic pop diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp index a0905d6505..29cfc418b8 100644 --- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp +++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp @@ -16,6 +16,9 @@ 
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + using ::ck::DeviceMem; using ::ck::HostTensorDescriptor; using ::ck::make_ParallelTensorFunctor; @@ -419,3 +422,4 @@ int main(int argc, char* argv[]) return 0; } +#pragma clang diagnostic pop diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp index 64305b85cf..5f96155c04 100644 --- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp +++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp @@ -16,6 +16,9 @@ #include "gemm_utils.hpp" #include "run_gemm_example.inc" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + /** * @brief Tile partitioner with output offset support. * @@ -961,3 +964,5 @@ int main(int argc, char* argv[]) } return EXIT_SUCCESS; } + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp index 8085e0aeea..b2d1120d25 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp @@ -12,6 +12,9 @@ #include "device_base.hpp" #include "ck/utility/ignore.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace device { @@ -178,3 +181,5 @@ struct DeviceGroupedGemm : public BaseOperator } // namespace device } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3_common.hpp index 
a739af898f..d3574ad417 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3_common.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_wmma_cshuffle_v3_common.hpp @@ -18,6 +18,9 @@ #include "ck/utility/scheduler_enum.hpp" #include "ck/utility/integral_constant.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace device { @@ -900,3 +903,4 @@ struct DeviceGemmGemm_Wmma_CShuffleV3_Common_Invoker_Arg } // namespace device } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp index f63b20b0f0..b9a04b9686 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp @@ -18,6 +18,9 @@ #include "ck/host_utility/kernel_launch.hpp" #include "ck/host_utility/io.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace device { @@ -779,3 +782,5 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm @@ -692,3 +695,5 @@ struct GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp index 7818074b7f..3d13ae6585 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp @@ -19,6 +19,9 @@ #include 
"ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { /// @brief \"Universal\" GEMM kernel with SplitK support. @@ -1096,3 +1099,4 @@ struct GridwiseGemm_wmma_cshuffle_v3 }; } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp index 92561d00d4..d4a8bcb537 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_ab_scale.hpp @@ -16,6 +16,9 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { template @@ -938,3 +941,5 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp index f765662904..f44af8eaec 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp @@ -15,6 +15,9 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { template @@ -1019,3 +1022,5 @@ struct GridwiseGemm_xdl_cshuffle_v2 }; } // namespace ck + +#pragma clang diagnostic 
pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index d926efab84..23ee2e0ac7 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -16,6 +16,9 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1577,3 +1580,6 @@ struct GridwiseGemm_xdl_cshuffle_v3 }; } // namespace ck + +#pragma clang diagnostic pop + diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp index a81679ea78..91a8e4b22d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp @@ -15,6 +15,9 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1412,3 +1415,4 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle }; } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp index f9be9e494b..d7c8042a73 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp @@ -15,6 +15,9 @@ #include "ck/utility/env.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1585,3 +1588,5 @@ struct GridwiseGemm_xdl_cshuffle_v3 }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp index 671cfe4967..21172ffcbf 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp @@ -18,6 +18,9 @@ #define DEBUG_LOG 0 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1822,3 +1825,5 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3 }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp index 54260d4386..a0ffab38ed 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp @@ -17,6 +17,9 @@ #define DEBUG_LOG 0 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant 
way to put single lds buffer & double lds buffer pipe in same @@ -1284,3 +1287,4 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3 }; } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp index 28bcf14cd0..f7131f235b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp @@ -17,6 +17,9 @@ #define DEBUG_LOG 0 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1474,3 +1477,5 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp index fa0f401743..8bd4a35a25 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle.hpp @@ -17,6 +17,9 @@ #define DEBUG_LOG 0 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1550,3 +1553,4 @@ struct GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle }; } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp index fa231c9b02..96a2f3c2e7 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp @@ -17,6 +17,9 @@ #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1749,3 +1752,4 @@ struct GridwiseGemmMX_xdl_cshuffle_v3 }; } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp index 43a46d6ff4..2161b58641 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp @@ -16,6 +16,9 @@ #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // Currently we do not have a elegant way to put single lds buffer & double lds buffer pipe in same @@ -1795,3 +1798,4 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle }; } // namespace ck +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp index 3134096899..b81e9254aa 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_splitk_lds_direct_load.hpp @@ -20,6 +20,9 @@ #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { template @@ -1172,3 +1175,5 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp index adb653e7d4..df560a4c0e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp @@ -16,6 +16,9 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_common.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { template && is_pointer_v, bool>::type = false> -__host__ __device__ PY c_style_pointer_cast(PX p_x) +__host__ __device__ PY c_style_pointer_cast([[clang::lifetimebound]] PX p_x) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wold-style-cast" diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index ce4c92425e..13a946a247 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -15,6 +15,9 @@ #include "amd_transpose_load.hpp" #include "generic_memory_space_atomic.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { // T may be scalar or vector @@ -46,7 +49,8 @@ struct DynamicBuffer return 1; }(); - 
__host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size) + __host__ __device__ constexpr DynamicBuffer([[clang::lifetimebound]] T* p_data, + ElementSpaceSize element_space_size) : p_data_{p_data}, element_space_size_{element_space_size} { } @@ -498,3 +502,5 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element } } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/utility/span.hpp b/include/ck/utility/span.hpp index c0e68c95f4..eef376878c 100644 --- a/include/ck/utility/span.hpp +++ b/include/ck/utility/span.hpp @@ -7,6 +7,9 @@ #include #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { template @@ -65,3 +68,5 @@ class span }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/utility/tuple.hpp b/include/ck/utility/tuple.hpp index 16cd35e1d6..038a81b992 100644 --- a/include/ck/utility/tuple.hpp +++ b/include/ck/utility/tuple.hpp @@ -9,6 +9,9 @@ #include "ck/utility/enable_if.hpp" #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace detail { @@ -43,7 +46,7 @@ struct TupleElementKeyData // for read access of tuple element template __host__ __device__ constexpr const Data& -get_tuple_element_data_reference(const TupleElementKeyData& x) +get_tuple_element_data_reference([[clang::lifetimebound]] const TupleElementKeyData& x) { return static_cast(x.mData); } @@ -100,6 +103,7 @@ struct TupleImpl, Xs...> : TupleElementKeyData __host__ __device__ constexpr const auto& GetElementDataByKey(TupleElementKey) const + [[clang::lifetimebound]] { return get_tuple_element_data_reference>(*this); } @@ -268,3 +272,5 @@ template using tuple_element_or_t = typename detail::tuple_element_or_impl::type; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck/utility/workgroup_barrier.hpp 
b/include/ck/utility/workgroup_barrier.hpp index 0be341da88..b77c98d709 100644 --- a/include/ck/utility/workgroup_barrier.hpp +++ b/include/ck/utility/workgroup_barrier.hpp @@ -5,6 +5,9 @@ #include #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { struct workgroup_barrier { @@ -60,3 +63,5 @@ struct workgroup_barrier uint32_t* base_ptr; }; } // namespace ck + +#pragma clang diagnostic pop diff --git a/include/ck_tile/core/arch/mma/mma_pipeline.hpp b/include/ck_tile/core/arch/mma/mma_pipeline.hpp index f2bde7a686..0f497a6ce6 100644 --- a/include/ck_tile/core/arch/mma/mma_pipeline.hpp +++ b/include/ck_tile/core/arch/mma/mma_pipeline.hpp @@ -9,6 +9,9 @@ #include "mma_traits.hpp" #include "mma_transforms.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile::core::arch::mma { /*! @enum MmaPipelineOptionFlag @@ -341,3 +344,5 @@ concept MmaPipelineInterface = std::derived_from - CK_TILE_DEVICE static decltype(auto) exec(VecType&& v) + CK_TILE_DEVICE static decltype(auto) exec([[clang::lifetimebound]] VecType&& v) { return std::forward(v); } diff --git a/include/ck_tile/core/arch/workgroup_barrier.hpp b/include/ck_tile/core/arch/workgroup_barrier.hpp index 2560fe501c..64a6e63a40 100644 --- a/include/ck_tile/core/arch/workgroup_barrier.hpp +++ b/include/ck_tile/core/arch/workgroup_barrier.hpp @@ -10,7 +10,7 @@ namespace ck_tile { struct workgroup_barrier { - CK_TILE_DEVICE workgroup_barrier(uint32_t* ptr) : base_ptr(ptr) {} + CK_TILE_DEVICE workgroup_barrier([[clang::lifetimebound]] uint32_t* ptr) : base_ptr(ptr) {} CK_TILE_DEVICE uint32_t ld(uint32_t offset = 0) { diff --git a/include/ck_tile/core/container/span.hpp b/include/ck_tile/core/container/span.hpp index 4cce87eb6f..e0a8768590 100644 --- a/include/ck_tile/core/container/span.hpp +++ b/include/ck_tile/core/container/span.hpp @@ -8,6 +8,9 @@ #include #include 
+#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { // implement the c++20 std::span, lightweight, non-owning reference to a sequence @@ -76,3 +79,4 @@ class span }; } // namespace ck_tile +#pragma clang diagnostic pop diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 1705f5b0f2..0a6a1fcba9 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -17,6 +17,9 @@ #include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/utility/ignore.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { // T may be scalar or vector @@ -240,7 +243,8 @@ struct buffer_view -CK_TILE_HOST_DEVICE constexpr auto make_buffer_view(T* __restrict__ p, BufferSizeType buffer_size) +CK_TILE_HOST_DEVICE constexpr auto make_buffer_view([[clang::lifetimebound]] T* __restrict__ p, + BufferSizeType buffer_size) { return buffer_view{p, buffer_size}; } @@ -1325,3 +1331,5 @@ CK_TILE_HOST_DEVICE void print(const buffer_view && std::is_pointer_v, bool>::type = false> -CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x) +CK_TILE_HOST_DEVICE PY c_style_pointer_cast([[clang::lifetimebound]] PX p_x) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wold-style-cast" diff --git a/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp b/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp index 968d5d6ac2..8e7ea3ce09 100644 --- a/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp +++ b/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp @@ -8,6 +8,9 @@ #include "ck_tile/ops/batched_contraction/utils/tensor_descriptor_utils.hpp" #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp" +#pragma clang diagnostic push 
+#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + /** * @file batched_contraction_kernel.hpp * @brief Batched Tensor Contraction Operations @@ -687,3 +690,5 @@ struct BatchedContractionKernel }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp index bd98918b90..f051188442 100644 --- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp @@ -10,6 +10,9 @@ #include "ck_tile/ops/common.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { struct FlatmmProblem { @@ -970,3 +973,5 @@ struct FlatmmKernel }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp index ff96139f18..b4bc5ac38e 100644 --- a/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp @@ -10,6 +10,9 @@ #include "ck_tile/ops/common.hpp" #include "ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { template , @@ -468,3 +471,5 @@ struct GroupedFlatmmKernel : FlatmmKernel* } } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/fmha/block/variants.hpp b/include/ck_tile/ops/fmha/block/variants.hpp index b6f79873b4..a33da8c9f6 100644 --- a/include/ck_tile/ops/fmha/block/variants.hpp +++ b/include/ck_tile/ops/fmha/block/variants.hpp @@ -19,6 +19,9 @@ #define CK_TILE_ATTENTION_USE_SOFTSIGN_ASM 0 #endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored 
"-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { namespace internal { __device__ inline float @@ -333,3 +336,5 @@ struct ComposedAttention }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index 06ab134f85..07eda483d2 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -10,6 +10,9 @@ #include #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + #if !defined(CK_TILE_HAS_ROW_NEWBCAST) // row_newbcast (DPP modifier 0x157) support by architecture: // - Not supported: gfx908 (MI100) and older @@ -3125,3 +3128,5 @@ struct MoeSortingMultiPhaseKernel_P23 #undef MOE_SORTING_MOCK_ID } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp index d113336a3e..4ee97fb902 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp @@ -15,6 +15,9 @@ #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp" #include "ck_tile/core/utility/type_traits.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { /// @brief The GEMM kernel host arguments. 
@@ -168,3 +171,5 @@ struct GemmKernel } }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp index 9fc8ef83c3..84eb3da752 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp @@ -15,6 +15,9 @@ #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp" #include "ck_tile/core/utility/type_traits.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { /// @brief The MultiABD GEMM kernel host arguments. @@ -195,3 +198,5 @@ struct GemmKernelMultiABD } }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp b/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp index 6360e868e5..8db4b3284c 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp @@ -15,6 +15,9 @@ #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp" #include "ck_tile/core/utility/type_traits.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { /// @brief The MultiD GEMM kernel host arguments. @@ -190,3 +193,5 @@ struct GemmKernelMultiD } }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index bb054eeaa3..eeeff930a8 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -14,6 +14,9 @@ #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { /// @brief The Grouped GEMM kernel host arguments. 
@@ -575,3 +578,5 @@ struct GroupedGemmKernel }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp index d5ba324326..44745d15ee 100644 --- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp @@ -36,7 +36,7 @@ struct UniversalGemmHostArgs const std::array& as_ptr_, const std::array& bs_ptr_, const std::array& ds_ptr_, - void* e_ptr_, + [[clang::lifetimebound]] void* e_ptr_, index_t k_batch_, index_t M_, index_t N_, diff --git a/include/ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp b/include/ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp index 204372c036..b9597c7517 100644 --- a/include/ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp +++ b/include/ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp @@ -5,6 +5,9 @@ #include "ck_tile/core.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { template @@ -111,3 +114,4 @@ struct MXScalePointer }; } // namespace ck_tile +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp b/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp index fcf1261754..89d2cd59ea 100644 --- a/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp +++ b/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp @@ -5,6 +5,9 @@ #include "ck_tile/core.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { // Common utilities for quantized GEMM block operations @@ -224,3 +227,4 @@ struct AQPickerCommon : public BlockGemmQuantBase float scale_reg_f = 0.0f; }; } // namespace ck_tile +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp 
b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp index 5e7fb0e4da..8396a3e0eb 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp @@ -14,6 +14,9 @@ #include "ck_tile/host/concat.hpp" #include "ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { namespace detail { @@ -1574,3 +1577,4 @@ struct QuantGemmKernel }; } // namespace ck_tile +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp index 8b77b01e2f..8cb589c67c 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp @@ -15,6 +15,9 @@ #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { /// @brief The Grouped GEMM kernel host arguments. 
@@ -646,3 +649,4 @@ struct QuantGroupedGemmKernel }; } // namespace ck_tile +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp index 2efb435d5b..ab24665a47 100644 --- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp +++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp @@ -7,6 +7,9 @@ #include "ck_tile/host/convolution_parameter.hpp" #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { enum class GroupedConvDirection @@ -261,3 +264,5 @@ CK_TILE_HOST SplitImagePieceInfo calculate_spatial_piece(ck_tile::index_t piece_ } } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp b/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp index 91be63b803..d9a9c9cdb8 100644 --- a/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp +++ b/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp @@ -8,6 +8,9 @@ #include "ck_tile/ops/common.hpp" #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck_tile { /// @brief Host arguments for pooling operations @@ -575,3 +578,5 @@ struct PoolKernel }; } // namespace ck_tile + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp index 16937fc7e8..0501430265 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp @@ -10,6 +10,9 @@ #include "ck/library/utility/host_tensor.hpp" +#pragma 
clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -352,3 +355,5 @@ struct ReferenceAvgPoolBwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp index 8c96f04930..8948954ed8 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp @@ -10,6 +10,9 @@ #include "ck/library/utility/host_tensor.hpp" #include +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -422,3 +425,4 @@ struct ReferenceBatchedGemm_GQA : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp index 01079bc5ba..cde4bf2585 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp @@ -10,6 +10,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -182,3 +185,5 @@ struct ReferenceCGemm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + 
+#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp index 459e2b52fc..9104b55892 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -366,3 +369,5 @@ struct ReferenceColumnToImage : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp index d73ceb1de5..b93e02c2f0 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_contraction.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -507,3 +510,5 @@ struct ReferenceBatchedContraction_G1_M3_N2_K1 : public ck::tensor_operation::de } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp index 24af36770c..62b44c7b77 
100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp @@ -10,6 +10,9 @@ #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -506,3 +509,5 @@ struct ReferenceConvBwdData : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp index 2e2dfeae46..de5fd48eff 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp @@ -10,6 +10,9 @@ #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -451,3 +454,5 @@ struct ReferenceConvBwdWeight : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp index 92115e6be4..1493379bf4 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp @@ -23,6 +23,9 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#pragma clang diagnostic 
push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -567,3 +570,5 @@ struct ReferenceConvFwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp index bd35fca181..a98a7fe29e 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp @@ -9,6 +9,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -190,3 +193,5 @@ struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp index aa6500185b..17bf92e490 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp @@ -9,6 +9,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace 
host { @@ -198,3 +201,5 @@ struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp index 25fcebe64b..a0f7784e4b 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp @@ -10,6 +10,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -108,3 +111,5 @@ struct ReferenceElementwise : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp index 1e5246313f..931728d5da 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp @@ -10,6 +10,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -145,3 +148,4 @@ struct ReferencefpAintBGemm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git 
a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 052cf8eb9d..7aecc8107d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -246,3 +249,5 @@ struct ReferenceGemm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp index b701733af0..deb665a0cb 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp @@ -7,6 +7,9 @@ #include #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -234,3 +237,5 @@ struct ReferenceGemmLayernorm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multi_abd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multi_abd.hpp index 2d766e621b..225da36c61 100644 --- 
a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multi_abd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multi_abd.hpp @@ -14,6 +14,9 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -192,3 +195,5 @@ struct ReferenceGemmMultiABD : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp index 24ddef739a..85e954dab1 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp @@ -10,6 +10,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -155,3 +158,5 @@ struct ReferenceGemmMultipleD : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp index ee2f664df1..1565663408 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp @@ -12,6 +12,9 @@ #include "ck/library/utility/host_tensor.hpp" #include 
"ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -210,3 +213,5 @@ struct ReferenceGroupnorm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp index 4d78da35c1..945955aae2 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp @@ -12,6 +12,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -230,3 +233,5 @@ struct ReferenceGroupnormBwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp index c982ccb575..430af54805 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp @@ -12,6 +12,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/numeric.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -330,3 +333,5 @@ struct 
ReferenceImageToColumn : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp index e72947e387..919d6020e4 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp @@ -12,6 +12,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -258,3 +261,5 @@ struct ReferenceLayernorm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp index d0e2730484..2eacce3276 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp @@ -12,6 +12,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -199,3 +202,5 @@ struct ReferenceLayernormBwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp 
b/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp index 7372e1132c..ab8ce8eb90 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp @@ -11,6 +11,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -110,3 +113,5 @@ struct ReferenceMaxPoolBwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp index 26a0607508..5b50643ffc 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -293,3 +296,4 @@ struct ReferenceMoeGemm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp index 4032260609..8fdea7d3c7 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp +++ 
b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -278,3 +281,4 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale_splitk.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale_splitk.hpp index 9d9b8a62f5..9ad504e064 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale_splitk.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale_splitk.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -230,3 +233,4 @@ struct ReferenceMoeGemm1BlockScaleSplitK : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp index 937ab82e80..0f4fffc7a1 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp @@ -12,6 +12,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include 
"ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -266,3 +269,4 @@ struct ReferenceMoeGemm2 : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2_blockscale.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2_blockscale.hpp index 8295fc8ca2..ff10d7fa16 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2_blockscale.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2_blockscale.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -246,3 +249,4 @@ struct ReferenceMoeGemm2BlockScale : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm1.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm1.hpp index 10bcc5aa8e..ecd48c6dc9 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm1.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm1.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -262,3 
+265,4 @@ struct ReferenceMoeMXGemm1 : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm2.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm2.hpp index a08c03d14b..8be787f6d9 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm2.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_mx_gemm2.hpp @@ -11,6 +11,9 @@ #include "ck/tensor_operation/gpu/device/device_base.hpp" #include "ck/library/utility/host_tensor.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -236,3 +239,4 @@ struct ReferenceMoeMXGemm2 : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp index 3930fcd7cd..a27ceb43f8 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp @@ -11,6 +11,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -234,3 +237,4 @@ struct ReferenceMXGemm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp 
b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp index 7fab05ad99..f6f3275126 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp @@ -14,6 +14,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -358,3 +361,5 @@ struct ReferencePoolingFwd : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp index 1ee471c95c..2b64e320cd 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp @@ -12,6 +12,9 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions" + namespace ck { namespace tensor_operation { namespace host { @@ -203,3 +206,5 @@ struct ReferenceSparseEmbedding3ForwardLayernorm : public device::BaseOperator } // namespace host } // namespace tensor_operation } // namespace ck + +#pragma clang diagnostic pop diff --git a/tile_engine/ops/common/utils.hpp b/tile_engine/ops/common/utils.hpp index 4a7c2d586b..df4037f6b5 100644 --- a/tile_engine/ops/common/utils.hpp +++ b/tile_engine/ops/common/utils.hpp @@ -72,7 +72,8 @@ struct KernelInstance }; template 
-std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) +std::ostream& operator<<([[clang::lifetimebound]] std::ostream& os, + const KernelInstance& obj) { os << "{\n" << " \"name\": \"" << obj.name_ << "\",\n" @@ -82,7 +83,7 @@ std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) return os; } -std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) +std::ostream& operator<<([[clang::lifetimebound]] std::ostream& os, const PerformanceResult& result) { os << "{\n" << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ << ",\n" diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp index 7439264a39..afe375e1d9 100644 --- a/tile_engine/ops/gemm/gemm_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_benchmark.hpp @@ -26,7 +26,8 @@ struct GemmProblem bool structured_sparsity_; - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) + friend std::ostream& operator<<([[clang::lifetimebound]] std::ostream& os, + const GemmProblem& problem) { os << "{\n" << " \"split_k\":" << problem.split_k_ << ",\n" From 58e802dcf17181b61d3a6767d02dd7d6dde2bd8d Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:05:11 -0700 Subject: [PATCH 3/3] Fix ck4inductor conv instance parsing for NumGroupsToMerge parameter (#6434) ## Summary - Add `num_groups_to_merge` field to `CKGroupedConvFwdOp` dataclass to match the new (#4273) `NumGroupsToMerge` template parameter added to `DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3` - Enable inductor tests by default in Jenkins CI ## Test plan - [x] Built wheel without patch: `test_gen_conv_instances` fails with `TypeError: takes from 47 to 50 positional arguments but 51 were given` - [x] Built wheel with patch: `test_gen_conv_instances` passes --- Jenkinsfile | 12 +++------- python/ck4inductor/grouped_conv_fwd/op.py | 1 + script/run_inductor_tests.sh | 
28 +++++++++++++++++++++++ 3 files changed, 32 insertions(+), 9 deletions(-) create mode 100755 script/run_inductor_tests.sh diff --git a/Jenkinsfile b/Jenkinsfile index f3bb013790..170e0bf432 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -915,13 +915,7 @@ def Build_CK(Map conf=[:]){ cmake_build(conf) if ( params.RUN_INDUCTOR_TESTS && arch == "gfx90a" ){ echo "Run inductor codegen tests" - sh """ - python3 -m venv ${env.WORKSPACE}/projects/composablekernel - . ${env.WORKSPACE}/projects/composablekernel/bin/activate - python3 -m pip install pytest build setuptools setuptools_scm - python3 -m pip install . - python3 -m pytest python/test/test_gen_instances.py - """ + sh "projects/composablekernel/script/run_inductor_tests.sh" } // run performance tests, stash the logs, results will be processed on the master node dir("projects/composablekernel/script"){ @@ -1338,8 +1332,8 @@ pipeline { description: "Generate a detailed time trace (default: OFF)") booleanParam( name: "RUN_INDUCTOR_TESTS", - defaultValue: false, - description: "Run inductor codegen tests (default: OFF)") + defaultValue: true, + description: "Run inductor codegen tests (default: ON)") booleanParam( name: "RUN_CODEGEN_TESTS", defaultValue: true, diff --git a/python/ck4inductor/grouped_conv_fwd/op.py b/python/ck4inductor/grouped_conv_fwd/op.py index 8301f0d07f..576c36f66d 100644 --- a/python/ck4inductor/grouped_conv_fwd/op.py +++ b/python/ck4inductor/grouped_conv_fwd/op.py @@ -67,6 +67,7 @@ class CKGroupedConvFwdOp: b_compute_dtype: Optional[str] = None direct_load: Optional[bool] = None + num_groups_to_merge: Optional[int] = None def name(self): # cpp alias for template instance diff --git a/script/run_inductor_tests.sh b/script/run_inductor_tests.sh new file mode 100755 index 0000000000..6fed9d2dce --- /dev/null +++ b/script/run_inductor_tests.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Run inductor codegen tests +# This script is called from Jenkinsfile to reduce pipeline bytecode size + +set -e + 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CK_DIR="$(dirname "$SCRIPT_DIR")" +VENV_DIR="${WORKSPACE:-/tmp}/ck-inductor-venv" +export UV_CACHE_DIR="${WORKSPACE:-/tmp}/.uv-cache" + +cd "$CK_DIR" + +echo "Setting up Python virtual environment at $VENV_DIR" +python3 -m venv "$VENV_DIR" +. "$VENV_DIR/bin/activate" + +echo "Installing uv for faster package installation" +pip install uv + +echo "Installing test dependencies" +uv pip install pytest build setuptools setuptools_scm + +echo "Installing ck4inductor package" +uv pip install . + +echo "Running inductor codegen tests" +python3 -m pytest python/test/test_gen_instances.py -v