From e02c566795e4e00d3ee03f2a75f44d72e6da5754 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 21 May 2026 19:43:50 -0700 Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#7612 (commit 5427d24) [CK] upgrade CI to rocm7.13 as default compiler (#7612) ## Motivation Upgrade the default docker and compiler version in CI to rocm7.13. In order to pass all the checks I had to also clean up a lot of non-ascii characters in the source code comments and modify a couple of tests that were affected by a new compiler logic. ## Technical Details ## Test Plan ## Test Result ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --------- Co-authored-by: Aviral Goel --- Dockerfile | 31 +++------ Jenkinsfile | 37 ++++++++--- dispatcher/src/registry.cpp | 2 +- .../01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp | 2 +- example/26_contraction/common_instances.hpp | 2 +- ...uped_query_attention_forward_wmma_fp16.cpp | 2 +- ...ulti_query_attention_forward_wmma_fp16.cpp | 2 +- ...rouped_conv_bwd_data_bias_relu_example.inc | 2 +- .../run_grouped_conv_bwd_data_example.inc | 2 +- ...nd_bwd_data_xdl_bilinear_residual_fp16.cpp | 2 +- .../64_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 4 +- .../common_instances.hpp | 2 +- example/ck_tile/01_fmha/fmha_fwd.hpp | 4 +- ...ed_convolution_backward_weight_streamk.cpp | 2 +- example/ck_tile/42_mx_gemm/run_mx_gemm.inc | 2 +- .../builder/factory/conv_algorithms.hpp | 2 +- .../builder/factory/conv_dispatcher.hpp | 2 +- ...le_grouped_convolution_backward_weight.hpp | 4 +- ...raits_tile_grouped_convolution_forward.hpp | 10 +-- ...unit_instance_to_conv_traits_instances.cpp | 2 +- .../test/impl/conv_algorithm_types.hpp | 2 +- experimental/builder/test/testing_utils.cpp | 2 +- ...gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp | 12 ++-- .../arch/amd_buffer_addressing_builtins.hpp | 12 ++-- .../ck_tile/core/arch/amd_cluster_load.hpp | 14 ++-- include/ck_tile/core/container/sequence.hpp | 2 +- include/ck_tile/core/tensor/buffer_view.hpp | 6 +- include/ck_tile/core/tensor/load_tile.hpp | 2 +- .../core/tensor/load_tile_transpose.hpp | 6 +- .../core/tensor/load_tile_transpose.hpp.bk | 6 +- .../core/tensor/tile_scatter_gather.hpp | 2 +- include/ck_tile/core/tensor/tile_window.hpp | 8 +-- .../ck_tile/core/tensor/tile_window.hpp.bk | 8 +-- include/ck_tile/core/utility/debug.hpp | 2 +- include/ck_tile/core/utility/type_traits.hpp | 2 +- .../ck_tile/host/reference/reference_topk.hpp | 2 +- .../kernel/batched_contraction_kernel.hpp | 16 ++--- include/ck_tile/ops/common/streamk_common.hpp | 4 +- .../unary_element_wise_operation.hpp | 2 +- .../chainer/cshuffle_epilogue_schedule.hpp | 8 ++- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 6 +- ..._batch_prefill_pipeline_qr_ks_vs_async.hpp | 22 +++---- .../pipeline/block_fmha_fwd_v3_pipeline.hpp | 24 +++---- .../gemm/block/block_wp_asmem_breg_creg.hpp | 2 +- .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 14 ++-- .../gemm_pipeline_ag_bg_cr_comp_v3.hpp | 12 ++-- .../gemm_pipeline_ag_bg_cr_comp_v4.hpp | 16 ++--- .../gemm_pipeline_ag_bg_cr_comp_v5.hpp | 12 ++-- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 24 +++---- .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 24 +++---- .../gemm_pipeline_agmem_bgmem_creg_v2.hpp | 8 +-- .../ops/gemm/warp/warp_gemm_params.hpp | 2 +- .../block_universal_gemm_as_aquant_bs_cr.hpp | 2 +- .../gemm_quant/kernel/gemm_quant_kernel.hpp | 6 +- .../pipeline/gemm_group_quant_utils.hpp | 12 ++-- ...ouped_convolution_backward_data_kernel.hpp | 2 +- ...ped_convolution_backward_weight_kernel.hpp | 8 +-- .../grouped_convolution_forward_kernel.hpp | 2 +- .../utils/transform_conv_fwd_to_gemm.hpp | 6 +- .../ops/reduce/block/block_reduce2d.hpp | 2 +- ...ice_grouped_conv_bwd_data_xdl_instance.hpp | 14 ++-- ...d_conv_bwd_weight_wavelet_xdl_instance.hpp | 12 ++-- .../contraction_instance_common.hpp | 26 ++++---- ...ta_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp | 8 +-- ...ata_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 8 +-- ...ata_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp | 8 +-- rocm_ck/include/rocm_ck/args.hpp | 22 +++---- rocm_ck/include/rocm_ck/datatype.hpp | 12 ++-- rocm_ck/include/rocm_ck/fixed_string.hpp | 6 +- rocm_ck/include/rocm_ck/gpu_target.hpp | 2 +- rocm_ck/include/rocm_ck/index_t.hpp | 2 +- rocm_ck/include/rocm_ck/layout.hpp | 4 +- rocm_ck/include/rocm_ck/ops.hpp | 10 +-- rocm_ck/include/rocm_ck/physical_tensor.hpp | 4 +- rocm_ck/include/rocm_ck/resolved_tensor.hpp | 12 ++-- rocm_ck/tests/unit/unit_args.cpp | 2 +- .../src/selective_test_filter.py | 13 +++- .../mma/pipeline/test_amdgcn_scale_mma.cpp | 12 ++-- .../mma/pipeline/test_amdgcn_sparse_mma.cpp | 20 +++--- test/ck_tile/core/container/unit_sequence.cpp | 2 +- .../flatmm/test_mx_flatmm_fixtures.hpp | 2 +- test/ck_tile/fmha/test_fmha_fwd.cpp | 2 +- test/ck_tile/gemm_block_scale/CMakeLists.txt | 14 ---- ...gemm_quant_aquant_mem_decode_interwave.cpp | 27 -------- ...emm_quant_aquant_mem_prefill_interwave.cpp | 27 -------- .../test_gemm_quant_bquant_splitk_decode.cpp | 8 +-- .../test_gemm_quant_bquant_splitk_prefill.cpp | 8 +-- .../test_gemm_quant_fixtures.hpp | 26 +++----- .../test_mx_gemm_pipeline_wmma_base.hpp | 4 +- ...k_tile_grouped_conv_bwd_weight_streamk.cpp | 22 +++---- .../grouped_gemm/test_grouped_gemm_util.hpp | 4 +- .../test_cluster_load_async_to_lds.cpp | 66 +++++++++---------- test/ck_tile/utility/test_sequence.cpp | 6 +- test/ck_tile/utility/test_static_ford.cpp | 4 +- test/cluster_launch/test_cluster_launch.cpp | 2 +- test/cluster_load/test_cluster_load.cpp | 2 +- test/cluster_load/test_cluster_load_async.cpp | 6 +- test/data_type/test_mx_bf6_pk4scale.cpp | 4 +- test/data_type/test_mx_fp6_pk4scale.cpp | 4 +- test/gemm/test_gemm_vgpr.cpp | 2 +- test/mx_wmma_op/mx_wmma_op.hpp | 6 +- test/synchronization/monitor_mwait.cpp | 2 +- .../ops/pooling/pooling_benchmark_single.cpp | 2 +- 103 files changed, 416 insertions(+), 475 deletions(-) delete mode 100644 test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp delete mode 100644 test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp diff --git a/Dockerfile b/Dockerfile index 39d2277e50..7c0da210d3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,7 @@ FROM ubuntu:24.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=7.1.1 -ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +ARG ROCMVERSION=7.13 # TheRock nightly tarball configuration. # By default, discovers the latest tarball from the nightlies index. @@ -26,7 +25,7 @@ ENV HIP_PLATFORM=amd # Add rocm repository RUN set -xe && \ - apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl + apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl cmake git vim nano zip RUN if [ "$compiler_version" = "therock" ]; then \ rm -rf /opt/rocm && mkdir /opt/rocm && \ @@ -43,11 +42,12 @@ RUN if [ "$compiler_version" = "therock" ]; then \ tar -xzf /tmp/rocm.tar.gz -C /opt/rocm --strip-components=1 && \ rm /tmp/rocm.tar.gz ; \ else echo "using the release compiler" && \ - wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \ - apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \ - apt update && \ - apt install python3-setuptools python3-wheel -y && \ - apt install rocm-dev -y; \ + wget https://repo.amd.com/rocm/tarball-multi-arch/therock-dist-linux-multiarch-7.13.0.tar.gz && \ + rm -rf /opt/rocm && mkdir /opt/rocm && \ + tar -xzf therock-dist-linux-multiarch-7.13.0.tar.gz -C /opt/rocm --strip-components=1 && \ + rm therock-dist-linux-multiarch-7.13.0.tar.gz && \ + wget https://repo.radeon.com/amdgpu-install/31.30/ubuntu/noble/amdgpu-install_31.30.313000-1_all.deb && \ + apt install ./amdgpu-install_31.30.313000-1_all.deb -y; \ fi # Install SCCACHE @@ -58,19 +58,14 @@ RUN set -x && \ mkdir -p ${SCCACHE_INSTALL_LOCATION} && \ wget -qO sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v$SCCACHE_VERSION/sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz && \ tar -xzf sccache.tar.gz --strip-components=1 -C ${SCCACHE_INSTALL_LOCATION} && \ - chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache - + chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache && \ # Install dependencies -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ build-essential \ - cmake \ - git \ - iputils-ping \ jq \ libelf-dev \ libnuma-dev \ libpthread-stubs0-dev \ - mpich \ net-tools \ pkg-config \ python3-full \ @@ -79,17 +74,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- sshpass \ stunnel \ software-properties-common \ - vim \ - nano \ zlib1g-dev \ - zip \ libzstd-dev \ openssh-server \ clang-format-18 \ kmod && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - rm -rf amdgpu-install* && \ #Install latest ccache git clone https://github.com/ccache/ccache.git && \ cd ccache && mkdir build && cd build && cmake .. && make install && \ diff --git a/Jenkinsfile b/Jenkinsfile index 12faaaeca2..92a3156097 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -357,13 +357,7 @@ def getBaseDockerImageName(){ img = "${params.USE_CUSTOM_DOCKER}" } else{ - def ROCM_numeric = parseVersion("${params.ROCMVERSION}") - if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 2 ){ - img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}" - } - else{ - img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm${params.ROCMVERSION}" - } + img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}" } return img } @@ -1284,8 +1278,8 @@ pipeline { description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') string( name: 'ROCMVERSION', - defaultValue: '7.1.1', - description: 'Specify which ROCM version to use: 7.1.1 (default).') + defaultValue: '7.13', + description: 'Specify which ROCM version to use: 7.13 (default).') string( name: 'COMPILER_VERSION', defaultValue: '', @@ -1498,6 +1492,7 @@ pipeline { stage('Docker /opt/rocm'){ agent{ label rocmnode("nogpu") } steps{ + deleteDir() buildDocker('/opt/rocm') cleanWs() } @@ -1529,6 +1524,7 @@ pipeline { --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log""" } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd) archiveArtifacts "build/ck_cppcheck.log" cleanWs() @@ -1548,6 +1544,7 @@ pipeline { xargs -P 8 -I{} sh -c 'clang-format-18 -style=file {} | diff -u - {} || (echo "ERROR: {} needs formatting" && exit 1)'""" } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd) cleanWs() } @@ -1648,6 +1645,7 @@ pipeline { ./bin/test_grouped_convnd_fwd_tile""" } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1676,6 +1674,7 @@ pipeline { ./bin/test_grouped_convnd_fwd_large_cases && ./bin/test_grouped_convnd_bwd_data_large_cases && ./bin/test_grouped_convnd_fwd_bias_clamp_large_cases""" } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1716,6 +1715,7 @@ pipeline { ./bin/test_grouped_convnd_bwd_weight_dataset_xdl""" } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1742,6 +1742,7 @@ pipeline { execute_args = build_and_run_fmha("gfx90a") } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1758,6 +1759,7 @@ pipeline { execute_args = build_and_run_fmha("gfx942") } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1774,6 +1776,7 @@ pipeline { execute_args = build_and_run_fmha("gfx950") } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1790,6 +1793,7 @@ pipeline { execute_args = build_and_run_fmha("gfx1201") } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1833,6 +1837,7 @@ pipeline { python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1879,6 +1884,7 @@ pipeline { python3 ../tile_engine/ops/gemm/grouped_gemm/grouped_gemm_benchmark.py . --problem-sizes "1024,1024,1024" --group-counts 8 --warmup 5 --repeat 5 --verbose --json grouped_gemm_results.json """ } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1910,6 +1916,7 @@ pipeline { python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1935,6 +1942,7 @@ pipeline { python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ + deleteDir() buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } @@ -1962,6 +1970,7 @@ pipeline { execute_args = build_client_examples("gfx942") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -1978,6 +1987,7 @@ pipeline { execute_args = build_client_examples("gfx950") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -1995,6 +2005,7 @@ pipeline { execute_args = build_client_examples("gfx908") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -2012,6 +2023,7 @@ pipeline { execute_args = build_client_examples_and_codegen_tests("gfx90a") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -2024,6 +2036,7 @@ pipeline { } agent{ label rocmnode("gfx942") } steps{ + deleteDir() script { def execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ @@ -2048,6 +2061,7 @@ pipeline { execute_args = build_client_examples("gfx10-1-generic") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -2065,6 +2079,7 @@ pipeline { execute_args = build_client_examples("gfx10-3-generic") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -2081,6 +2096,7 @@ pipeline { execute_args = build_client_examples("gfx11-generic") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -2097,6 +2113,7 @@ pipeline { execute_args = build_client_examples("gfx12-generic") } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } @@ -2112,6 +2129,7 @@ pipeline { setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1250" -DDISABLE_DL_KERNELS="ON" """ } steps{ + deleteDir() Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:npi-mi450-latest", config_targets: "install", no_reboot:true, build_type: 'Release', prefixpath: '/usr/local') cleanWs() } @@ -2153,6 +2171,7 @@ pipeline { } agent { label 'mici' } steps{ + deleteDir() process_results() cleanWs() } diff --git a/dispatcher/src/registry.cpp b/dispatcher/src/registry.cpp index cd17fcbd53..d04f02bbc8 100644 --- a/dispatcher/src/registry.cpp +++ b/dispatcher/src/registry.cpp @@ -57,7 +57,7 @@ bool Registry::register_kernel(KernelInstancePtr instance, Priority priority) // Store under the encoded identifier so Registry::lookup(KernelKey) finds it. // Previously stored under instance->get_name(), but lookup(KernelKey) queries by - // key.encode_identifier() — those keys never matched, breaking key-based lookup. + // key.encode_identifier() - those keys never matched, breaking key-based lookup. if(Base::register_kernel(instance->get_key().encode_identifier(), instance, priority)) { if(auto_export_enabled_ && auto_export_on_every_registration_) diff --git a/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp b/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp index 5a2778986d..b9d8fe2966 100644 --- a/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp +++ b/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp @@ -1,4 +1,4 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT /** diff --git a/example/26_contraction/common_instances.hpp b/example/26_contraction/common_instances.hpp index ef39d844da..a6c6ed5621 100644 --- a/example/26_contraction/common_instances.hpp +++ b/example/26_contraction/common_instances.hpp @@ -197,7 +197,7 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device:: // Macro to instantiate all four layout variants of DeviceOpInstance. // -// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes) +// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 - different tile sizes) // SUFFIX: NN for bilinear (DsDataType = Tuple), // N for scale (DsDataType = Tuple<>) // diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp index 35a1289f86..66b2aa8508 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp @@ -4,7 +4,7 @@ /* Grouped Query Attention, Ainslie, Joshua, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebrón, and Sumit -Sanghai. “GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints.” +Sanghai. "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints." arXiv, May 22, 2023. https://doi.org/10.48550/arXiv.2305.13245. Example is GQA-4 diff --git a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp index 224f0ec596..baef346231 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp @@ -3,7 +3,7 @@ /* Multi-Query Attention -Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.” arXiv.org, November 6, +Shazeer, Noam. "Fast Transformer Decoding: One Write-Head Is All You Need." arXiv.org, November 6, 2019. https://arxiv.org/abs/1911.02150v1. */ diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc index d4174f9f1a..9ddc541463 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc +++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc @@ -105,7 +105,7 @@ bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config, if(!conv.IsSupportedArgument(argument)) { std::cout << "device_conv with the specified compilation parameters does " - "not support this Conv problem — skipping." + "not support this Conv problem - skipping." << std::endl; return true; diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc index 1f3ca7ac80..0e9da3e5e3 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc +++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc @@ -93,7 +93,7 @@ bool run_conv_bwd_data(const ExecutionConfig& config, if(!conv.IsSupportedArgument(argument)) { std::cout << "device_conv with the specified compilation parameters does " - "not support this Conv problem — skipping." + "not support this Conv problem - skipping." << std::endl; return true; diff --git a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp index b7a11baf5a..2bca388deb 100644 --- a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp +++ b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp @@ -207,7 +207,7 @@ bool run_grouped_conv(bool do_verification, if(!conv.IsSupportedArgument(argument)) { std::cout << "The device op with the specified compilation parameters does " - "not support this convolution problem — skipping." + "not support this convolution problem - skipping." << std::endl; return true; } diff --git a/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp index ee78490420..450d1b643f 100644 --- a/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -6,8 +6,8 @@ #include "ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp" // Implementation follows the paper: -// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. “Who Says Elephants Can’t Run: -// Bringing Large Scale MoE Models into Cloud Scale Production.” arXiv, November 17, 2022. +// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. "Who Says Elephants Can’t Run: +// Bringing Large Scale MoE Models into Cloud Scale Production." arXiv, November 17, 2022. // https://doi.org/10.48550/arXiv.2211.10017. Assume weight (Matrix B) is add preprocess to // unsigned. diff --git a/example/66_complex_contraction_bilinear/common_instances.hpp b/example/66_complex_contraction_bilinear/common_instances.hpp index 3ae168cb72..98bf7bace6 100644 --- a/example/66_complex_contraction_bilinear/common_instances.hpp +++ b/example/66_complex_contraction_bilinear/common_instances.hpp @@ -197,7 +197,7 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device:: // Macro to instantiate all four layout variants of DeviceOpInstance. // -// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes) +// BASE: Generic (for fp16/bf16/fp32) or FP64 (for fp64 - different tile sizes) // SUFFIX: NN for bilinear (DsDataType = Tuple), // N for scale (DsDataType = Tuple<>) // diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 98e2df2e1e..7a7e07d903 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -677,7 +677,7 @@ struct fmha_batch_prefill_args // GLOBAL_LOAD_LDS: required when (a) the page is smaller than one K/V tile // so per-page SRD is impossible, AND (b) the total KV-pool byte size // exceeds INT32_MAX so SRD's 32-bit byte offset cannot address it. -// BUFFER_LOAD: every other case — the SGPR-resident SRD path is fastest. +// BUFFER_LOAD: every other case - the SGPR-resident SRD path is fastest. // Inputs are taken as plain integers so the helper has no template parameter // and can be called from each codegen-emitted dispatcher arm with the arm's // compile-time kN0 / element_bytes substituted as constants. @@ -691,7 +691,7 @@ fmha_batch_prefill_select_kv_load_mode(ck_tile::index_t page_block_size, // Promote every operand to long_index_t so overflow is impossible regardless // of multiplication order. A bare `static_cast(num_total_pages) // * batch_stride_k * element_bytes` only works because of left-to-right - // associativity — a future reorder of the operands would silently truncate. + // associativity - a future reorder of the operands would silently truncate. const auto kv_pool_bytes = static_cast(num_total_pages) * static_cast(batch_stride_k) * static_cast(element_bytes); diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp index fa6cf38cf0..39228511d5 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp @@ -59,7 +59,7 @@ int main(int argc, char* argv[]) const std::string reduction = arg_parser.get_str("streamk_reduction"); const bool persistent = arg_parser.get_int("streamk_persistent") != 0; - // Dispatch on reduction strategy × persistent DP + // Dispatch on reduction strategy x persistent DP if(reduction == "linear" && !persistent) { using Invoker = GroupedConvolutionBackwardWeightInvoker< diff --git a/example/ck_tile/42_mx_gemm/run_mx_gemm.inc b/example/ck_tile/42_mx_gemm/run_mx_gemm.inc index 7ccd4e4273..9a375d2810 100644 --- a/example/ck_tile/42_mx_gemm/run_mx_gemm.inc +++ b/example/ck_tile/42_mx_gemm/run_mx_gemm.inc @@ -175,7 +175,7 @@ int run_mx_gemm_with_layouts(int argc, char* argv[], ALayout, BLayout, CLayout) constexpr ck_tile::index_t NXdlPackEff = (NIterPerWarp_ >= 2 && NIterPerWarp_ % 2 == 0) ? 2 : 1; constexpr ck_tile::index_t KXdlPackEff = (KIterPerWarp_ >= 2 && KIterPerWarp_ % 2 == 0) ? 2 : 1; - // Pack scales: [M, K/32] e8m0_t → [M/MXdlPackEff, K/32/KXdlPackEff] int32_t + // Pack scales: [M, K/32] e8m0_t -> [M/MXdlPackEff, K/32/KXdlPackEff] int32_t // Original unpacked tensors are kept for CPU reference validation constexpr ck_tile::index_t XdlMNThread = GemmConfig::M_Warp_Tile; constexpr ck_tile::index_t XdlKThread = 64 / XdlMNThread; diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp index a90e338c18..60b104add5 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp @@ -63,7 +63,7 @@ concept TileAlgorithm = ConvAlgorithmDescriptor && SpecifiesTileThreadBlock && SpecifiesTileConvSpecialization && SpecifiesTileBlockGemm && SpecifiesTileOptimizations; -// Depthwise tile-based algorithm concept (no GEMM — direct spatial pipeline) +// Depthwise tile-based algorithm concept (no GEMM - direct spatial pipeline) template concept DepthwiseAlgorithm = ConvAlgorithmDescriptor && SpecifiesDepthwiseConvParams; diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp index 9ee2953647..bb5a6268de 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp @@ -116,7 +116,7 @@ constexpr auto make_conv_instance() { return typename ReferenceFactory::Instance{}; } - // Depthwise tile algorithm — direct spatial pipeline, no GEMM + // Depthwise tile algorithm - direct spatial pipeline, no GEMM else if constexpr(DepthwiseAlgorithm) { return typename ConvDepthwiseTileFactory::Instance{}; diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp index ea292d8bc4..2670555401 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp @@ -66,10 +66,10 @@ struct InstanceTraits{}); diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp index 28ba99b033..931db03afd 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp @@ -31,7 +31,7 @@ namespace reflect { namespace detail { -// Guards access to TilePartitioner members — primary template is depthwise (void partitioner). +// Guards access to TilePartitioner members - primary template is depthwise (void partitioner). template struct TilePartitionerFields { @@ -62,7 +62,7 @@ struct TilePartitionerFields static constexpr int kKWarpTile = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); }; -// Guards access to GemmPipeline scheduling members — primary template is depthwise. +// Guards access to GemmPipeline scheduling members - primary template is depthwise. template struct GemmPipelineFields { @@ -99,7 +99,7 @@ struct InstanceTraits; static constexpr int kMPerBlock = TPF::kMPerBlock; static constexpr int kNPerBlock = TPF::kNPerBlock; @@ -128,7 +128,7 @@ struct InstanceTraits; using GemmPipeline = GemmPipeline_; static constexpr ck_tile::GemmPipelineScheduler kPipelineScheduler = GPF::kPipelineScheduler; diff --git a/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp b/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp index fa4bc73bd2..ff0410f658 100644 --- a/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp +++ b/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp @@ -46,7 +46,7 @@ using ::ck_tile::builder::PipelineVersion; // ============================================================================ // Comprehensive Transformation Tests - Per Device Class Template // ============================================================================ -// These tests verify the complete InstanceTraits → ConvTraits transformation +// These tests verify the complete InstanceTraits -> ConvTraits transformation // for each forward convolution Device class template. // ============================================================================ diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index e1f6891c58..3dd8273928 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -744,7 +744,7 @@ using ConvAlgorithm_Tile_GroupedConvolutionKernel_StreamK = TileOptimizations_, TileStreamK_>; -// CK Tile depthwise convolution algorithm (no GEMM — direct spatial pipeline) +// CK Tile depthwise convolution algorithm (no GEMM - direct spatial pipeline) using ConvAlgorithm_Tile_DepthwiseConvolutionKernel = ConvAlgorithmTemplate; diff --git a/experimental/builder/test/testing_utils.cpp b/experimental/builder/test/testing_utils.cpp index e9677e5940..1ee5e76f74 100644 --- a/experimental/builder/test/testing_utils.cpp +++ b/experimental/builder/test/testing_utils.cpp @@ -21,7 +21,7 @@ namespace ck_tile::test { // Wagner-Fischer Algorithm for Computing Edit Distance and Inline Diff // // OUTPUT FORMAT: [expected|actual] for differences, plain text for matches -// Example: "hello world" vs "hello earth" → "hello [world|earth]" +// Example: "hello world" vs "hello earth" -> "hello [world|earth]" // // This function implements the Wagner-Fischer algorithm (1974), which is the classic // dynamic programming solution for computing the minimum edit distance (Levenshtein distance) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp index c98e47f4f7..7170a40eb6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp @@ -1,4 +1,4 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates // SPDX-License-Identifier: MIT #pragma once @@ -26,7 +26,7 @@ namespace ck { // - Math waves (threads 0..TileMath-1): LDS read + MFMA + CShuffle epilogue // // The conv-to-GEMM descriptor transforms (which generate heavy VALU ops) execute only on load -// waves, while math waves see simple LDS layouts — eliminating the VALU/MFMA slot conflict. +// waves, while math waves see simple LDS layouts - eliminating the VALU/MFMA slot conflict. template LDS, once before LDS->global). // For !TransposeC && !IsMxGemm, the SFC access count equals // (MXdlPerWave / CShuffleMXdlPerWavePerShuffle) * (NXdlPerWave / // CShuffleNXdlPerWavePerShuffle) because the M2/M4/N2 SFC dimensions have equal diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index aac6d39647..7f1c264bbf 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -1449,21 +1449,21 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) // Bypasses the SRD's 32-bit offset limit; required when the KV cache exceeds // INT32_MAX (2GB) byte offset on the SRD voffset path. // -// !!! M0 PRECONDITION — IMPLICIT INPUT NOT VISIBLE IN OPERAND LIST !!! +// !!! M0 PRECONDITION - IMPLICIT INPUT NOT VISIBLE IN OPERAND LIST !!! // // The LDS destination address is taken from M0 (per AMD CDNA3 ISA §10.3: // `LDS_ADDR = LDSbase + LDSoffset(M0[17:2] * 4) + INST.OFFSET + ThreadID*4`). // M0 does NOT appear as an operand of these instructions or of the inline -// asm below — the compiler cannot see the dependency. Caller must: +// asm below - the compiler cannot see the dependency. Caller must: // // 1. Initialize M0 once before the load loop: // `m0_set_with_memory(amd_wave_read_first_lane(lds_byte_offset));` -// M0 is SALU-only — `m0_set_with_memory` uses an "s" constraint to +// M0 is SALU-only - `m0_set_with_memory` uses an "s" constraint to // enforce this. Direct VALU writes to M0 are illegal. // // 2. Advance M0 between successive issues: // `m0_inc_with_memory(size_per_issue);` -// `size_per_issue` MUST be a multiple of 4 — GLOBAL/FLAT LDS path +// `size_per_issue` MUST be a multiple of 4 - GLOBAL/FLAT LDS path // only honors M0[17:2]*4 (dword-aligned), so low 2 bits are silently // dropped (NOTE: this differs from MUBUF buffer_load_lds which uses // M0[15:0] as a raw byte offset). @@ -1479,7 +1479,7 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) // // Verified instruction emission (HIP 6.4 / clang 19, gfx942 + gfx950): // `global_load_lds_dwordx4` is a single instruction (encoding 0xDDF48000 -// 0x007F0000), NOT software-expanded into 4× dword. Same encoding on both +// 0x007F0000), NOT software-expanded into 4x dword. Same encoding on both // arches. The opcode is undocumented in CDNA3 ISA spec §13.6.2 but // supported by the LLVM AMDGPU backend. // @@ -1501,7 +1501,7 @@ async_global_load_lds_dwordxn(void* smem, const void* global_addr, bool_constant // Inline asm: only the global address is an explicit operand. The LDS // destination is implicit via M0 (see contract above). `"=r"(smem)` is a -// SSA scheduling anchor only — `smem` is NOT written by this asm; the +// SSA scheduling anchor only - `smem` is NOT written by this asm; the // load goes to LDS at `M0[17:2]*4 + offset:0 + ThreadID*4`. #define CK_TILE_GLOBAL_LOAD_LDS_INSTR(instr) \ if constexpr(pre_nop) \ diff --git a/include/ck_tile/core/arch/amd_cluster_load.hpp b/include/ck_tile/core/arch/amd_cluster_load.hpp index 1af1b6b2ec..19ca7690a6 100644 --- a/include/ck_tile/core/arch/amd_cluster_load.hpp +++ b/include/ck_tile/core/arch/amd_cluster_load.hpp @@ -32,7 +32,7 @@ CK_TILE_DEVICE __attribute__((address_space(3))) T* to_lds(T* ptr) #endif // __gfx1250__ // Struct specializations for CLUSTER_LOAD_B32/B64/B128. -// Primary template intentionally undefined — compile error for unsupported sizes. +// Primary template intentionally undefined - compile error for unsupported sizes. template struct cluster_load; @@ -104,7 +104,7 @@ CK_TILE_DEVICE T cluster_multicast_load(const T* addr, int mask) } // --------------------------------------------------------------------------- -// CLUSTER_LOAD_ASYNC_TO_LDS_B* — async global→LDS multicast (gfx1250 only) +// CLUSTER_LOAD_ASYNC_TO_LDS_B* - async global->LDS multicast (gfx1250 only) // --------------------------------------------------------------------------- // Unlike CLUSTER_LOAD_B*, data lands in LDS (not VGPRs) and is tracked by // ASYNCcnt. Wait with s_wait_asynccnt(0) on the requesting wave, then use @@ -120,7 +120,7 @@ CK_TILE_DEVICE T cluster_multicast_load(const T* addr, int mask) // by the hardware instruction (default 0). // Struct specializations for CLUSTER_LOAD_ASYNC_TO_LDS_B32/B64/B128. -// Primary template intentionally undefined — compile error for unsupported sizes. +// Primary template intentionally undefined - compile error for unsupported sizes. template struct cluster_load_async_to_lds; @@ -184,10 +184,10 @@ struct cluster_load_async_to_lds<16, inst_offset> }; // Generic wrapper: issues CLUSTER_LOAD_ASYNC_TO_LDS_B* sized to T. -// `src` — global source pointer (generic address space; cast to global internally) -// `lds_dst` — per-lane LDS destination pointer (must be address_space(3)) -// `mask` — M0[15:0] WGP participation mask; M0[16] sets early-timeout -// `inst_offset` — compile-time immediate byte offset added to lds_dst by the hardware +// `src` - global source pointer (generic address space; cast to global internally) +// `lds_dst` - per-lane LDS destination pointer (must be address_space(3)) +// `mask` - M0[15:0] WGP participation mask; M0[16] sets early-timeout +// `inst_offset` - compile-time immediate byte offset added to lds_dst by the hardware template CK_TILE_DEVICE void cluster_multicast_load_async_to_lds(const T* src, __attribute__((address_space(3))) diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp index 4e94d6e902..87e55183df 100644 --- a/include/ck_tile/core/container/sequence.hpp +++ b/include/ck_tile/core/container/sequence.hpp @@ -544,7 +544,7 @@ struct sequence_sort_helper, Compare, sequence> { constexpr index_t n = sizeof...(Vs); sort_result r{{{Vs...}}, {{Idx...}}}; - // insertion sort — O(N^2) constexpr steps, O(1) template depth + // insertion sort - O(N^2) constexpr steps, O(1) template depth for(index_t i = 1; i < n; ++i) { for(index_t j = i; j > 0 && Compare{}(r.values[j], r.values[j - 1]); --j) diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 63072ab559..bd69cedbb6 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -467,7 +467,7 @@ struct buffer_view* g_src = reinterpret_cast*>(p_uniform_ptr + i + linear_offset); - // reinterpret_cast changes only the element type (generic→generic, no address-space - // change). to_lds then converts generic→address_space(3) using a pragma-guarded + // reinterpret_cast changes only the element type (generic->generic, no address-space + // change). to_lds then converts generic->address_space(3) using a pragma-guarded // C-style cast, matching the pattern used by the rest of the codebase. auto* lds_ptr = to_lds(reinterpret_cast*>(smem)); diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp index 14b8b2842a..e996334bfd 100644 --- a/include/ck_tile/core/tensor/load_tile.hpp +++ b/include/ck_tile/core/tensor/load_tile.hpp @@ -45,7 +45,7 @@ CK_TILE_DEVICE auto load_tile(const TileWindow_& tile_window, * * @note This function is a modification of the existing load function. * It has been extended with two additional parameters: it takes a tuple as input - * and an elementwise function. For each A = A0, A1… AN, the elementwise function + * and an elementwise function. For each A = A0, A1... AN, the elementwise function * is additionally applied during a single read. */ template ; - // 3. PS→RHS mapping constraints + // 3. PS->RHS mapping constraints static constexpr auto input_ps_major = InDstrEncode::ps_to_rhss_major_; static constexpr auto input_ps_minor = InDstrEncode::ps_to_rhss_minor_; @@ -249,7 +249,7 @@ struct DefaultTranspose util::is_sequence_suffix_v; - // 4. YS→RHS mapping constraints + // 4. YS->RHS mapping constraints static constexpr auto input_ys_major = InDstrEncode::ys_to_rhs_major_; static constexpr auto input_ys_minor = InDstrEncode::ys_to_rhs_minor_; static constexpr auto quad_ys_major = QuadEncoding::ys_to_rhs_major_; @@ -354,7 +354,7 @@ struct TransposeTileDistributionTraits }, number{}); - // for PS→RHS mapping(both major and minor), we need to modify the last element (which is for + // for PS->RHS mapping(both major and minor), we need to modify the last element (which is for // thread distr) of the major sequence static constexpr auto dst_ps_to_rhss_major = generate_tuple( // for major because of dst_out_hs_lengthss is reversed, this index also need to be reversed diff --git a/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk b/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk index 5f73d4934a..2757760d75 100644 --- a/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk +++ b/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk @@ -132,7 +132,7 @@ struct DefaultTranspose static constexpr bool suffix_valid_dim1 = util::is_sequence_suffix_v; - // 3. PS→RHS mapping constraints + // 3. PS->RHS mapping constraints static constexpr auto input_ps_major = InDstrEncode::ps_to_rhss_major_; static constexpr auto input_ps_minor = InDstrEncode::ps_to_rhss_minor_; @@ -157,7 +157,7 @@ struct DefaultTranspose util::is_sequence_suffix_v; - // 4. YS→RHS mapping constraints + // 4. YS->RHS mapping constraints static constexpr auto input_ys_major = InDstrEncode::ys_to_rhs_major_; static constexpr auto input_ys_minor = InDstrEncode::ys_to_rhs_minor_; static constexpr auto quad_ys_major = QuadEncoding::ys_to_rhs_major_; @@ -264,7 +264,7 @@ struct TransposeTileDistributionTraits }, number{}); - // for PS→RHS mapping(both major and minor), we need to modify the last element (which is for + // for PS->RHS mapping(both major and minor), we need to modify the last element (which is for // thread distr) of the major sequence static constexpr auto dst_ps_to_rhss_major = generate_tuple( // for major because of dst_out_hs_lengthss is reversed, this index also need to be reversed diff --git a/include/ck_tile/core/tensor/tile_scatter_gather.hpp b/include/ck_tile/core/tensor/tile_scatter_gather.hpp index 45131abb97..1f34261526 100644 --- a/include/ck_tile/core/tensor/tile_scatter_gather.hpp +++ b/include/ck_tile/core/tensor/tile_scatter_gather.hpp @@ -405,7 +405,7 @@ struct tile_scatter_gather // register window to be reused as scratch and scattered the SRD writes // across two conditional branches, which gfx950's packed // buffer_load_dwordx4 issue window doesn't tolerate (gfx942 absorbs it - // via per-tile single-dword loads). __builtin_assume is hint-only — + // via per-tile single-dword loads). __builtin_assume is hint-only - // no branch, no scratch SGPRs, no codegen impact. __builtin_assume(size > 0); using BufType = remove_cvref_t; diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index 7e86af6f2a..d3dd6596e5 100644 --- a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -190,12 +190,12 @@ struct tile_window_with_static_distribution /** * @brief Load tile with elementwise function * - * @note Load tile with elementwise — during value loading, an - * elementwise function is executed for each A0, A1, … AN. - * The values A0, A1, … AN are read by the same thread. In this way, we + * @note Load tile with elementwise - during value loading, an + * elementwise function is executed for each A0, A1, ... AN. + * The values A0, A1, ... AN are read by the same thread. In this way, we * reduce the amount of information loaded into the registers. * The same thread, during vectorized reading, accesses the same set of - * data from A0, A1, A2, … AN. + * data from A0, A1, A2, ... AN. */ template * { * [[maybe_unused]] AsmScopeMarker marker; // Emits CK_ASM_SCOPE_START * // ... code you want to delimit in assembly ... - * } // marker goes out of scope → Emits CK_ASM_SCOPE_END + * } // marker goes out of scope -> Emits CK_ASM_SCOPE_END * */ struct AsmScopeMarker diff --git a/include/ck_tile/core/utility/type_traits.hpp b/include/ck_tile/core/utility/type_traits.hpp index a9584bd592..1273062612 100644 --- a/include/ck_tile/core/utility/type_traits.hpp +++ b/include/ck_tile/core/utility/type_traits.hpp @@ -12,7 +12,7 @@ namespace ck_tile { -// `always_false_v` — a value-template that is always `false` but whose +// `always_false_v` - a value-template that is always `false` but whose // evaluation is deferred until template instantiation. The canonical use is // inside the `else` arm of an `if constexpr` chain or under an arch-gated // `#if` to fire a `static_assert` ONLY when the offending instantiation is diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp index 8c9c87e3ee..d51373b1b8 100644 --- a/include/ck_tile/host/reference/reference_topk.hpp +++ b/include/ck_tile/host/reference/reference_topk.hpp @@ -16,7 +16,7 @@ namespace ck_tile { /* similiar to torch.topk() x (Tensor) – the input tensor. - k (int) – the k in “top-k” + k (int) – the k in "top-k" dim (int, optional) – the dimension to sort along largest (bool, optional) – largest or smallest elements sorted (bool, optional) – elements in sorted order or not diff --git a/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp b/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp index 988daf1c27..bb0d1f7bb3 100644 --- a/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp +++ b/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp @@ -33,7 +33,7 @@ * **E[G₀,G₁,...,M₀,M₁,...,N₀,N₁,...] = epilogue_op(C, D₀, D₁, D₂, ...)** * * Where: - * **C[G₀,G₁,...,M₀,M₁,...,N₀,N₁,...] = Σ_{K₀,K₁,...} A[G₀,G₁,...,M₀,M₁,...,K₀,K₁,...] × + * **C[G₀,G₁,...,M₀,M₁,...,N₀,N₁,...] = Σ_{K₀,K₁,...} A[G₀,G₁,...,M₀,M₁,...,K₀,K₁,...] x * B[G₀,G₁,...,N₀,N₁,...,K₀,K₁,...]** * * Where: @@ -50,16 +50,16 @@ * to the dot product computation in matrix multiplication. * * **Dimension Flattening Strategy**: - * - **M dimensions** (from tensor A) → Flattened into matrix rows (M_total) - * - **N dimensions** (from tensor B) → Flattened into matrix columns (N_total) - * - **K dimensions** (contraction dims) → Flattened into inner dimension (K_total) - * - **G dimensions** (batch dims) → Handled through batch processing + * - **M dimensions** (from tensor A) -> Flattened into matrix rows (M_total) + * - **N dimensions** (from tensor B) -> Flattened into matrix columns (N_total) + * - **K dimensions** (contraction dims) -> Flattened into inner dimension (K_total) + * - **G dimensions** (batch dims) -> Handled through batch processing * * **Mathematical Transformation**: * ``` - * Original: E[g,m₀,m₁,n₀,n₁] = Σ_{k₀,k₁} A[g,m₀,m₁,k₀,k₁] × B[g,n₀,n₁,k₀,k₁] - * Flattened: E[g,M,N] = Σ_K A[g,M,K] × B[g,N,K] (where M=m₀×m₁, N=n₀×n₁, K=k₀×k₁) - * GEMM Form: E = A × Bᵀ + * Original: E[g,m₀,m₁,n₀,n₁] = Σ_{k₀,k₁} A[g,m₀,m₁,k₀,k₁] x B[g,n₀,n₁,k₀,k₁] + * Flattened: E[g,M,N] = Σ_K A[g,M,K] x B[g,N,K] (where M=m₀xm₁, N=n₀xn₁, K=k₀xk₁) + * GEMM Form: E = A x Bᵀ * * **Why This Approach Is Optimal**: * Rather than implementing tensor contraction from scratch, this kernel leverages the highly diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp index d291244aa4..f4d4c44811 100644 --- a/include/ck_tile/ops/common/streamk_common.hpp +++ b/include/ck_tile/ops/common/streamk_common.hpp @@ -256,8 +256,8 @@ struct StreamKReductionOps /// /// @tparam TilePartitioner_ Partitioner type (persistent or non-persistent specialization). /// @param tile_partitioner The partitioner instance from kernel args. -/// @param dp_tile_func Callable(index_t tile_idx) — processes one full DP tile. -/// @param sk_func Callable(index_t sk_cta_idx) — runs the StreamK loop for this CTA. +/// @param dp_tile_func Callable(index_t tile_idx) - processes one full DP tile. +/// @param sk_func Callable(index_t sk_cta_idx) - runs the StreamK loop for this CTA. template CK_TILE_DEVICE void StreamKDispatch(const TilePartitioner_& tile_partitioner, DPTileFunc dp_tile_func, SKFunc sk_func) diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp index 3f5e32383d..a6b3f86b94 100644 --- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp +++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp @@ -796,7 +796,7 @@ struct PassThrough { y = ck_tile::type_convert>(x); } - /* otherwise (r-value or const) → do nothing */ + /* otherwise (r-value or const) -> do nothing */ } template diff --git a/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp b/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp index 683bfe7377..c9351a783f 100644 --- a/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp +++ b/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp @@ -13,17 +13,19 @@ namespace ck_tile { /// @par Purpose /// Each tag corresponds to a pre-built schedule, these are used to select a schedule -/// Standard epilogue schedule: Slice → CastStore → Load → ApplyD → Store → Move +/// Standard epilogue schedule: Slice -> CastStore -> Load -> ApplyD -> Store -> Move struct DefaultScheduleTag { }; -/// RowCol quantization schedule: Slice → ScaleWindow → CastStore → Load → ApplyD → Store → Move +/// RowCol quantization schedule: Slice -> ScaleWindow -> CastStore -> Load -> ApplyD -> Store -> +/// Move struct RowColQuantScheduleTag { }; -/// Tensor quantization schedule: Slice → ScaleScalar → CastStore → Load → ApplyD → Store → Move +/// Tensor quantization schedule: Slice -> ScaleScalar -> CastStore -> Load -> ApplyD -> Store -> +/// Move struct TensorQuantScheduleTag { }; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index 7aff21530d..07220f4457 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -33,17 +33,17 @@ struct FmhaBwdWorkspaceManager // CPU workspace (prepared by host, read-only for kernels): // index_t nsplits[batch or 1] - // — per-batch nsplits array (batch element in deterministic group mode) + // - per-batch nsplits array (batch element in deterministic group mode) // [OPTIONAL, only for deterministic group mode] // long_index_t dq_acc_offsets[batch] - // — per-batch offset array + // - per-batch offset array // GPU WORKSPACE BELOW (read & written by kernels): // [OPTIONAL, only for !kUseQrQtrDorPipeline] // AccDataType dq_acc[total_elements] - // — dq_acc compact buffer (zeroed if necessary) + // - dq_acc compact buffer (zeroed if necessary) // - total_elements = sum_i(nhead * nsplits_i * seqq_i) * hdim_q // - Layout within each batch: [nhead, nsplits_i, seqq_i, hdim_q] // - note: use physical (including padding) length for seqq_i for group mode diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp index 3f284c5c91..444fdef69b 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp @@ -162,7 +162,7 @@ CK_TILE_HOST_DEVICE void kv_offset_array_transform(const IndexArrayType& physica const index_t& thread_coord_start = coord_vec[kCoordAxis]; constexpr index_t kInPageOffsetMask = (1 << kLog2PageSize) - 1; - // Addressing strategy — four cases controlled by (kPageBlockSize vs kN0, kUseGlobalLoad_): + // Addressing strategy - four cases controlled by (kPageBlockSize vs kN0, kUseGlobalLoad_): // // Case 1: kPageBlockSize >= kN0 // SRD is rebased per-tile to the page base (rebase_{k,v}_window in caller). @@ -733,24 +733,24 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync // The decomposition pattern differs by memory layout: // // VECTORIZED_LAYOUT (ColumnMajor, custom distribution): - // 3D decomposition: K = K2 × K0 × K1 + // 3D decomposition: K = K2 x K0 x K1 // - K2 (V_KIterOuter): Outer iteration count // - K0 (V_KLanes): Lanes for K dimension (matches GEMM kABKLane) // - K1 (V_KIterInner): Vector load size (matches GEMM kKPerThread) // - hs_lengthss_[I1] = {K2, K0, K1}, size = 3 (or {K0, K1} size = 2 if no outer iter) // // LINEAR_LAYOUT ColumnMajor (base class distribution): - // 2D decomposition: K = K0 × K1 + // 2D decomposition: K = K0 x K1 // - K0: Lanes for K dimension (may not match GEMM kABKLane) // - K1: Vector load size // - hs_lengthss_[I1] = {K0, K1}, size = 2 // // LINEAR_LAYOUT RowMajor (base class distribution): - // 4D decomposition: K = K0 × K1 × K2 × K3 (uses shuffle_tile for GEMM alignment) - // 3D decomposition: K = K0 × K1 × K2 (fallback case) + // 4D decomposition: K = K0 x K1 x K2 x K3 (uses shuffle_tile for GEMM alignment) + // 3D decomposition: K = K0 x K1 x K2 (fallback case) // - Page lookup uses Y-space's last dimension only (inner iteration) // - // V_PageIdxRepeat = total number of page lookups per thread = V_KIterOuter × V_KIterInner + // V_PageIdxRepeat = total number of page lookups per thread = V_KIterOuter x V_KIterInner constexpr index_t V_KIterInner = VDstrEncode::hs_lengthss_[I1].back(); // Compute V_KIterOuter and V_KLanes based on memory layout and K decomposition @@ -938,7 +938,7 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync v_physical_pages, stride_v, page_stride_v, v_coord, v_offsets, current_seq_k); } - // v_offsets semantics — see the four-case addressing-strategy block above + // v_offsets semantics - see the four-case addressing-strategy block above // kNeedFullOffset in kv_offset_array_transform. Three cases reach this lambda: // Case 1 (kPageBlockSize >= kN0): within-page offset; page base in SRD. // Case 2 (page_size < kN0, kUseGlobalLoad): within-page offset; page base computed @@ -970,7 +970,7 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync // Initial V SRD rebase. Single source of truth: rebase_v_window's own // `if constexpr(kPageBlockSize >= kN0)` makes this a no-op for case 2/3. - // Do not re-add an outer guard here — it would duplicate the inner check + // Do not re-add an outer guard here - it would duplicate the inner check // and drift if the lambda's gating condition ever changes. rebase_v_window(v_dram_window, v_physical_pages[number<0>{}]); @@ -1015,12 +1015,12 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync // To support smaller page sizes (cross-page tiles), need: // // 1. K descale: Load per-token k_descale_vec[NRepeat] based on k_physical_pages[k0] - // - After GEMM0 (S = Q × K^T), apply column-wise scaling: S[:,j] *= k_descale[j] + // - After GEMM0 (S = Q x K^T), apply column-wise scaling: S[:,j] *= k_descale[j] // - Requires modifying s_acc_element_func to accept column index // // 2. V descale: Load per-token v_descale_vec[V_PageIdxRepeat] based on // v_physical_pages[k0] - // - Before GEMM1 (O = P × V), apply row-wise scaling to P: P[i,j] *= v_descale[j] + // - Before GEMM1 (O = P x V), apply row-wise scaling to P: P[i,j] *= v_descale[j] // - Or pre-scale V in LDS (more complex) // // 3. K and V may be on different pages for the same token index, so need separate @@ -1554,7 +1554,7 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync k_dram_window.update_physical_pages(k_physical_pages); rebase_k_window(k_dram_window, k_physical_pages[number<0>{}]); - // After sink→window transition (i_total_loops == num_sink_loop), V window + // After sink->window transition (i_total_loops == num_sink_loop), V window // was advanced by kN0 (one normal iter), but current_seq_k jumped by k_advance // = seqlen_k_start - sink_seq_end + kN0 > kN0. Re-init V to current_seq_k. if constexpr(kHasSink) diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp index ac868ce4b8..fbe2bd819e 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp @@ -33,7 +33,7 @@ namespace ck_tile { // warp gemm calls = MIterPerWarp * NIterPerWarp * KIterPerWarp // MFMAs per call = WarpGemm::kK / WarpGemm::WarpGemmAttribute::Impl::kK (kKIter) // -// For bf16/fp16 kKIter=1; for fp8 kKIter=2 (K=32 warp gemm wraps 2× K=16 MFMA). +// For bf16/fp16 kKIter=1; for fp8 kKIter=2 (K=32 warp gemm wraps 2x K=16 MFMA). // --------------------------------------------------------------------------- template static constexpr ck_tile::index_t block_gemm_mfma_count_v = @@ -65,7 +65,7 @@ struct CoreLoopSchedulerDefaultBase { using Params = CoreLoopSchedulingParams; - // Phase helper: GEMM0 compute (QK matmul) — MFMA interleaved with TRANS + VALU + // Phase helper: GEMM0 compute (QK matmul) - MFMA interleaved with TRANS + VALU CK_TILE_DEVICE static constexpr void schedule_gemm0_compute() { static_for<0, Params::kMfmaPerWarpGemm0, 1>{}([&](auto) { @@ -75,7 +75,7 @@ struct CoreLoopSchedulerDefaultBase }); } - // Phase helper: GEMM1 compute (PV matmul) — optional packed-FP32 preamble + MFMA/VALU + // Phase helper: GEMM1 compute (PV matmul) - optional packed-FP32 preamble + MFMA/VALU CK_TILE_DEVICE static constexpr void schedule_gemm1_compute() { #if !CK_TILE_DISABLE_PACKED_FP32 @@ -87,7 +87,7 @@ struct CoreLoopSchedulerDefaultBase }); } - // Phase helper: load phase (memory/LDS loads) — VALU + SALU + // Phase helper: load phase (memory/LDS loads) - VALU + SALU CK_TILE_DEVICE static constexpr void schedule_load_phase() { __builtin_amdgcn_sched_group_barrier(LLVMSchedGroupMask::VALU, 2, 0); @@ -119,21 +119,21 @@ struct CoreLoopSchedulerDefaultBase template struct CoreLoopSchedulerImpl; -// bf16 — uses default base +// bf16 - uses default base template struct CoreLoopSchedulerImpl : CoreLoopSchedulerDefaultBase { }; -// fp16 — uses default base +// fp16 - uses default base template struct CoreLoopSchedulerImpl : CoreLoopSchedulerDefaultBase { }; -// fp8 — asymmetric GEMM0 scheduling for 2× K iterations +// fp8 - asymmetric GEMM0 scheduling for 2x K iterations // // FP8 GEMM0 has 16 MFMAs (kKIter=2) but the same TRANS work as bf16/fp16 (softmax // exp count is dtype-independent). The uniform (MFMA:1, TRANS:2, VALU:2) pattern @@ -141,8 +141,8 @@ struct CoreLoopSchedulerImpl struct CoreLoopSchedulerImpl : CoreLoopSchedulerDefaultBase @@ -165,12 +165,12 @@ struct CoreLoopSchedulerImplpermlane->max->mul chain, creating // a data dependency gap around MFMAs 8-11. Use a looser VALU constraint for the // second half to give the scheduler freedom to place v_fma where available. CK_TILE_DEVICE static constexpr void schedule_gemm1_compute() @@ -190,7 +190,7 @@ struct CoreLoopSchedulerImpl CK_TILE_DEVICE static constexpr void schedule(ck_tile::number, ck_tile::number) diff --git a/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp b/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp index b03046d192..48abbd3594 100644 --- a/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp +++ b/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp @@ -104,7 +104,7 @@ struct BlockWeightPreshuffleASmemBRegCReg { constexpr auto a_load_dstr = make_static_tile_distribution(MakeABlockDistributionEncode()); - // create MIterPerWarp × KIterPerWarp window + // create MIterPerWarp x KIterPerWarp window return generate_tuple( [&](auto kIter) { return generate_tuple( diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index 8405606ddf..cdfbafc6c0 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -496,10 +496,10 @@ struct GemmClusterTilePartitioner EXAMPLE CONFIGURATION: - - Cluster dimensions: ClusterM = 2, ClusterN = 2 (2×2 cluster) - - Grid dimensions: GridM = 6, GridN = 4 (6×4 output tiles) - - Number of clusters: (6/2) × (4/2) = 3 × 2 = 6 clusters - - Blocks per cluster: 2 × 2 = 4 blocks + - Cluster dimensions: ClusterM = 2, ClusterN = 2 (2x2 cluster) + - Grid dimensions: GridM = 6, GridN = 4 (6x4 output tiles) + - Number of clusters: (6/2) x (4/2) = 3 x 2 = 6 clusters + - Blocks per cluster: 2 x 2 = 4 blocks The tables below show which BLOCK (identified by its flattened cluster_id) processes each output TILE position (tile_m, tile_n). Values 0-5 represent the 6 @@ -515,7 +515,7 @@ struct GemmClusterTilePartitioner TILE ASSIGNMENT (each cell shows which cluster processes that tile): - N→ 0 1 2 3 + N-> 0 1 2 3 ┌────────────────────────┐ M 0 │ │ 0 │ 0 │ 3 │ 3 │ │ ├────────────────────┤ @@ -548,7 +548,7 @@ struct GemmClusterTilePartitioner TILE ASSIGNMENT (interleaved along both M and N): - N→ 0 1 2 3 + N-> 0 1 2 3 ┌────────────────────────┐ M 0 │ │ 0 │ 3 │ 0 │ 3 │ │ ├────────────────────┤ @@ -581,7 +581,7 @@ struct GemmClusterTilePartitioner TILE ASSIGNMENT (interleaved along M, contiguous along N): - N→ 0 1 2 3 + N-> 0 1 2 3 ┌────────────────────────┐ M 0 │ │ 0 │ 0 │ 3 │ 3 │ │ ├────────────────────┤ diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp index 2d757b0b54..8ebc8fb822 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -504,21 +504,21 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); - // Load tile — during value loading, an elementwise function is executed for each A0, - // A1, … AN. The values A0, A1, … AN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each A0, + // A1, ... AN. The values A0, A1, ... AN are read by the same thread. auto elementwise_As_res = load_tile_with_elementwise(a_copy_dram_window, a_element_func); - // Move each A — the enhanced function move_tile_window is executed, which takes a tuple + // Move each A - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(a_copy_dram_window, a_dram_tile_window_step); - // Load tile — during value loading, an elementwise function is executed for each B0, - // B1, … BN. The values B0, B1, … BN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each B0, + // B1, ... BN. The values B0, B1, ... BN are read by the same thread. auto elementwise_Bs_res = load_tile_with_elementwise(b_copy_dram_window, b_element_func); - // Move each B — the enhanced function move_tile_window is executed, which takes a tuple + // Move each B - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(b_copy_dram_window, b_dram_tile_window_step); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index b67cda5618..c1d5f6e74a 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -373,11 +373,11 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 }, number{}); - // Load tile — during value loading, an elementwise function is executed for each A0, - // A1, … AN. The values A0, A1, … AN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each A0, + // A1, ... AN. The values A0, A1, ... AN are read by the same thread. auto elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func); - // Move each A — the enhanced function move_tile_window is executed, which takes a tuple + // Move each A - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(a_tile_windows, a_dram_tile_window_step); @@ -392,11 +392,11 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 }, number{}); - // Load tile — during value loading, an elementwise function is executed for each B0, - // B1, … BN. The values B0, B1, … BN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each B0, + // B1, ... BN. The values B0, B1, ... BN are read by the same thread. auto elementwise_Bs_res = load_tile_with_elementwise(b_tile_windows, b_element_func); - // Move each B — the enhanced function move_tile_window is executed, which takes a tuple + // Move each B - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(b_tile_windows, b_dram_tile_window_step); @@ -502,8 +502,8 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 Base::LocalPrefill(b_copy_lds_window1, elementwise_Bs_res); } - // Load tile — during value loading, an elementwise function is executed for each A0, - // A1, … AN. The values A0, A1, … AN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each A0, + // A1, ... AN. The values A0, A1, ... AN are read by the same thread. elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func); move_tile_window(a_tile_windows, a_dram_tile_window_step); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp index 6d4b6a7758..a1f1cfce50 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp @@ -274,19 +274,19 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5 auto MemoryOpsStep = [&](auto idx) { // Memory read half here. - // Load tile — during value loading, an elementwise function is executed for each - // A0, A1, … AN. The values A0, A1, … AN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each + // A0, A1, ... AN. The values A0, A1, ... AN are read by the same thread. elementwise_As_res = load_tile_with_elementwise(a_copy_dram_window, a_element_func); - // Move each A — the enhanced function move_tile_window is executed, which takes a + // Move each A - the enhanced function move_tile_window is executed, which takes a // tuple as input. move_tile_window(a_copy_dram_window, a_dram_tile_window_step); - // Load tile — during value loading, an elementwise function is executed for each - // B0, B1, … BN. The values B0, B1, … BN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each + // B0, B1, ... BN. The values B0, B1, ... BN are read by the same thread. elementwise_Bs_res = load_tile_with_elementwise(b_copy_dram_window, b_element_func); - // Move each B — the enhanced function move_tile_window is executed, which takes a + // Move each B - the enhanced function move_tile_window is executed, which takes a // tuple as input. move_tile_window(b_copy_dram_window, b_dram_tile_window_step); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 1f87ec6bff..a55caf9615 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -358,19 +358,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // prefetch // global read 0 - // Load tile — during value loading, an elementwise function is executed for each A0, - // A1, … AN. The values A0, A1, … AN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each A0, + // A1, ... AN. The values A0, A1, ... AN are read by the same thread. a_block_tiles.at(I0{}) = load_tile_with_elementwise(a_copy_dram_window, a_element_func); - // Move each A — the enhanced function move_tile_window is executed, which takes a tuple + // Move each A - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(a_copy_dram_window, a_dram_tile_window_step); - // Load tile — during value loading, an elementwise function is executed for each B0, - // B1, … BN. The values B0, B1, … BN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each B0, + // B1, ... BN. The values B0, B1, ... BN are read by the same thread. b_block_tiles.at(I0{}) = load_tile_with_elementwise(b_copy_dram_window, b_element_func); - // Move each B — the enhanced function move_tile_window is executed, which takes a tuple + // Move each B - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(b_copy_dram_window, b_dram_tile_window_step); @@ -670,19 +670,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // prefetch // global read 0 - // Load tile — during value loading, an elementwise function is executed for each A0, - // A1, … AN. The values A0, A1, … AN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each A0, + // A1, ... AN. The values A0, A1, ... AN are read by the same thread. a_block_tiles.at(I0{}) = load_tile_with_elementwise(a_copy_dram_window, a_element_func); - // Move each A — the enhanced function move_tile_window is executed, which takes a tuple + // Move each A - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(a_copy_dram_window, a_dram_tile_window_step); - // Load tile — during value loading, an elementwise function is executed for each B0, - // B1, … BN. The values B0, B1, … BN are read by the same thread. + // Load tile - during value loading, an elementwise function is executed for each B0, + // B1, ... BN. The values B0, B1, ... BN are read by the same thread. b_block_tiles.at(I0{}) = load_tile_with_elementwise(b_copy_dram_window, b_element_func); - // Move each B — the enhanced function move_tile_window is executed, which takes a tuple + // Move each B - the enhanced function move_tile_window is executed, which takes a tuple // as input. move_tile_window(b_copy_dram_window, b_dram_tile_window_step); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index 105ead63dc..8efb7b7ac1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -252,22 +252,22 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1 N -> QScale -> KIterPerQScale static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { static_for<0, NIterPerWarp, 1>{}([&](auto nIter) { // Iterate over quantization groups diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp index 00e50af5d8..b277780d6a 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp @@ -999,7 +999,7 @@ struct QuantGemmKernel // Number of K-dimension quantization groups per block constexpr auto bqk_per_block = TilePartitioner::KPerBlock / BQuantGroupSize::kK; - // The pre-shuffled layout flattens warp_n × + // The pre-shuffled layout flattens warp_n x // bqk_per_block scales per row, Padded up to warp_size // to ensure coalesced memory access. constexpr auto tile_window_width = @@ -1007,7 +1007,7 @@ struct QuantGemmKernel // Adapts based on fine vs coarse quantization granularity: // - Fine-grained (BQuantGroupSize::kN < warp_n): - // Multiple quant groups per warp → fewer rows needed per block. + // Multiple quant groups per warp -> fewer rows needed per block. // height = block_n / warp_per_group // // - Coarse-grained (BQuantGroupSize::kN >= warp_n): @@ -1329,7 +1329,7 @@ struct QuantGemmKernel if constexpr(std::is_same_v) { - // For RowMajor C, M is the row dimension — check M alignment here because + // For RowMajor C, M is the row dimension - check M alignment here because // ALayout=RowMajor does not check M (it only checks K), leaving a gap for // the RowMajorA + RowMajorC combination. if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false && diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp index 48c27945b3..513c106de3 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp @@ -240,7 +240,7 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding // Each warp processes multiple scales: WarpGemm::kN / NPerQ scales per warp. // // Example: NPerQ=8, WarpGemm::kN=16, KPerQ=128, BlockGemmShape::kK=256 - // → 2 scales per warp in N, 2 K-groups per block + // -> 2 scales per warp in N, 2 K-groups per block // N1: Number of K-dimension quantization groups per block, // Each K-group of KPerQ elements shares the same scale. @@ -280,7 +280,7 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding // Some warps share the same scale (KR > 1 creates warp grouping). // // Example: NPerQ=32, WarpGemm::kN=16, NWarps=4 - // → KR=2 (2 warps share same scale), K1=2 (2 unique scale groups) + // -> KR=2 (2 warps share same scale), K1=2 (2 unique scale groups) // KR: Number of warps sharing the same scale // K1: Number of distinct warp groups (unique scales) @@ -318,7 +318,7 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding // All warps share the same scale value for their N-tiles. // // Example: NPerQ=128, WarpGemm::kN=16, NWarps=4 - // → 128 >= 16*4=64, so all 4 warps use the same scale + // -> 128 >= 16*4=64, so all 4 warps use the same scale // N1: K-dimension quantization groups // N0: Minimal (1) since scale is shared across N @@ -356,18 +356,18 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding /// - Each warp processes multiple scales (WarpGemm::kN / NPerQ scales per warp) /// - Distribution includes explicit replication factor (XR = NPerQ) for scale /// broadcast - /// - Example: NPerQ=8, WarpGemm::kN=16, NWarps=4 → 2 scales per warp + /// - Example: NPerQ=8, WarpGemm::kN=16, NWarps=4 -> 2 scales per warp /// /// 2. Medium-grained quantization (WarpGemm::kN <= NPerQ <= WarpGemm::kN * NWarps): /// - Each warp handles exactly one quantization scale /// - Scales are distributed across warps with replication factor XR = NPerQ / /// WarpGemm::kN - /// - Example: NPerQ=64, WarpGemm::kN=16, NWarps=4 → 1 scale per warp, XR=4 + /// - Example: NPerQ=64, WarpGemm::kN=16, NWarps=4 -> 1 scale per warp, XR=4 /// /// 3. Coarse-grained quantization (NPerQ > WarpGemm::kN * NWarps): /// - Quantization group spans multiple warps /// - All warps share the same scale value - /// - Example: NPerQ=128, WarpGemm::kN=16, NWarps=4 → all warps use same scale + /// - Example: NPerQ=128, WarpGemm::kN=16, NWarps=4 -> all warps use same scale /// /// @return A static tile distribution encoding for the BQ scale tensor if constexpr(NPerQ < WarpGemm::kN) diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index e353dc8b54..f818d3b757 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -460,7 +460,7 @@ struct GroupedConvBwdDataKernelArgs long_index_t group_stride_c; // Split-N support fields - initialize to safe defaults - index_t n_splits = 1; // Number of batch splits (e.g., 2 for 128→64×2) + index_t n_splits = 1; // Number of batch splits (e.g., 2 for 128->64x2) index_t n_per_split = 1; // Batches per split (N_ from transformer) index_t original_n = 1; // Original batch size before splitting index_t input_batch_stride = 0; // Stride to next batch in input tensor diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index db9ac89763..babeae77d1 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -370,7 +370,7 @@ struct GroupedConvBwdWeightKernelArgs void* workspace_ptr = nullptr; - // StreamK tile partitioner — stored directly when TilePartitioner_ is a real type, + // StreamK tile partitioner - stored directly when TilePartitioner_ is a real type, // empty struct when void (Split-K path). Constructed with dummy values here; // properly initialized in MakeKernelArgs before device-side use. struct EmptyPartitioner @@ -651,7 +651,7 @@ struct GroupedConvolutionBackwardWeightKernel return false; } } - // Runtime arch check — complements the static_assert in operator(). + // Runtime arch check - complements the static_assert in operator(). // Both are needed: this check runs on the host (where get_compiler_target() // isn't available since HIP's host pass doesn't define __gfx*__ macros), // while the static_assert in operator() catches misuse at device compile time. @@ -1075,7 +1075,7 @@ struct GroupedConvolutionBackwardWeightKernel CK_TILE_DEVICE void RunStreamK(GroupedConvBwdWeightKernelArgsSpecialized& kargs) const { - // Device-side compile-time arch check — complements the runtime check in + // Device-side compile-time arch check - complements the runtime check in // IsSupportedArgument(). Both are needed: the runtime check runs on the host // (where get_compiler_target() isn't available since HIP's host pass doesn't // define __gfx*__ macros), while this catches misuse at device compile time. @@ -1258,7 +1258,7 @@ struct GroupedConvolutionBackwardWeightKernel amd_wave_read_first_lane(partner_start_iter < tile_iter_end); // If the partner of the tile-starter is not in this tile, - // then all partials are accumulated — write final result. + // then all partials are accumulated - write final result. if(tile_started && !partner_in_tile) { auto c_block_window_out = diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 5b1803224f..14c4356aa1 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -411,7 +411,7 @@ struct GroupedConvFwdKernelArgs long_index_t group_stride_c; // Split-N support fields - initialize to safe defaults - index_t n_splits = 1; // Number of batch splits (e.g., 2 for 128→64×2) + index_t n_splits = 1; // Number of batch splits (e.g., 2 for 128->64x2) index_t n_per_split = 1; // Batches per split (N_ from transformer) index_t original_n = 1; // Original batch size before splitting index_t input_batch_stride = 0; // Stride to next batch in input tensor diff --git a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp index 46e3033ef1..8e9d808d4c 100644 --- a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp +++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp @@ -131,7 +131,7 @@ struct TransformConvFwdToGemm // Calculate split-image factors AFTER considering split-N // Returns: should_split flag and optimal split factors for D, H, W dimensions - // Strategy: Hierarchical splitting with priority order D → H → W + // Strategy: Hierarchical splitting with priority order D -> H -> W // Dynamically increases split factors until memory fits below threshold // // NOTE: Layout validation should be done at the invoker level before calling this function @@ -188,7 +188,7 @@ struct TransformConvFwdToGemm return info; } - // Split-image is needed - use hierarchical priority: D → H → W + // Split-image is needed - use hierarchical priority: D -> H -> W info.should_split = true; // Hierarchical splitting strategy: @@ -253,7 +253,7 @@ struct TransformConvFwdToGemm // Use maximum allowed split as best effort (capped at 64 total pieces) info.num_d_pieces = (D_out < 4) ? D_out : 4; // Cap at 4 info.num_h_pieces = (H_out < 4) ? H_out : 4; // Cap at 4 - info.num_w_pieces = (W_out < 4) ? W_out : 4; // Cap at 4 (max 4×4×4=64) + info.num_w_pieces = (W_out < 4) ? W_out : 4; // Cap at 4 (max 4x4x4=64) return info; } diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp index a14f103eb6..f4245aec62 100644 --- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp +++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp @@ -177,7 +177,7 @@ struct BlockReduce2d } // uniform_sequence_gen_t generates sequence of NSize elements filled with Value - // e.g., uniform_sequence_gen_t<2, 1> → {1, 1} and uniform_sequence_gen_t<3, 4> → {4, 4, 4} + // e.g., uniform_sequence_gen_t<2, 1> -> {1, 1} and uniform_sequence_gen_t<3, 4> -> {4, 4, 4} template > diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp index de5cf4e1cc..057ea19cbd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp @@ -167,7 +167,7 @@ using device_grouped_conv_bwd_data_xdl_f16_noshuffle_instances = // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // f16_f16_f32_f16 — noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1) + // f16_f16_f32_f16 - noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1) DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 1>, @@ -333,9 +333,9 @@ using device_grouped_conv_bwd_data_xdl_bf16_instances = std::tuple< // clang-format on >; -// bf16_bf16_f32_bf16 — noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1) +// bf16_bf16_f32_bf16 - noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1) // Same tile shapes as bf16_instances but with ScalarPerVector=1, enabling the no-shuffle fast path -// (VGPR → Global direct write, 0 LDS barriers) instead of CShuffle (VGPR → LDS → Global, 8 +// (VGPR -> Global direct write, 0 LDS barriers) instead of CShuffle (VGPR -> LDS -> Global, 8 // barriers). template , S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 1>, DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout, DsLayout, ELayout, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 8>, 1>, @@ -552,7 +552,7 @@ using device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances = // clang-format on >; -// bf16 — BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl +// bf16 - BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl // instances. The key difference from bf16_instances: BBlockTransfer uses S<4, BlockSize/4, 1> // thread cluster and S<2, 0, 1> arrange order, which gives full thread utilization for B-matrix // loads. These are optimal when opt3 flat descriptor path is active (G=1, 2D convolutions). @@ -584,7 +584,7 @@ using device_grouped_conv_bwd_data_xdl_bf16_nongrouped_match_instances = std::tu // clang-format on >; -// f16 — BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl +// f16 - BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl // instances. template ; -// f32 — BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl +// f32 - BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl // instances. F32 uses K1=4, KPerBlock=16, and smaller scalar-per-vector values. template , S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 32, 1, 8>, 2>, - // M=64 N=64: 2x2 waves, 16 AccVGPRs/wave — smallest tile, targets Group 3 shapes + // M=64 N=64: 2x2 waves, 16 AccVGPRs/wave - smallest tile, targets Group 3 shapes DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 4, false, 1, 1, S<1, 32, 1, 8>, 2>, - // M=128 N=64: 2x2 waves, 32 AccVGPRs/wave — asymmetric M>N + // M=128 N=64: 2x2 waves, 32 AccVGPRs/wave - asymmetric M>N DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 4, false, 1, 1, S<1, 32, 1, 8>, 2>, - // M=64 N=128: 2x2 waves, 32 AccVGPRs/wave — asymmetric N>M + // M=64 N=128: 2x2 waves, 32 AccVGPRs/wave - asymmetric N>M DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3, S<2, 0, 1>, S<1, 0, 2>, 1, 2, 4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>, 1, 4, 4, false, 1, 1, S<1, 32, 1, 8>, 2> // clang-format on >; @@ -61,7 +61,7 @@ using device_grouped_conv_bwd_weight_wavelet_xdl_c_shuffle_f16_instances = std:: // 2-way (4,2) wave-specialized instances for F16. // TileLoad=256 (4 load waves) + TileMath=128 (2 math waves) = 384 threads (6 waves). // Same load cluster as (4,4): S<4, 32, 2>=256. -// Math waves halved → each wave handles more MXdl/NXdl work → more AccVGPRs. +// Math waves halved -> each wave handles more MXdl/NXdl work -> more AccVGPRs. template ; -// BF16 wavelet instances — same tile configs as F16, with BF16 in/out and F32 compute. +// BF16 wavelet instances - same tile configs as F16, with BF16 in/out and F32 compute. template ; -// BF16 (4,2) wavelet instances — same tile configs as F16 (4,2). +// BF16 (4,2) wavelet instances - same tile configs as F16 (4,2). template {}); - // 3. Default — noshuffle epilogue + // 3. Default - noshuffle epilogue add_device_operation_instances( instances, device_grouped_conv_bwd_data_xdl_bf16_noshuffle_instances<2, @@ -50,7 +50,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( Empty_Tuple, NHWGC, ConvBwdDataDefault>{}); - // 4. Filter1x1Stride1Pad0 — noshuffle epilogue + // 4. Filter1x1Stride1Pad0 - noshuffle epilogue add_device_operation_instances(instances, device_grouped_conv_bwd_data_xdl_bf16_noshuffle_instances< 2, @@ -59,7 +59,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( Empty_Tuple, NHWGC, ConvBwdDataFilter1x1Stride1Pad0>{}); - // 5. Default — nongrouped_match instances + // 5. Default - nongrouped_match instances add_device_operation_instances( instances, device_grouped_conv_bwd_data_xdl_bf16_nongrouped_match_instances<2, @@ -68,7 +68,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( Empty_Tuple, NHWGC, ConvBwdDataDefault>{}); - // 6. Filter1x1Stride1Pad0 — nongrouped_match instances + // 6. Filter1x1Stride1Pad0 - nongrouped_match instances add_device_operation_instances(instances, device_grouped_conv_bwd_data_xdl_bf16_nongrouped_match_instances< 2, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp index 3bbd4a37e5..dfcfac2bfa 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp @@ -41,7 +41,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( Empty_Tuple, NHWGC, ConvBwdDataFilter1x1Stride1Pad0>{}); - // 3. Default — noshuffle epilogue + // 3. Default - noshuffle epilogue add_device_operation_instances( instances, device_grouped_conv_bwd_data_xdl_f16_noshuffle_instances<2, @@ -50,7 +50,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( Empty_Tuple, NHWGC, ConvBwdDataDefault>{}); - // 4. Filter1x1Stride1Pad0 — noshuffle epilogue + // 4. Filter1x1Stride1Pad0 - noshuffle epilogue add_device_operation_instances(instances, device_grouped_conv_bwd_data_xdl_f16_noshuffle_instances< 2, @@ -59,7 +59,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( Empty_Tuple, NHWGC, ConvBwdDataFilter1x1Stride1Pad0>{}); - // 5. Default — nongrouped_match instances + // 5. Default - nongrouped_match instances add_device_operation_instances( instances, device_grouped_conv_bwd_data_xdl_f16_nongrouped_match_instances<2, @@ -68,7 +68,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( Empty_Tuple, NHWGC, ConvBwdDataDefault>{}); - // 6. Filter1x1Stride1Pad0 — nongrouped_match instances + // 6. Filter1x1Stride1Pad0 - nongrouped_match instances add_device_operation_instances(instances, device_grouped_conv_bwd_data_xdl_f16_nongrouped_match_instances< 2, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp index 344c35c5ca..011a9d5cd1 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp @@ -41,7 +41,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( Empty_Tuple, NHWGC, ConvBwdDataFilter1x1Stride1Pad0>{}); - // 3. Default — noshuffle epilogue + // 3. Default - noshuffle epilogue add_device_operation_instances( instances, device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances<2, @@ -50,7 +50,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( Empty_Tuple, NHWGC, ConvBwdDataDefault>{}); - // 4. Filter1x1Stride1Pad0 — noshuffle epilogue + // 4. Filter1x1Stride1Pad0 - noshuffle epilogue add_device_operation_instances(instances, device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances< 2, @@ -59,7 +59,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( Empty_Tuple, NHWGC, ConvBwdDataFilter1x1Stride1Pad0>{}); - // 5. Default — nongrouped_match instances + // 5. Default - nongrouped_match instances add_device_operation_instances( instances, device_grouped_conv_bwd_data_xdl_f32_nongrouped_match_instances<2, @@ -68,7 +68,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( Empty_Tuple, NHWGC, ConvBwdDataDefault>{}); - // 6. Filter1x1Stride1Pad0 — nongrouped_match instances + // 6. Filter1x1Stride1Pad0 - nongrouped_match instances add_device_operation_instances(instances, device_grouped_conv_bwd_data_xdl_f32_nongrouped_match_instances< 2, diff --git a/rocm_ck/include/rocm_ck/args.hpp b/rocm_ck/include/rocm_ck/args.hpp index 2da845e8ea..28f2e07448 100644 --- a/rocm_ck/include/rocm_ck/args.hpp +++ b/rocm_ck/include/rocm_ck/args.hpp @@ -1,29 +1,29 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: abi — shared between host and device. Trivially copyable, no CK deps. +// Role: abi - shared between host and device. Trivially copyable, no CK deps. // // Args is a hardware buffer for passing data between CPU and GPU during a -// kernel call. It carries raw pointers, shapes, strides, and scalar values — +// kernel call. It carries raw pointers, shapes, strides, and scalar values - // nothing more. All semantic meaning (which tensor is "A", which scalar is // "alpha", input vs output) lives in the Signature, not here. // // This is deliberately one type for all operations. Per-operation structs -// (GemmArgs, FmhaArgs, ...) would make the dispatcher a closed set — adding +// (GemmArgs, FmhaArgs, ...) would make the dispatcher a closed set - adding // an operation means adding a type, updating launch code, and changing the // kpack format. A generic buffer keeps the dispatcher open. // // Capacity limits (kMaxRank=6, kMaxTensors=16, kMaxScalars=16) are sized to // the most demanding current operation (FMHA backward: ~12 tensors, ~12 // scalars, rank-6 for grouped 3D conv). If a future operation exceeds these, -// bump the constants — the layout is not versioned, and the 4KB HSA kernarg +// bump the constants - the layout is not versioned, and the 4KB HSA kernarg // budget has room. Don't over-provision speculatively. // // Key constraints: -// - Trivially copyable, standard layout — required for HSA kernarg passing. -// - Fixed-capacity arrays, no heap — sizeof fits the 4KB kernarg budget. -// - const void* for all tensor pointers — the entry kernel casts to the +// - Trivially copyable, standard layout - required for HSA kernarg passing. +// - Fixed-capacity arrays, no heap - sizeof fits the 4KB kernarg budget. +// - const void* for all tensor pointers - the entry kernel casts to the // concrete type. Input vs output semantics live in the Signature. -// - No runtime type tags on scalars — the Signature declares types at +// - No runtime type tags on scalars - the Signature declares types at // compile time. The entry kernel reads the correct union member. // - Slot ordering is the invariant: tensors[i] maps to Signature::tensors[i]. @@ -44,11 +44,11 @@ constexpr int kMaxScalars = 16; // FMHA with masking+dropout needs ~12 struct TensorArg { const void* ptr; // 8 bytes (offset 0) - std::array lengths; // 24 bytes (offset 8) — int32 - std::array strides; // 48 bytes (offset 32) — int64 + std::array lengths; // 24 bytes (offset 8) - int32 + std::array strides; // 48 bytes (offset 32) - int64 }; -// FP16/BF16/FP8 scalars use f32 — scalar precision >= tensor precision. +// FP16/BF16/FP8 scalars use f32 - scalar precision >= tensor precision. union ScalarValue { float f32; diff --git a/rocm_ck/include/rocm_ck/datatype.hpp b/rocm_ck/include/rocm_ck/datatype.hpp index 43da2d14a9..e59b71a72f 100644 --- a/rocm_ck/include/rocm_ck/datatype.hpp +++ b/rocm_ck/include/rocm_ck/datatype.hpp @@ -1,6 +1,6 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — DataType enum, constexpr queries. No runtime, no CK deps. +// Role: types - DataType enum, constexpr queries. No runtime, no CK deps. #pragma once @@ -13,19 +13,19 @@ namespace rocm_ck { // FP8 = e4m3, BF8 = e5m2 (CK convention). enum class DataType : uint8_t { - // Floating point — standard widths + // Floating point - standard widths FP64, FP32, FP16, BF16, - // FP8 variants — see note below + // FP8 variants - see note below FP8_FNUZ, BF8_FNUZ, FP8_OCP, BF8_OCP, - // Integer types — signed and unsigned at each width + // Integer types - signed and unsigned at each width I4, I8, I16, @@ -37,12 +37,12 @@ enum class DataType : uint8_t U64 }; -// FP8 variants — FNUZ and OCP are different number formats, not just HW hints. +// FP8 variants - FNUZ and OCP are different number formats, not just HW hints. // FNUZ: gfx942 native (higher bias, no Inf, max 240) // OCP: gfx950 native (OCP standard, has Inf, max 448) // Non-native formats run in software (slower) and produce different numerical // results. Choose based on target GPU and model training format. -// We keep FNUZ and OCP explicit rather than a generic FP8 — the numerical +// We keep FNUZ and OCP explicit rather than a generic FP8 - the numerical // differences matter for compatibility and schema-driven test coverage. // TODO - We may introduce a generic FP8/BF8 that resolves to the hardware-native type. // See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/fp8_numbers.html diff --git a/rocm_ck/include/rocm_ck/fixed_string.hpp b/rocm_ck/include/rocm_ck/fixed_string.hpp index 105c354c23..4ca0678e35 100644 --- a/rocm_ck/include/rocm_ck/fixed_string.hpp +++ b/rocm_ck/include/rocm_ck/fixed_string.hpp @@ -1,10 +1,10 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — FixedString. No runtime, no CK deps. +// Role: types - FixedString. No runtime, no CK deps. // // A compile-time string for use in template parameters (NTTPs). // -// C++20 requires template parameters to be "structural types" — loosely, types +// C++20 requires template parameters to be "structural types" - loosely, types // that are trivially comparable and don't contain pointers or references. // std::string and std::string_view fail this requirement (internal pointer). // @@ -16,7 +16,7 @@ // When to use FixedString vs std::string_view: // - FixedString: the type must be structural (template parameters). // - string_view: consteval-only types that never become template parameters -// (e.g., ResolvedTensor — see resolved_tensor.hpp). +// (e.g., ResolvedTensor - see resolved_tensor.hpp). // // The capacity is a template parameter so each use site documents its limit: // FixedString<16> name("bias"); // tensor names: 15 chars max diff --git a/rocm_ck/include/rocm_ck/gpu_target.hpp b/rocm_ck/include/rocm_ck/gpu_target.hpp index 0a0ed83803..751d4f136a 100644 --- a/rocm_ck/include/rocm_ck/gpu_target.hpp +++ b/rocm_ck/include/rocm_ck/gpu_target.hpp @@ -1,6 +1,6 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — GpuTarget enum. No runtime, no CK deps. +// Role: types - GpuTarget enum. No runtime, no CK deps. #pragma once diff --git a/rocm_ck/include/rocm_ck/index_t.hpp b/rocm_ck/include/rocm_ck/index_t.hpp index 70886d7f5e..1509a355e9 100644 --- a/rocm_ck/include/rocm_ck/index_t.hpp +++ b/rocm_ck/include/rocm_ck/index_t.hpp @@ -1,6 +1,6 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — index_t, long_index_t. No runtime, no CK deps. +// Role: types - index_t, long_index_t. No runtime, no CK deps. #pragma once diff --git a/rocm_ck/include/rocm_ck/layout.hpp b/rocm_ck/include/rocm_ck/layout.hpp index f740038e67..d6b18e18e3 100644 --- a/rocm_ck/include/rocm_ck/layout.hpp +++ b/rocm_ck/include/rocm_ck/layout.hpp @@ -1,6 +1,6 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — Layout enum, constexpr/consteval helpers. No runtime, no CK deps. +// Role: types - Layout enum, constexpr/consteval helpers. No runtime, no CK deps. #pragma once @@ -12,7 +12,7 @@ namespace rocm_ck { -// Auto is a resolve-time placeholder — Signature::resolve() replaces it with +// Auto is a resolve-time placeholder - Signature::resolve() replaces it with // the concrete layout from the operator slot. It never reaches the kernel. enum class Layout : uint8_t { diff --git a/rocm_ck/include/rocm_ck/ops.hpp b/rocm_ck/include/rocm_ck/ops.hpp index ca49cc75d9..a469ab18f6 100644 --- a/rocm_ck/include/rocm_ck/ops.hpp +++ b/rocm_ck/include/rocm_ck/ops.hpp @@ -1,6 +1,6 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: meta — operator structs, Op variant. No runtime, no CK deps. +// Role: meta - operator structs, Op variant. No runtime, no CK deps. // // Operators are the edges of a Signature's compute graph. Each operator // names its tensor slots as string_view labels (e.g., "A", "bias", "query") @@ -8,13 +8,13 @@ // owns the tensor definitions; operators just reference them by name. // // This separation means operators are reusable across different tensor -// configurations — a GemmOp doesn't care whether its "lhs" is FP16 or BF16, +// configurations - a GemmOp doesn't care whether its "lhs" is FP16 or BF16, // Row or Col. That's resolved later when the Signature is validated. // // The Op variant is the closed set of supported operator types. Adding a // new operator means adding a struct here and a variant alternative. -// Fused operations (like FMHA) are single operators — not chains of -// elementwise + GEMM — because CK Tile implements them as monolithic kernels. +// Fused operations (like FMHA) are single operators - not chains of +// elementwise + GEMM - because CK Tile implements them as monolithic kernels. #pragma once @@ -26,7 +26,7 @@ namespace rocm_ck { // Matrix multiplication: out = lhs x rhs. -// acc_dtype is the accumulation type — defaults to FP32, the universal safe +// acc_dtype is the accumulation type - defaults to FP32, the universal safe // choice across all input types. struct GemmOp { diff --git a/rocm_ck/include/rocm_ck/physical_tensor.hpp b/rocm_ck/include/rocm_ck/physical_tensor.hpp index fcec91a51c..8dca7627b1 100644 --- a/rocm_ck/include/rocm_ck/physical_tensor.hpp +++ b/rocm_ck/include/rocm_ck/physical_tensor.hpp @@ -1,9 +1,9 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — PhysicalTensor. No runtime, no CK deps. +// Role: types - PhysicalTensor. No runtime, no CK deps. // // A PhysicalTensor maps a named tensor from the Signature graph to a slot -// in the generic Args buffer. Not every tensor in a compute graph is physical — +// in the generic Args buffer. Not every tensor in a compute graph is physical - // intermediate values (e.g., the S matrix in FMHA = Q*K^T) live only in // registers and never appear in device memory. The physical tensor table // describes exactly what the host needs to pack into Args. diff --git a/rocm_ck/include/rocm_ck/resolved_tensor.hpp b/rocm_ck/include/rocm_ck/resolved_tensor.hpp index 4a2ce00672..7390d4b795 100644 --- a/rocm_ck/include/rocm_ck/resolved_tensor.hpp +++ b/rocm_ck/include/rocm_ck/resolved_tensor.hpp @@ -1,30 +1,30 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Role: types — ResolvedTensor, ResolvedQuantization. No runtime, no CK deps. +// Role: types - ResolvedTensor, ResolvedQuantization. No runtime, no CK deps. // // ResolvedTensor is the intermediate result of consteval resolution. It exists -// only at compile time — produced by Signature::resolve() and consumed by +// only at compile time - produced by Signature::resolve() and consumed by // makeSpec(), both consteval. It never appears in compiled code. // // In the user-facing Signature, tensors can have Layout::Auto (inherit from // operator slot) and omit fields with sensible defaults. After resolution, // every field is concrete. The base fields (name, dtype, rank, layout) -// describe a plain dense tensor — enough for most operands (GEMM inputs, +// describe a plain dense tensor - enough for most operands (GEMM inputs, // outputs, bias vectors). Some tensors carry additional metadata beyond the // dense description. Block-quantized tensors (e.g., INT4 weights) need a // scale tensor and group size. We use optional sub-structs for these // extensions, keeping the common case clean without bloating every instance. // // Why std::string_view instead of FixedString? -// ResolvedTensor is consteval-only — produced and consumed entirely at +// ResolvedTensor is consteval-only - produced and consumed entirely at // compile time. No library loading, no runtime lifetime concerns. The // string_views point to string literals from user code (e.g., -// GemmOp{.lhs = "A"}), which have static storage duration — no dangling. +// GemmOp{.lhs = "A"}), which have static storage duration - no dangling. // FixedString is required for PhysicalTensor because it IS used as a // template parameter (NTTP), which requires structural types (no pointers). // ResolvedTensor is never a template parameter. // -// Plain aggregate — no methods, no validation. Resolution validates; this +// Plain aggregate - no methods, no validation. Resolution validates; this // type just carries the result to makeSpec(). #pragma once diff --git a/rocm_ck/tests/unit/unit_args.cpp b/rocm_ck/tests/unit/unit_args.cpp index c57b93dce2..766281a1bf 100644 --- a/rocm_ck/tests/unit/unit_args.cpp +++ b/rocm_ck/tests/unit/unit_args.cpp @@ -125,7 +125,7 @@ TEST(ScalarValue, StoresAndRetrievesUInt32) } // ============================================================================ -// Args field coverage — batch_strides and workspace_ptr +// Args field coverage - batch_strides and workspace_ptr // ============================================================================ TEST(Args, BatchStridesFieldExists) diff --git a/script/dependency-parser/src/selective_test_filter.py b/script/dependency-parser/src/selective_test_filter.py index 551ed06eb0..427abcffbd 100644 --- a/script/dependency-parser/src/selective_test_filter.py +++ b/script/dependency-parser/src/selective_test_filter.py @@ -252,13 +252,20 @@ def main(): # Extract basenames for regex (e.g., bin/test_gemm -> test_gemm) test_names = [os.path.basename(t) for t in tests] + # Anchor each name with ^...$ and escape regex metacharacters so that + # `ctest -R` does exact-name matching rather than substring matching + # (otherwise e.g. 'test_grouped_convnd_bwd_weight' would substring-match + # 'test_grouped_convnd_bwd_weight_bilinear' and try to run an + # executable that was never built). + anchored = [f"^{re.escape(n)}$" for n in test_names] + # Split into chunks - for i in range(0, len(test_names), chunk_size): - chunk = test_names[i:i + chunk_size] + for i in range(0, len(anchored), chunk_size): + chunk = anchored[i:i + chunk_size] regex_chunks.append("|".join(chunk)) # Keep single regex for backward compatibility (but may be too long) - regex = "|".join(test_names) + regex = "|".join(anchored) else: regex = "" diff --git a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp index a9adeba7d7..6a984d6332 100644 --- a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp +++ b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp @@ -63,17 +63,17 @@ void ScaleMfmaGfx950Specialization_impl() TEST(ScaleMMATrait, ScaleMfmaGfx950Specialization) { - // Test fp8 → fp32 scale MFMA for GFX950 (16x16x128) + // Test fp8 -> fp32 scale MFMA for GFX950 (16x16x128) ScaleMfmaGfx950Specialization_impl(); - // Test bf8 → fp32 scale MFMA for GFX950 (16x16x128) + // Test bf8 -> fp32 scale MFMA for GFX950 (16x16x128) ScaleMfmaGfx950Specialization_impl(); - // Test fp4 → fp32 scale MFMA for GFX950 (16x16x128) + // Test fp4 -> fp32 scale MFMA for GFX950 (16x16x128) ScaleMfmaGfx950Specialization_impl(); - // Test fp8 → fp32 scale MFMA for GFX950 (32x32x64) + // Test fp8 -> fp32 scale MFMA for GFX950 (32x32x64) ScaleMfmaGfx950Specialization_impl(); - // Test bf8 → fp32 scale MFMA for GFX950 (32x32x64) + // Test bf8 -> fp32 scale MFMA for GFX950 (32x32x64) ScaleMfmaGfx950Specialization_impl(); - // Test fp4 → fp32 scale MFMA for GFX950 (32x32x64) + // Test fp4 -> fp32 scale MFMA for GFX950 (32x32x64) ScaleMfmaGfx950Specialization_impl(); std::cout << "GFX950 scale MFMA specialization is correct" << std::endl; diff --git a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp index be631f0659..86e363aabc 100644 --- a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp +++ b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp @@ -30,7 +30,7 @@ using CompilerTargetGfx950 = decltype(make_amdgcn_gfx9_target fp32 sparse MFMA for GFX950 (16x16x32) using TestSparseMfma16x16 = amdgcn_mma::WaveTileK><<<1, waveSize>>>(a, b, c, out); }; // Initialize A with 2:4 structured sparsity pattern: {1, 0, 1, 0, ...} - // This ensures the sparse compression transform is actually exercised — + // This ensures the sparse compression transform is actually exercised - // a no-op or broken compression would pass zeros through, causing incorrect results. const std::function sparseAInit = [](size_t i) -> fp16_t { return (i % 2 == 0) ? type_convert(1) : type_convert(0); @@ -317,7 +317,7 @@ void sparse_transform_verify(const std::vector& input, } // Helper: build expected index from a per-group 4-bit pattern, repeated for all groups. -// Each group of 4 input elements contributes 2 compressed elements → 2 x 2-bit index fields = 4 +// Each group of 4 input elements contributes 2 compressed elements -> 2 x 2-bit index fields = 4 // bits. static int32_t build_repeated_group_idx(int num_groups, int32_t group_bits_4) { @@ -337,8 +337,8 @@ static int32_t build_alternating_group_idx(int num_groups, int32_t even_bits_4, } // 1. Basic correctness: valid divisible sizes -// Input pattern: {1, 0, 3, 0, 5, 0, 7, 0, ...} → non-zeros at slots 0,2 -// Group idx pattern: field0=0b00 (slot 0), field1=0b10 (slot 2) → 0b1000 +// Input pattern: {1, 0, 3, 0, 5, 0, 7, 0, ...} -> non-zeros at slots 0,2 +// Group idx pattern: field0=0b00 (slot 0), field1=0b10 (slot 2) -> 0b1000 template void sparse_transform_test_case() { @@ -392,7 +392,7 @@ TEST(SparseTransformsTest, AllZeroInput) // nonzero_elems initializes to {a_vec[slot2]=0, a_vec[slot3]=V}. // Only j=3 triggers: nonzero_elems[0]=V, field0=0b11, pos becomes 1. // nonzero_elems[1] keeps its init V. Output: {V, V}. -// Group idx pattern: field0=0b11, field1=0b10 (default) → 0b1011 +// Group idx pattern: field0=0b11, field1=0b10 (default) -> 0b1011 template void sparse_transform_single_nonzero() { @@ -421,7 +421,7 @@ TEST(SparseTransformsTest, SingleNonZeroPerGroup) // Non-zeros at slots 1 and 3 in each group. // Input: {0, a, 0, b, ...}. Output: {a, b, ...}. -// Group idx pattern: field0=0b01 (slot 1), field1=0b11 (slot 3) → 0b1101 +// Group idx pattern: field0=0b01 (slot 1), field1=0b11 (slot 3) -> 0b1101 template void sparse_transform_slots_1_and_3() { @@ -452,7 +452,7 @@ TEST(SparseTransformsTest, NonZerosAtSlots1And3) // Non-zeros at slots 0 and 3 in each group (non-adjacent). // Input: {a, 0, 0, b, ...}. Output: {a, b, ...}. -// Group idx pattern: field0=0b00 (slot 0), field1=0b11 (slot 3) → 0b1100 +// Group idx pattern: field0=0b00 (slot 0), field1=0b11 (slot 3) -> 0b1100 template void sparse_transform_slots_0_and_3() { @@ -482,8 +482,8 @@ TEST(SparseTransformsTest, NonZerosAtSlots0And3) } // Mixed sparsity pattern: even groups have non-zeros at slots 0,2; odd groups at slots 1,3. -// Even group idx: field0=0b00, field1=0b10 → 0b1000 -// Odd group idx: field0=0b01, field1=0b11 → 0b1101 +// Even group idx: field0=0b00, field1=0b10 -> 0b1000 +// Odd group idx: field0=0b01, field1=0b11 -> 0b1101 template void sparse_transform_mixed() { diff --git a/test/ck_tile/core/container/unit_sequence.cpp b/test/ck_tile/core/container/unit_sequence.cpp index 2ce0d0f7e8..f0c97e1abc 100644 --- a/test/ck_tile/core/container/unit_sequence.cpp +++ b/test/ck_tile/core/container/unit_sequence.cpp @@ -399,7 +399,7 @@ TEST(SequenceSort, SortedMapWithDuplicates) using ExpectedSorted = sequence<1, 1, 3, 3>; EXPECT_TRUE((std::is_same::value)); // Verify round-trip: original[map[i]] == sorted[i] for all i - // (don't assert specific index order for duplicates — sort stability may vary) + // (don't assert specific index order for duplicates - sort stability may vary) EXPECT_EQ(Seq::at(Map::at(0)), Sorted::at(0)); EXPECT_EQ(Seq::at(Map::at(1)), Sorted::at(1)); EXPECT_EQ(Seq::at(Map::at(2)), Sorted::at(2)); diff --git a/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp b/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp index c4adb3e2da..1f738f9379 100644 --- a/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp +++ b/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp @@ -12,7 +12,7 @@ using FP6 = ck_tile::pk_fp6x16_t; using FP8 = ck_tile::fp8_t; using FP16 = ck_tile::fp16_t; -// Concrete test fixture — inherits all logic from TestMXFlatmmBase. +// Concrete test fixture - inherits all logic from TestMXFlatmmBase. // Tuple layout: template class TestMXFlatmm : public TestMXFlatmmBase diff --git a/test/ck_tile/fmha/test_fmha_fwd.cpp b/test/ck_tile/fmha/test_fmha_fwd.cpp index daf239fea9..6ae33da30f 100644 --- a/test/ck_tile/fmha/test_fmha_fwd.cpp +++ b/test/ck_tile/fmha/test_fmha_fwd.cpp @@ -605,7 +605,7 @@ TEST_P(Dropout, DataTypeConfig) if constexpr(std::is_same_v) { if(hdim_q > 128 && mode == mode_enum::batch) - GTEST_SKIP() << "Skipped: fp16 dropout d256 batch — compiler bug (ROCm >= 7.12)"; + GTEST_SKIP() << "Skipped: fp16 dropout d256 batch - compiler bug (ROCm >= 7.12)"; } #endif diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt index 21d34f7b34..676f01e355 100644 --- a/test/ck_tile/gemm_block_scale/CMakeLists.txt +++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt @@ -14,21 +14,11 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12") # AQuant tests - split into 10 files # AQuant Memory Pipeline tests - add_gtest_executable(test_tile_gemm_quant_aquant_mem_prefill_interwave - test_gemm_quant_aquant_mem_prefill_interwave.cpp - ) - target_compile_options(test_tile_gemm_quant_aquant_mem_prefill_interwave PRIVATE ${TEST_GEMM_COMPILE_OPTIONS}) - add_gtest_executable(test_tile_gemm_quant_aquant_mem_decode_intrawave test_gemm_quant_aquant_mem_decode_intrawave.cpp ) target_compile_options(test_tile_gemm_quant_aquant_mem_decode_intrawave PRIVATE ${TEST_GEMM_COMPILE_OPTIONS}) - add_gtest_executable(test_tile_gemm_quant_aquant_mem_decode_interwave - test_gemm_quant_aquant_mem_decode_interwave.cpp - ) - target_compile_options(test_tile_gemm_quant_aquant_mem_decode_interwave PRIVATE ${TEST_GEMM_COMPILE_OPTIONS}) - add_gtest_executable(test_tile_gemm_quant_aquant_base_rcr test_gemm_quant_aquant_base_rcr.cpp ) @@ -267,9 +257,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12") # Collect all test targets for umbrella label set(CK_TILE_GEMM_BLOCK_SCALE_TEST_TARGETS # AQuant tests - test_tile_gemm_quant_aquant_mem_prefill_interwave test_tile_gemm_quant_aquant_mem_decode_intrawave - test_tile_gemm_quant_aquant_mem_decode_interwave test_tile_gemm_quant_aquant_base_rcr test_tile_gemm_quant_aquant_base_rrr_crr test_tile_gemm_quant_aquant_base_ccr @@ -324,9 +312,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12") # Target to build only AQuant memory pipeline tests add_custom_target(test_tile_gemm_aquant_mem_all) add_dependencies(test_tile_gemm_aquant_mem_all - test_tile_gemm_quant_aquant_mem_prefill_interwave test_tile_gemm_quant_aquant_mem_decode_intrawave - test_tile_gemm_quant_aquant_mem_decode_interwave ) # Umbrella target to build all gemm quant tests diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp deleted file mode 100644 index 1ef57716c9..0000000000 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_quant_common.hpp" - -using AQuantGrouped = std::integral_constant; - -// Type combinations for AQuant tests - Mem Decode Interwave Configuration -// Tuple format: -// clang-format off -using AQuantMemDecodeInterwaveTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple ->; -// clang-format on - -// Test suite for AQuant Mem Decode Interwave -TYPED_TEST_SUITE(TestCkTileGemmAQuantMem, AQuantMemDecodeInterwaveTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmAQuantMem, AQuantMemDecodeInterwaveTest) -{ - this->run_test_with_validation(16, 64, 512); -} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp deleted file mode 100644 index fde3ec977b..0000000000 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include "test_gemm_quant_common.hpp" - -using AQuantGrouped = std::integral_constant; - -// Type combinations for AQuant tests - Mem Prefill Interwave Configuration -// Tuple format: -// clang-format off -using AQuantMemPrefillInterwaveTypes = ::testing::Types< - std::tuple, - std::tuple, - std::tuple, - std::tuple ->; -// clang-format on - -// Test suite for AQuant Mem Prefill Interwave -TYPED_TEST_SUITE(TestCkTileGemmAQuantMem, AQuantMemPrefillInterwaveTypes); - -// AQuant tests -TYPED_TEST(TestCkTileGemmAQuantMem, AQuantMemPrefillInterwaveTest) -{ - this->run_test_with_validation(1024, 1024, 1024); -} diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp index 7ab7d22dc7..04f6b3d483 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp @@ -23,25 +23,25 @@ TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantSplitKDecodeTypes); // BQuant split-K tests TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK2Test) { - // K=1024 for split_k=2: 1024/2=512=4×128 ✓ + // K=1024 for split_k=2: 1024/2=512=4x128 this->run_test_with_validation(32, 128, 1024, 2); } TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK3Test) { - // K=3072 for split_k=3: 3072/3=1024=8×128 ✓ + // K=3072 for split_k=3: 3072/3=1024=8x128 this->run_test_with_validation(32, 128, 3072, 3); } TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK4Test) { - // K=2048 for split_k=4: 2048/4=512=4×128 ✓ + // K=2048 for split_k=4: 2048/4=512=4x128 this->run_test_with_validation(32, 128, 2048, 4); } TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK5Test) { - // K=2560 for split_k=5: 2560/5=512=4×128 ✓ + // K=2560 for split_k=5: 2560/5=512=4x128 // Also K must be divisible by K_Tile(256)*split_k(5)=1280 this->run_test_with_validation(32, 128, 2560, 5); } diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp index c076f89e59..5de4e64639 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp @@ -23,28 +23,28 @@ TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantSplitKPrefillTypes); // BQuant split-K tests TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK2Test) { - // K=1024 for split_k=2: 1024/2=512=4×128 ✓ + // K=1024 for split_k=2: 1024/2=512=4x128 // K must be divisible by K_Tile(128)*split_k(2)=256 this->run_test_with_validation(128, 128, 1024, 2); } TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK3Test) { - // K=3072 for split_k=3: 3072/3=1024=8×128 ✓ + // K=3072 for split_k=3: 3072/3=1024=8x128 // K must be divisible by K_Tile(128)*split_k(3)=384 this->run_test_with_validation(128, 128, 3072, 3); } TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK4Test) { - // K=2048 for split_k=4: 2048/4=512=4×128 ✓ + // K=2048 for split_k=4: 2048/4=512=4x128 // K must be divisible by K_Tile(128)*split_k(4)=512 this->run_test_with_validation(128, 128, 2048, 4); } TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK5Test) { - // K=1920 for split_k=5: 1920/5=384=3×128 ✓ + // K=1920 for split_k=5: 1920/5=384=3x128 // K must be divisible by K_Tile(128)*split_k(5)=640 this->run_test_with_validation(128, 128, 1920, 5); } diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp index abe84bac2e..42d850c6bb 100644 --- a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp +++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp @@ -68,28 +68,20 @@ struct GemmConfigPrefillIntrawave : public GemmConfigBase static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave; }; -struct GemmConfigPrefillInterwave : public GemmConfigBase -{ - static constexpr ck_tile::index_t M_Tile = 128; - static constexpr ck_tile::index_t N_Tile = 128; - static constexpr ck_tile::index_t K_Tile = 128; - static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave; -}; - struct GemmConfigDecodeIntrawave : public GemmConfigBase { static constexpr ck_tile::index_t M_Tile = 16; static constexpr ck_tile::index_t N_Tile = 64; static constexpr ck_tile::index_t K_Tile = 256; - static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave; -}; - -struct GemmConfigDecodeInterwave : public GemmConfigBase -{ - static constexpr ck_tile::index_t M_Tile = 16; - static constexpr ck_tile::index_t N_Tile = 64; - static constexpr ck_tile::index_t K_Tile = 256; - static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave; + // Workaround for ROCm 7.13 compiler codegen regression on gfx1201 (RDNA4). + // AQuantGemmPipelineAgBgCrMem always uses the Intrawave pipeline implementation + // regardless of this value, but the Scheduler enum changes the Problem type identity, + // causing a different template instantiation. The Intrawave instantiation triggers + // incorrect ISA generation (wrong global instruction scheduling in the hot loop), + // producing ~3% wrong values for FP8/BF8 AQuant GEMM with K > K_Tile. + // Setting Interwave creates a different instantiation that gets correct codegen. + // Revert to Intrawave once the compiler is fixed. + static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave; }; struct GemmConfigMx : public GemmConfigBase diff --git a/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp b/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp index 8491e7abe2..0e2a37c892 100644 --- a/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp +++ b/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp @@ -56,11 +56,11 @@ class TestCkTileMxGemmPipelineWmmaBase : public TestCkTileMxGemmPipeline always valid if constexpr(a_scale_e8m0 && b_scale_e8m0) return true; - // Both non-E8M0 → must match (both are F4 by rule 1) + // Both non-E8M0 -> must match (both are F4 by rule 1) if constexpr(!a_scale_e8m0 && !b_scale_e8m0) return std::is_same_v; diff --git a/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp b/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp index 0e59d22f2c..60cc947253 100644 --- a/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp +++ b/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp @@ -458,8 +458,8 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_MultiGroup) 2, 4, 128, 128, 3, 3, 16, 16, 4, 1))); } -// Stride > 1 — shrinks Ho/Wo, changing the K/tile ratio and DP/SK split. -// Hi=16, Wi=16, 3x3 filter, stride=2, pad=1 → Ho=Wo=8, GemmK=N*64 +// Stride > 1 - shrinks Ho/Wo, changing the K/tile ratio and DP/SK split. +// Hi=16, Wi=16, 3x3 filter, stride=2, pad=1 -> Ho=Wo=8, GemmK=N*64 TEST(StreamKConvBwdWeight, Linear_EndToEnd_Stride2) { EXPECT_TRUE((run_streamk_vs_splitk_test>(1, @@ -504,8 +504,8 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_Stride2) 1))); } -// Pure DP — num_tiles evenly divides grid, so sk_ctas=0. -// K=256, C=128, 3x3 → GemmM=256, GemmN=1152 → tiles=2*9=18, grid=3*1=3, 18%3=0 +// Pure DP - num_tiles evenly divides grid, so sk_ctas=0. +// K=256, C=128, 3x3 -> GemmM=256, GemmN=1152 -> tiles=2*9=18, grid=3*1=3, 18%3=0 TEST(StreamKConvBwdWeight, Linear_EndToEnd_PureDP) { EXPECT_TRUE((run_streamk_vs_splitk_test>( @@ -518,8 +518,8 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_PureDP) 1, 4, 256, 128, 3, 3, 16, 16, 3, 1))); } -// Single output tile — all work is SK, zero DP tiles. -// K=128, C=128, 1x1 filter, stride=1, pad=0 → GemmM=128, GemmN=128, tiles=1 +// Single output tile - all work is SK, zero DP tiles. +// K=128, C=128, 1x1 filter, stride=1, pad=0 -> GemmM=128, GemmN=128, tiles=1 TEST(StreamKConvBwdWeight, Linear_EndToEnd_SingleTile) { EXPECT_TRUE((run_streamk_vs_splitk_test>(1, @@ -564,7 +564,7 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_SingleTile) 0))); } -// Large N — GemmK = 32*16*16 = 8192, many K iterations per tile. +// Large N - GemmK = 32*16*16 = 8192, many K iterations per tile. TEST(StreamKConvBwdWeight, Linear_EndToEnd_LargeN) { EXPECT_TRUE((run_streamk_vs_splitk_test>( @@ -577,7 +577,7 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_LargeN) 1, 32, 128, 128, 3, 3, 16, 16, 4, 1))); } -// Higher occupancy — doubles the grid, more SK CTAs share tiles. +// Higher occupancy - doubles the grid, more SK CTAs share tiles. TEST(StreamKConvBwdWeight, Linear_EndToEnd_HigherOccupancy) { EXPECT_TRUE((run_streamk_vs_splitk_test>( @@ -590,7 +590,7 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_HigherOccupancy) 1, 4, 128, 128, 3, 3, 16, 16, 4, 2))); } -// Persistent DP — workgroups loop over DP tiles, then do SK work. +// Persistent DP - workgroups loop over DP tiles, then do SK work. TEST(StreamKConvBwdWeight, LinearPersistent_EndToEnd_SmallShape) { EXPECT_TRUE((run_streamk_vs_splitk_test>( @@ -619,7 +619,7 @@ TEST(StreamKConvBwdWeight, TreePersistent_EndToEnd_MultiGroup) // Negative tests: IsSupportedArgument should reject invalid shapes // ============================================================================ -// C not divisible by VectorSizeB (=8) → rejected +// C not divisible by VectorSizeB (=8) -> rejected TEST(StreamKConvBwdWeight, IsSupportedArgument_RejectsUnalignedC) { using Kernel = TestKernel; @@ -630,7 +630,7 @@ TEST(StreamKConvBwdWeight, IsSupportedArgument_RejectsUnalignedC) EXPECT_FALSE(Kernel::IsSupportedArgument(kargs)); } -// K not divisible by VectorSizeA (=4) → rejected +// K not divisible by VectorSizeA (=4) -> rejected TEST(StreamKConvBwdWeight, IsSupportedArgument_RejectsUnalignedK) { using Kernel = TestKernel; diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp index 1c955ff98a..16c7dfcb29 100644 --- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp @@ -140,7 +140,7 @@ class TestCkTileGroupedGemm : public ::testing::Test EXPECT_TRUE(Kernel::IsSupportedArgument(kargs)); // Use the filtered kargs (zero-dim groups are excluded by MakeKargs) to derive - // the correct grid size and group count — not the raw gemm_descs vector. + // the correct grid size and group count - not the raw gemm_descs vector. const dim3 blocks = Kernel::BlockSize(); if(kargs.empty()) return; @@ -468,7 +468,7 @@ class TestCkTileGroupedGemm : public ::testing::Test bool pass{true}; for(int i = 0; i < group_count; ++i) { - // Groups with M=0 or N=0 produce no output — skip validation. + // Groups with M=0 or N=0 produce no output - skip validation. // K=0 groups do produce output (all zeros) and are validated normally. if(Ms[i] == 0 || Ns[i] == 0) continue; diff --git a/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp b/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp index 2d75e0dac9..1519597219 100644 --- a/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp +++ b/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp @@ -1,28 +1,28 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// Unit test suite for cluster_multicast_load_async_to_lds — the CK Tile wrapper +// Unit test suite for cluster_multicast_load_async_to_lds - the CK Tile wrapper // around CLUSTER_LOAD_ASYNC_TO_LDS_B* (gfx1250 only). // // Complements test_cluster_load_multicast.cpp (CLUSTER_LOAD_B, VGPR destination) // by testing behaviors unique to the async LDS path: // -// Group 1: SingleWGP baseline — B32/B64/B128, mask=0x1 and mask=0x0 -// Group 2: LDSVisibility — non-requesting waves read LDS after barrier -// Group 3: LDS address layout — per-lane VDST strided addressing -// Group 4: MultiWGP broadcast — async LDS delivery at cluster scale (1D and 2D cluster dims) -// Group 5: ASYNCcnt ordering — CLUSTER and GLOBAL async loads share one counter -// Group 6: PartialBroadcast — non-contiguous mask, mixed instruction types -// Group 8: MultiWGP + LDSVisibility — canonical GEMM tile-load pattern -// Group 10: ConcurrentGroups — LDS routing isolation between independent groups -// Group 11: BufferViewAsyncGet — cluster_async_get() through buffer_view, +// Group 1: SingleWGP baseline - B32/B64/B128, mask=0x1 and mask=0x0 +// Group 2: LDSVisibility - non-requesting waves read LDS after barrier +// Group 3: LDS address layout - per-lane VDST strided addressing +// Group 4: MultiWGP broadcast - async LDS delivery at cluster scale (1D and 2D cluster dims) +// Group 5: ASYNCcnt ordering - CLUSTER and GLOBAL async loads share one counter +// Group 6: PartialBroadcast - non-contiguous mask, mixed instruction types +// Group 8: MultiWGP + LDSVisibility - canonical GEMM tile-load pattern +// Group 10: ConcurrentGroups - LDS routing isolation between independent groups +// Group 11: BufferViewAsyncGet - cluster_async_get() through buffer_view, // including ISA-specified INST_OFFSET behaviour // // Synchronization primitives used: -// s_wait_asynccnt<0>() — wait for all pending async LDS writes to complete. +// s_wait_asynccnt<0>() - wait for all pending async LDS writes to complete. // ASYNCcnt decrements only when the LDS write is committed and visible to // subsequent DS reads on the same wave. -// block_sync_lds_direct_load<0>() — s_wait_asynccnt<0> + s_barrier_signal/wait. +// block_sync_lds_direct_load<0>() - s_wait_asynccnt<0> + s_barrier_signal/wait. // Used when multiple waves in a WG must synchronize after an async LDS load. #include "gtest/gtest.h" @@ -109,7 +109,7 @@ void run_async_lds_test(const std::vector& h_src, int mask, const char* test_ } // --------------------------------------------------------------------------- -// Group 1: SingleWGP — B32, B64, B128, mask=0x1 +// Group 1: SingleWGP - B32, B64, B128, mask=0x1 // --------------------------------------------------------------------------- TEST(AsyncLDS, B32_SingleWGP) @@ -149,7 +149,7 @@ TEST(AsyncLDS, B128_SingleWGP) } // --------------------------------------------------------------------------- -// Group 2: LDSVisibility — cross-wave LDS sharing after async load +// Group 2: LDSVisibility - cross-wave LDS sharing after async load // --------------------------------------------------------------------------- // 4 waves per WG (128 threads). Wave 0 issues the async cluster load into // LDS[0..31], then all waves synchronize via block_sync_lds_direct_load @@ -267,7 +267,7 @@ TEST(LDSVisibility, B128_FourWaves) } // --------------------------------------------------------------------------- -// Group 3: LDSAddressLayout — per-lane VDST addressing (strided) +// Group 3: LDSAddressLayout - per-lane VDST addressing (strided) // --------------------------------------------------------------------------- // CLUSTER_LOAD_ASYNC_TO_LDS_B* supplies the LDS destination address via a // per-lane VGPR (VDST). Each lane independently specifies where in LDS its @@ -276,7 +276,7 @@ TEST(LDSVisibility, B128_FourWaves) // // Each lane writes to lds_buf[lane_id * kStride], leaving kStride-1 unused // slots between lanes. The strided slots are zero-initialized before the -// async load so that any unwritten slot reads back 0 — which cannot +// async load so that any unwritten slot reads back 0 - which cannot // collide with src[i] = 1000 + i. If the hardware ignores VDST and // writes to lds_buf[lane_id] instead, lanes 1..31 read from their strided // slots and find zeros, causing a FAIL. @@ -363,10 +363,10 @@ TEST(LDSAddressLayout, B32_Strided_SingleWGP) } // --------------------------------------------------------------------------- -// Group 4: MultiWGP Broadcast — async LDS delivery at cluster scale +// Group 4: MultiWGP Broadcast - async LDS delivery at cluster scale // --------------------------------------------------------------------------- // All WGPs in a cluster load from the same single source value (true broadcast). -// Each lane within a WGP loads from shared_src → lds_buf[lane_id] via per-lane +// Each lane within a WGP loads from shared_src -> lds_buf[lane_id] via per-lane // VDST, so every LDS slot in every WGP ends up holding the broadcast value. // // Wave 0 of each WGP issues the async cluster load; s_wait_asynccnt<0> ensures @@ -569,11 +569,11 @@ TEST(MultiWGPBroadcast, B32_2x2Cluster) } // --------------------------------------------------------------------------- -// Group 7: ZeroMask — mask=0x0 degrades to non-multicast async load +// Group 7: ZeroMask - mask=0x0 degrades to non-multicast async load // --------------------------------------------------------------------------- // ISA spec: "If M0[15:0] == 0, this is treated as a non-Cluster-multicast load: // return only to the requesting WGP (it is not treated as 'do not return to -// any wave')." Data still lands in the requesting WGP's LDS — no deadlock, +// any wave')." Data still lands in the requesting WGP's LDS - no deadlock, // no lost load. TEST(AsyncLDS, B32_ZeroMask) @@ -589,7 +589,7 @@ TEST(AsyncLDS, B32_ZeroMask) } // --------------------------------------------------------------------------- -// Group 5: ASYNCcnt Ordering — CLUSTER_LOAD_ASYNC_TO_LDS + GLOBAL_LOAD_ASYNC_TO_LDS +// Group 5: ASYNCcnt Ordering - CLUSTER_LOAD_ASYNC_TO_LDS + GLOBAL_LOAD_ASYNC_TO_LDS // --------------------------------------------------------------------------- // Both instructions share a single ASYNCcnt on gfx1250, so one // s_wait_asynccnt<0>() is sufficient to guarantee both async LDS writes @@ -598,7 +598,7 @@ TEST(AsyncLDS, B32_ZeroMask) // A single wave issues both instructions back-to-back with no wait between: // 1. cluster_multicast_load_async_to_lds(src_a + lane) -> lds_a[lane] // 2. global_load_async_to_lds_b32(src_b + lane) -> lds_b[lane] -// 3. s_wait_asynccnt<0>() — one wait must drain both +// 3. s_wait_asynccnt<0>() - one wait must drain both // 4. Read lds_a[lane] -> dst_a[lane], lds_b[lane] -> dst_b[lane] // // If s_wait_asynccnt only drained one instruction type, the wave would read @@ -628,7 +628,7 @@ struct ASYNCcntOrderingKernel __builtin_amdgcn_global_load_async_to_lds_b32( ck_tile::to_global(src_b + lane_id), ck_tile::to_lds(lds_b + lane_id), 0, 0); - // Step 3: Single wait — must drain both async loads. + // Step 3: Single wait - must drain both async loads. ck_tile::s_wait_asynccnt<0>(); // Step 4: Read both LDS slots. Correct data in both confirms shared @@ -705,7 +705,7 @@ TEST(ASYNCcntOrdering, MixedAsyncLoads_B32_SingleWGP) } // --------------------------------------------------------------------------- -// Group 6: PartialBroadcast — non-contiguous mask, mixed instruction types +// Group 6: PartialBroadcast - non-contiguous mask, mixed instruction types // --------------------------------------------------------------------------- // 4 WGPs, mask = 0x5 (binary 0101). WGPs 0 and 2 participate in the cluster // multicast; WGPs 1 and 3 do not. @@ -719,7 +719,7 @@ TEST(ASYNCcntOrdering, MixedAsyncLoads_B32_SingleWGP) // Expected LDS: 5000 + lane in every slot // // This simultaneously verifies: -// 1. Multicast data is delivered only to WGPs whose bits are set in M0 — +// 1. Multicast data is delivered only to WGPs whose bits are set in M0 - // non-participating WGPs do not receive the broadcast value. // 2. Both async instruction types coexist in the same cluster on the same // ASYNCcnt without cross-contaminating each other's LDS destinations. @@ -823,7 +823,7 @@ TEST(PartialBroadcast, B32_4WGP_Mask0x5) } // --------------------------------------------------------------------------- -// Group 8: MultiWGP + LDSVisibility Combined — the canonical GEMM tile-load +// Group 8: MultiWGP + LDSVisibility Combined - the canonical GEMM tile-load // --------------------------------------------------------------------------- // 4 WGPs in a cluster, 4 waves per WG (128 threads). Wave 0 of each WG issues // cluster_multicast_load_async_to_lds (true broadcast: all lanes load from the @@ -966,7 +966,7 @@ TEST(MultiWGPLDSVisibility, B128_4WGP_4Waves) } // --------------------------------------------------------------------------- -// Group 10: ConcurrentGroups — LDS routing isolation between independent groups +// Group 10: ConcurrentGroups - LDS routing isolation between independent groups // --------------------------------------------------------------------------- // 4 WGPs in one cluster, two independent broadcast groups: // WGPs 0/1: mask = 0x3, load val_a into LDS @@ -977,7 +977,7 @@ TEST(MultiWGPLDSVisibility, B128_4WGP_4Waves) // // For CLUSTER_LOAD_B (VGPR destination), misdirected data would land in a // per-thread VGPR that is private to one wave and physically unreadable by -// another WG — so VGPR tests cannot detect LDS routing bugs. Here, if the +// another WG - so VGPR tests cannot detect LDS routing bugs. Here, if the // hardware routes val_a to WGPs 2/3's LDS (or vice versa), the host // verification catches it. This is the only test that can expose such a bug. @@ -1100,7 +1100,7 @@ TEST(ConcurrentGroupsLDS, B128_4WGP_TwoGroups) } // --------------------------------------------------------------------------- -// Group 11: BufferViewAsyncGet — cluster_async_get() through buffer_view +// Group 11: BufferViewAsyncGet - cluster_async_get() through buffer_view // --------------------------------------------------------------------------- // Tests the buffer_view::cluster_async_get() interface, which wraps // cluster_multicast_load_async_to_lds and handles global pointer arithmetic @@ -1126,7 +1126,7 @@ using TestBufView = ck_tile::buffer_view; -// Kernel 1: basic load — each lane loads src[lane_id] into lds_buf[lane_id] +// Kernel 1: basic load - each lane loads src[lane_id] into lds_buf[lane_id] // via buffer_view::cluster_async_get. struct BufferViewBasicKernel { @@ -1189,9 +1189,9 @@ TEST(BufferViewAsyncGet, B32_BasicLoad) EXPECT_EQ(h_dst[i], h_src[i]) << "B32_BasicLoad: mismatch at lane " << i; } -// Kernel 2: inst_offset=4 — each lane supplies VDST = &lds_buf[lane*2]. +// Kernel 2: inst_offset=4 - each lane supplies VDST = &lds_buf[lane*2]. // ISA applies inst_offset to both VADDR and VDST: -// LDS[VDST+4] = GLOBAL[VADDR+4] → lds_buf[lane*2+1] = src[lane+1] +// LDS[VDST+4] = GLOBAL[VADDR+4] -> lds_buf[lane*2+1] = src[lane+1] // src is allocated with NUM_LANES+1 elements so lane 31 reads src[32] safely. struct BufferViewInstOffsetKernel { @@ -1263,7 +1263,7 @@ TEST(BufferViewAsyncGet, B32_InstOffset) const int sentinel = 0xDEADBEEF; for(int i = 0; i < NUM_LANES; i++) { - // Even slot (VDST): inst_offset skips this — sentinel must remain. + // Even slot (VDST): inst_offset skips this - sentinel must remain. EXPECT_EQ(h_dst[i * 2], sentinel) << "lane " << i << " even slot: expected sentinel, got " << h_dst[i * 2]; diff --git a/test/ck_tile/utility/test_sequence.cpp b/test/ck_tile/utility/test_sequence.cpp index 9e75411e64..9f468bc6a1 100644 --- a/test/ck_tile/utility/test_sequence.cpp +++ b/test/ck_tile/utility/test_sequence.cpp @@ -178,7 +178,7 @@ TEST(CkTileSequence, UniformSequenceGenLarger) } // ============================================================================ -// sequence_reverse_inclusive_scan tests — runtime value verification +// sequence_reverse_inclusive_scan tests - runtime value verification // ============================================================================ TEST(CkTileSequence, ReverseInclusiveScanProduct) @@ -217,7 +217,7 @@ TEST(CkTileSequence, ReverseInclusiveScanEmpty) } // ============================================================================ -// sequence_inclusive_scan (forward) tests — runtime value verification +// sequence_inclusive_scan (forward) tests - runtime value verification // ============================================================================ TEST(CkTileSequence, ForwardInclusiveScanSum) @@ -273,7 +273,7 @@ TEST(CkTileSequence, ForwardInclusiveScanEmpty) } // ============================================================================ -// sequence_map_inverse tests — runtime round-trip verification +// sequence_map_inverse tests - runtime round-trip verification // ============================================================================ TEST(CkTileSequence, MapInverseIdentity) diff --git a/test/ck_tile/utility/test_static_ford.cpp b/test/ck_tile/utility/test_static_ford.cpp index 7337471647..e9b2e8c197 100644 --- a/test/ck_tile/utility/test_static_ford.cpp +++ b/test/ck_tile/utility/test_static_ford.cpp @@ -10,7 +10,7 @@ using namespace ck_tile; // ============================================================================ -// static_ford Tests — Identity Order (default) +// static_ford Tests - Identity Order (default) // ============================================================================ TEST(CkTileStaticFord, Identity2D) @@ -122,7 +122,7 @@ TEST(CkTileStaticFord, IdentityWithUnitDim) } // ============================================================================ -// static_ford Tests — Non-Identity Order (primary template with decompose_reordered) +// static_ford Tests - Non-Identity Order (primary template with decompose_reordered) // ============================================================================ TEST(CkTileStaticFord, ReversedOrder2D) diff --git a/test/cluster_launch/test_cluster_launch.cpp b/test/cluster_launch/test_cluster_launch.cpp index 0686fed2c8..648c530524 100644 --- a/test/cluster_launch/test_cluster_launch.cpp +++ b/test/cluster_launch/test_cluster_launch.cpp @@ -131,7 +131,7 @@ TEST(ClusterLaunch, ClusterBuiltins) out_mem.FromDevice(out_host.data()); // cluster_id_x = blockIdx.x / clusterSize - // Blocks 0,1 → cluster 0; Blocks 2,3 → cluster 1 + // Blocks 0,1 -> cluster 0; Blocks 2,3 -> cluster 1 for(int block = 0; block < numBlocks; ++block) { int expected_cluster_id = block / clusterSize; diff --git a/test/cluster_load/test_cluster_load.cpp b/test/cluster_load/test_cluster_load.cpp index 5a5d4cf646..b0be4be2fc 100644 --- a/test/cluster_load/test_cluster_load.cpp +++ b/test/cluster_load/test_cluster_load.cpp @@ -18,7 +18,7 @@ using ::ck::DeviceMem; constexpr int kTileSize = 32; // -// cluster_load: Global → VGPRs with a WGP participation mask. +// cluster_load: Global -> VGPRs with a WGP participation mask. // Templated kernels covering 4-byte (int), 8-byte (int2), and 16-byte (int4) loads. // diff --git a/test/cluster_load/test_cluster_load_async.cpp b/test/cluster_load/test_cluster_load_async.cpp index a18a9c8291..0a1113a3c3 100644 --- a/test/cluster_load/test_cluster_load_async.cpp +++ b/test/cluster_load/test_cluster_load_async.cpp @@ -18,7 +18,7 @@ using ::ck::DeviceMem; constexpr int kTileSize = 32; // -// cluster_load_async: Global → LDS with a WGP participation mask. +// cluster_load_async: Global -> LDS with a WGP participation mask. // Templated kernels covering 1-byte (char), 4-byte (int), 8-byte (int2), // and 16-byte (int4) async loads. // @@ -28,7 +28,7 @@ extern __shared__ char shared_lds[]; // --- Templated kernels ---------------------------------------------------- -// Single WGP, async load global → LDS, copy LDS → output. mask = 0x1. +// Single WGP, async load global -> LDS, copy LDS -> output. mask = 0x1. template __global__ void cluster_load_async_single_wgp_kernel(const T* __restrict__ in, T* __restrict__ out, int n) @@ -131,7 +131,7 @@ __global__ void cluster_load_async_partial_mask_kernel( } } -// LDS bounds check — sentinel region adjacent to loaded tile remains zero. +// LDS bounds check - sentinel region adjacent to loaded tile remains zero. template __global__ void cluster_load_async_bounds_check_kernel(const T* __restrict__ in, T* __restrict__ out, int n) diff --git a/test/data_type/test_mx_bf6_pk4scale.cpp b/test/data_type/test_mx_bf6_pk4scale.cpp index 9820549730..bed773538a 100644 --- a/test/data_type/test_mx_bf6_pk4scale.cpp +++ b/test/data_type/test_mx_bf6_pk4scale.cpp @@ -95,7 +95,7 @@ TYPED_TEST(MXBF6Pk4ScaleTypedTest, DeviceWavewiseBlock32) constexpr float Val = 2.0f; std::vector out(M * N, -1.0f); std::vector scale(2 * M); - // Test scale variations: 16 rows × 128 columns + // Test scale variations: 16 rows x 128 columns // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4) // - Within a row, every 16 consecutive columns share the same scale factor for(int m = 0; m < M; m++) @@ -237,7 +237,7 @@ TYPED_TEST(MXBF6Pk4ScaleTypedTest, DeviceWavewiseBlock16) constexpr float Val = 2.0f; std::vector out(M * N, -1.0f); std::vector scale(2 * M); - // Test scale variations: 16 rows × 128 columns + // Test scale variations: 16 rows x 128 columns // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4) // - Within a row, every 16 consecutive columns share the same scale factor for(int m = 0; m < M; m++) diff --git a/test/data_type/test_mx_fp6_pk4scale.cpp b/test/data_type/test_mx_fp6_pk4scale.cpp index e3e99e49aa..3fba00d68c 100644 --- a/test/data_type/test_mx_fp6_pk4scale.cpp +++ b/test/data_type/test_mx_fp6_pk4scale.cpp @@ -95,7 +95,7 @@ TYPED_TEST(MXFP6Pk4ScaleTypedTest, DeviceWavewiseBlock32) constexpr float Val = 2.0f; std::vector out(M * N, -1.0f); std::vector scale(2 * M); - // Test scale variations: 16 rows × 128 columns + // Test scale variations: 16 rows x 128 columns // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4) // - Within a row, every 16 consecutive columns share the same scale factor for(int m = 0; m < M; m++) @@ -238,7 +238,7 @@ TYPED_TEST(MXFP6Pk4ScaleTypedTest, DeviceWavewiseBlock16) constexpr float Val = 2.0f; std::vector out(M * N, -1.0f); std::vector scale(2 * M); - // Test scale variations: 16 rows × 128 columns + // Test scale variations: 16 rows x 128 columns // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4) // - Within a row, every 16 consecutive columns share the same scale factor for(int m = 0; m < M; m++) diff --git a/test/gemm/test_gemm_vgpr.cpp b/test/gemm/test_gemm_vgpr.cpp index 46fabf74f3..45a6ef7861 100644 --- a/test/gemm/test_gemm_vgpr.cpp +++ b/test/gemm/test_gemm_vgpr.cpp @@ -1,4 +1,4 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT /** diff --git a/test/mx_wmma_op/mx_wmma_op.hpp b/test/mx_wmma_op/mx_wmma_op.hpp index 242111f273..31c551d445 100644 --- a/test/mx_wmma_op/mx_wmma_op.hpp +++ b/test/mx_wmma_op/mx_wmma_op.hpp @@ -853,7 +853,7 @@ struct TestMXWMMA std::vector idx({i, j}); if constexpr(is_same_v) { - // f4x2_pk_t packs two f4 values — print both + // f4x2_pk_t packs two f4 values - print both auto pack = mat(idx); std::cout << ck::type_convert(f4_t(pack.template unpack<>(Number<0>{}))) << "/" // lo/hi separator within a packed element @@ -863,7 +863,7 @@ struct TestMXWMMA else if constexpr(is_same_v || is_same_v) { - // f6_pk_t packs packed_size f6_t values — print all + // f6_pk_t packs packed_size f6_t values - print all auto pack = mat(idx); for(index_t k = 0; k < DataType::packed_size; ++k) { @@ -876,7 +876,7 @@ struct TestMXWMMA else if constexpr(is_same_v || is_same_v) { - // bf6_pk_t packs packed_size bf6_t values — print all + // bf6_pk_t packs packed_size bf6_t values - print all auto pack = mat(idx); for(index_t k = 0; k < DataType::packed_size; ++k) { diff --git a/test/synchronization/monitor_mwait.cpp b/test/synchronization/monitor_mwait.cpp index 8fa418cc5a..b3490bb08a 100644 --- a/test/synchronization/monitor_mwait.cpp +++ b/test/synchronization/monitor_mwait.cpp @@ -1,4 +1,4 @@ -// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #include "gtest/gtest.h" diff --git a/tile_engine/ops/pooling/pooling_benchmark_single.cpp b/tile_engine/ops/pooling/pooling_benchmark_single.cpp index 0d872a9f51..ff71081883 100644 --- a/tile_engine/ops/pooling/pooling_benchmark_single.cpp +++ b/tile_engine/ops/pooling/pooling_benchmark_single.cpp @@ -30,7 +30,7 @@ // The kernel header is included via compile command line with -include flag. // -------------------------------------------------------------------------- -// Benchmark implementation — templated on pooling dimension so that only +// Benchmark implementation - templated on pooling dimension so that only // the matching branch is instantiated (2D or 3D). // --------------------------------------------------------------------------