From e02c566795e4e00d3ee03f2a75f44d72e6da5754 Mon Sep 17 00:00:00 2001
From: Illia Silin <98187287+illsilin@users.noreply.github.com>
Date: Thu, 21 May 2026 19:43:50 -0700
Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#7612 (commit 5427d24)

[CK] upgrade CI to rocm7.13 as default compiler (#7612)

## Motivation

Upgrade the default docker and compiler version in CI to rocm7.13.
In order to pass all the checks I had to also clean up a lot of
non-ascii characters in the source code comments and modify a couple of
tests that were affected by a new compiler logic.

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

---------

Co-authored-by: Aviral Goel <aviral.goel@amd.com>
---
 Dockerfile                                    | 31 +++------
 Jenkinsfile                                   | 37 ++++++++---
 dispatcher/src/registry.cpp                   |  2 +-
 .../01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp    |  2 +-
 example/26_contraction/common_instances.hpp   |  2 +-
 ...uped_query_attention_forward_wmma_fp16.cpp |  2 +-
 ...ulti_query_attention_forward_wmma_fp16.cpp |  2 +-
 ...rouped_conv_bwd_data_bias_relu_example.inc |  2 +-
 .../run_grouped_conv_bwd_data_example.inc     |  2 +-
 ...nd_bwd_data_xdl_bilinear_residual_fp16.cpp |  2 +-
 .../64_fpAintB_gemm/fp16int8_gemm_wmma.cpp    |  4 +-
 .../common_instances.hpp                      |  2 +-
 example/ck_tile/01_fmha/fmha_fwd.hpp          |  4 +-
 ...ed_convolution_backward_weight_streamk.cpp |  2 +-
 example/ck_tile/42_mx_gemm/run_mx_gemm.inc    |  2 +-
 .../builder/factory/conv_algorithms.hpp       |  2 +-
 .../builder/factory/conv_dispatcher.hpp       |  2 +-
 ...le_grouped_convolution_backward_weight.hpp |  4 +-
 ...raits_tile_grouped_convolution_forward.hpp | 10 +--
 ...unit_instance_to_conv_traits_instances.cpp |  2 +-
 .../test/impl/conv_algorithm_types.hpp        |  2 +-
 experimental/builder/test/testing_utils.cpp   |  2 +-
 ...gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp | 12 ++--
 .../arch/amd_buffer_addressing_builtins.hpp   | 12 ++--
 .../ck_tile/core/arch/amd_cluster_load.hpp    | 14 ++--
 include/ck_tile/core/container/sequence.hpp   |  2 +-
 include/ck_tile/core/tensor/buffer_view.hpp   |  6 +-
 include/ck_tile/core/tensor/load_tile.hpp     |  2 +-
 .../core/tensor/load_tile_transpose.hpp       |  6 +-
 .../core/tensor/load_tile_transpose.hpp.bk    |  6 +-
 .../core/tensor/tile_scatter_gather.hpp       |  2 +-
 include/ck_tile/core/tensor/tile_window.hpp   |  8 +--
 .../ck_tile/core/tensor/tile_window.hpp.bk    |  8 +--
 include/ck_tile/core/utility/debug.hpp        |  2 +-
 include/ck_tile/core/utility/type_traits.hpp  |  2 +-
 .../ck_tile/host/reference/reference_topk.hpp |  2 +-
 .../kernel/batched_contraction_kernel.hpp     | 16 ++---
 include/ck_tile/ops/common/streamk_common.hpp |  4 +-
 .../unary_element_wise_operation.hpp          |  2 +-
 .../chainer/cshuffle_epilogue_schedule.hpp    |  8 ++-
 .../ops/fmha/kernel/fmha_bwd_kernel.hpp       |  6 +-
 ..._batch_prefill_pipeline_qr_ks_vs_async.hpp | 22 +++----
 .../pipeline/block_fmha_fwd_v3_pipeline.hpp   | 24 +++----
 .../gemm/block/block_wp_asmem_breg_creg.hpp   |  2 +-
 .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 14 ++--
 .../gemm_pipeline_ag_bg_cr_comp_v3.hpp        | 12 ++--
 .../gemm_pipeline_ag_bg_cr_comp_v4.hpp        | 16 ++---
 .../gemm_pipeline_ag_bg_cr_comp_v5.hpp        | 12 ++--
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   | 24 +++----
 .../gemm_pipeline_agmem_bgmem_creg_v1.hpp     | 24 +++----
 .../gemm_pipeline_agmem_bgmem_creg_v2.hpp     |  8 +--
 .../ops/gemm/warp/warp_gemm_params.hpp        |  2 +-
 .../block_universal_gemm_as_aquant_bs_cr.hpp  |  2 +-
 .../gemm_quant/kernel/gemm_quant_kernel.hpp   |  6 +-
 .../pipeline/gemm_group_quant_utils.hpp       | 12 ++--
 ...ouped_convolution_backward_data_kernel.hpp |  2 +-
 ...ped_convolution_backward_weight_kernel.hpp |  8 +--
 .../grouped_convolution_forward_kernel.hpp    |  2 +-
 .../utils/transform_conv_fwd_to_gemm.hpp      |  6 +-
 .../ops/reduce/block/block_reduce2d.hpp       |  2 +-
 ...ice_grouped_conv_bwd_data_xdl_instance.hpp | 14 ++--
 ...d_conv_bwd_weight_wavelet_xdl_instance.hpp | 12 ++--
 .../contraction_instance_common.hpp           | 26 ++++----
 ...ta_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp |  8 +--
 ...ata_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp |  8 +--
 ...ata_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp |  8 +--
 rocm_ck/include/rocm_ck/args.hpp              | 22 +++----
 rocm_ck/include/rocm_ck/datatype.hpp          | 12 ++--
 rocm_ck/include/rocm_ck/fixed_string.hpp      |  6 +-
 rocm_ck/include/rocm_ck/gpu_target.hpp        |  2 +-
 rocm_ck/include/rocm_ck/index_t.hpp           |  2 +-
 rocm_ck/include/rocm_ck/layout.hpp            |  4 +-
 rocm_ck/include/rocm_ck/ops.hpp               | 10 +--
 rocm_ck/include/rocm_ck/physical_tensor.hpp   |  4 +-
 rocm_ck/include/rocm_ck/resolved_tensor.hpp   | 12 ++--
 rocm_ck/tests/unit/unit_args.cpp              |  2 +-
 .../src/selective_test_filter.py              | 13 +++-
 .../mma/pipeline/test_amdgcn_scale_mma.cpp    | 12 ++--
 .../mma/pipeline/test_amdgcn_sparse_mma.cpp   | 20 +++---
 test/ck_tile/core/container/unit_sequence.cpp |  2 +-
 .../flatmm/test_mx_flatmm_fixtures.hpp        |  2 +-
 test/ck_tile/fmha/test_fmha_fwd.cpp           |  2 +-
 test/ck_tile/gemm_block_scale/CMakeLists.txt  | 14 ----
 ...gemm_quant_aquant_mem_decode_interwave.cpp | 27 --------
 ...emm_quant_aquant_mem_prefill_interwave.cpp | 27 --------
 .../test_gemm_quant_bquant_splitk_decode.cpp  |  8 +--
 .../test_gemm_quant_bquant_splitk_prefill.cpp |  8 +--
 .../test_gemm_quant_fixtures.hpp              | 26 +++-----
 .../test_mx_gemm_pipeline_wmma_base.hpp       |  4 +-
 ...k_tile_grouped_conv_bwd_weight_streamk.cpp | 22 +++----
 .../grouped_gemm/test_grouped_gemm_util.hpp   |  4 +-
 .../test_cluster_load_async_to_lds.cpp        | 66 +++++++++----------
 test/ck_tile/utility/test_sequence.cpp        |  6 +-
 test/ck_tile/utility/test_static_ford.cpp     |  4 +-
 test/cluster_launch/test_cluster_launch.cpp   |  2 +-
 test/cluster_load/test_cluster_load.cpp       |  2 +-
 test/cluster_load/test_cluster_load_async.cpp |  6 +-
 test/data_type/test_mx_bf6_pk4scale.cpp       |  4 +-
 test/data_type/test_mx_fp6_pk4scale.cpp       |  4 +-
 test/gemm/test_gemm_vgpr.cpp                  |  2 +-
 test/mx_wmma_op/mx_wmma_op.hpp                |  6 +-
 test/synchronization/monitor_mwait.cpp        |  2 +-
 .../ops/pooling/pooling_benchmark_single.cpp  |  2 +-
 103 files changed, 416 insertions(+), 475 deletions(-)
 delete mode 100644 test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp
 delete mode 100644 test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp

diff --git a/Dockerfile b/Dockerfile
index 39d2277e50..7c0da210d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,7 @@
 
 FROM ubuntu:24.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=7.1.1
-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
+ARG ROCMVERSION=7.13
 
 # TheRock nightly tarball configuration.
 # By default, discovers the latest tarball from the nightlies index.
@@ -26,7 +25,7 @@ ENV HIP_PLATFORM=amd
 
 # Add rocm repository
 RUN set -xe && \
-    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
+    apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl cmake git vim nano zip
 
 RUN if [ "$compiler_version" = "therock" ]; then \
         rm -rf /opt/rocm && mkdir /opt/rocm && \
@@ -43,11 +42,12 @@ RUN if [ "$compiler_version" = "therock" ]; then \
         tar -xzf /tmp/rocm.tar.gz -C /opt/rocm --strip-components=1 && \
         rm /tmp/rocm.tar.gz ; \
     else echo "using the release compiler" && \
-        wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/noble/amdgpu-install_7.1.1.70101-1_all.deb && \
-        apt install ./amdgpu-install_7.1.1.70101-1_all.deb -y && \
-        apt update && \
-        apt install python3-setuptools python3-wheel -y && \
-        apt install rocm-dev -y; \
+        wget https://repo.amd.com/rocm/tarball-multi-arch/therock-dist-linux-multiarch-7.13.0.tar.gz && \
+        rm -rf /opt/rocm && mkdir /opt/rocm && \
+        tar -xzf therock-dist-linux-multiarch-7.13.0.tar.gz -C /opt/rocm --strip-components=1 && \
+        rm therock-dist-linux-multiarch-7.13.0.tar.gz && \
+        wget https://repo.radeon.com/amdgpu-install/31.30/ubuntu/noble/amdgpu-install_31.30.313000-1_all.deb && \
+        apt install ./amdgpu-install_31.30.313000-1_all.deb -y; \
     fi
     
 # Install SCCACHE
@@ -58,19 +58,14 @@ RUN set -x && \
     mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
     wget -qO sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v$SCCACHE_VERSION/sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz && \
     tar -xzf sccache.tar.gz --strip-components=1 -C ${SCCACHE_INSTALL_LOCATION} && \
-    chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
-
+    chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache && \
 # Install dependencies
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
     build-essential \
-    cmake \
-    git \
-    iputils-ping \
     jq \
     libelf-dev \
     libnuma-dev \
     libpthread-stubs0-dev \
-    mpich \
     net-tools \
     pkg-config \
     python3-full \
@@ -79,17 +74,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     sshpass \
     stunnel \
     software-properties-common \
-    vim \
-    nano \
     zlib1g-dev \
-    zip \
     libzstd-dev \
     openssh-server \
     clang-format-18 \
     kmod && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* && \
-    rm -rf amdgpu-install* && \
 #Install latest ccache
     git clone https://github.com/ccache/ccache.git && \
     cd ccache && mkdir build && cd build && cmake .. && make install && \
diff --git a/Jenkinsfile b/Jenkinsfile
index 12faaaeca2..92a3156097 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -357,13 +357,7 @@ def getBaseDockerImageName(){
         img = "${params.USE_CUSTOM_DOCKER}"
     }
     else{
-        def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
-        if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 2 ){
-            img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
-            }
-        else{
-            img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm${params.ROCMVERSION}"
-            }
+        img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
         }
     return img
 }
@@ -1284,8 +1278,8 @@ pipeline {
             description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
         string(
             name: 'ROCMVERSION',
-            defaultValue: '7.1.1',
-            description: 'Specify which ROCM version to use: 7.1.1 (default).')
+            defaultValue: '7.13',
+            description: 'Specify which ROCM version to use: 7.13 (default).')
         string(
             name: 'COMPILER_VERSION',
             defaultValue: '',
@@ -1498,6 +1492,7 @@ pipeline {
                 stage('Docker /opt/rocm'){
                     agent{ label rocmnode("nogpu") }
                     steps{
+                        deleteDir()
                         buildDocker('/opt/rocm')
                         cleanWs()
                     }
@@ -1529,6 +1524,7 @@ pipeline {
                                 --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"""
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
                         archiveArtifacts "build/ck_cppcheck.log"
                         cleanWs()
@@ -1548,6 +1544,7 @@ pipeline {
                                 xargs -P 8 -I{} sh -c 'clang-format-18 -style=file {} | diff -u - {} || (echo "ERROR: {} needs formatting" && exit 1)'"""
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd)
                         cleanWs()
                     }
@@ -1648,6 +1645,7 @@ pipeline {
                                            ./bin/test_grouped_convnd_fwd_tile"""
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1676,6 +1674,7 @@ pipeline {
                                            ./bin/test_grouped_convnd_fwd_large_cases && ./bin/test_grouped_convnd_bwd_data_large_cases && ./bin/test_grouped_convnd_fwd_bias_clamp_large_cases"""
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1716,6 +1715,7 @@ pipeline {
                                            ./bin/test_grouped_convnd_bwd_weight_dataset_xdl"""
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1742,6 +1742,7 @@ pipeline {
                         execute_args = build_and_run_fmha("gfx90a")
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1758,6 +1759,7 @@ pipeline {
                         execute_args = build_and_run_fmha("gfx942")
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1774,6 +1776,7 @@ pipeline {
                         execute_args = build_and_run_fmha("gfx950")
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1790,6 +1793,7 @@ pipeline {
                         execute_args = build_and_run_fmha("gfx1201")
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1833,6 +1837,7 @@ pipeline {
                                            python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1879,6 +1884,7 @@ pipeline {
                                            python3 ../tile_engine/ops/gemm/grouped_gemm/grouped_gemm_benchmark.py . --problem-sizes "1024,1024,1024" --group-counts 8 --warmup 5 --repeat 5 --verbose --json grouped_gemm_results.json """
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1910,6 +1916,7 @@ pipeline {
                                            python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1935,6 +1942,7 @@ pipeline {
                                            python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
                     }
                     steps{
+                        deleteDir()
                         buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
                         cleanWs()
                     }
@@ -1962,6 +1970,7 @@ pipeline {
                         execute_args = build_client_examples("gfx942")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -1978,6 +1987,7 @@ pipeline {
                         execute_args = build_client_examples("gfx950")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -1995,6 +2005,7 @@ pipeline {
                         execute_args = build_client_examples("gfx908")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2012,6 +2023,7 @@ pipeline {
                         execute_args = build_client_examples_and_codegen_tests("gfx90a")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2024,6 +2036,7 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx942") }
                     steps{
+                        deleteDir()
                         script {
                             def execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
                                                 -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
@@ -2048,6 +2061,7 @@ pipeline {
                         execute_args = build_client_examples("gfx10-1-generic")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2065,6 +2079,7 @@ pipeline {
                         execute_args = build_client_examples("gfx10-3-generic")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2081,6 +2096,7 @@ pipeline {
                         execute_args = build_client_examples("gfx11-generic")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2097,6 +2113,7 @@ pipeline {
                         execute_args = build_client_examples("gfx12-generic")
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2112,6 +2129,7 @@ pipeline {
                         setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1250" -DDISABLE_DL_KERNELS="ON" """
                     }
                     steps{
+                        deleteDir()
                         Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:npi-mi450-latest", config_targets: "install", no_reboot:true, build_type: 'Release', prefixpath: '/usr/local')
                         cleanWs()
                     }
@@ -2153,6 +2171,7 @@ pipeline {
                     }
                     agent { label 'mici' }
                     steps{
+                        deleteDir()
                         process_results()
                         cleanWs()
                     }
diff --git a/dispatcher/src/registry.cpp b/dispatcher/src/registry.cpp
index cd17fcbd53..d04f02bbc8 100644
--- a/dispatcher/src/registry.cpp
+++ b/dispatcher/src/registry.cpp
@@ -57,7 +57,7 @@ bool Registry::register_kernel(KernelInstancePtr instance, Priority priority)
 
     // Store under the encoded identifier so Registry::lookup(KernelKey) finds it.
     // Previously stored under instance->get_name(), but lookup(KernelKey) queries by
-    // key.encode_identifier() — those keys never matched, breaking key-based lookup.
+    // key.encode_identifier() - those keys never matched, breaking key-based lookup.
     if(Base::register_kernel(instance->get_key().encode_identifier(), instance, priority))
     {
         if(auto_export_enabled_ && auto_export_on_every_registration_)
diff --git a/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp b/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp
index 5a2778986d..b9d8fe2966 100644
--- a/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp
+++ b/example/01_gemm/gemm_wmma_fp8_v3_reg_spill.cpp
@@ -1,4 +1,4 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
 /**
diff --git a/example/26_contraction/common_instances.hpp b/example/26_contraction/common_instances.hpp
index ef39d844da..a6c6ed5621 100644
--- a/example/26_contraction/common_instances.hpp
+++ b/example/26_contraction/common_instances.hpp
@@ -197,7 +197,7 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device::
 
 // Macro to instantiate all four layout variants of DeviceOpInstance.
 //
-// BASE:   Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes)
+// BASE:   Generic (for fp16/bf16/fp32) or FP64 (for fp64 - different tile sizes)
 // SUFFIX: NN for bilinear (DsDataType = Tuple<DDataType>),
 //         N  for scale    (DsDataType = Tuple<>)
 //
diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp
index 35a1289f86..66b2aa8508 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp
@@ -4,7 +4,7 @@
 /*
 Grouped Query Attention,
 Ainslie, Joshua, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebrón, and Sumit
-Sanghai. “GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints.”
+Sanghai. "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints."
 arXiv, May 22, 2023. https://doi.org/10.48550/arXiv.2305.13245.
 
 Example is GQA-4
diff --git a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp
index 224f0ec596..baef346231 100644
--- a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp
@@ -3,7 +3,7 @@
 
 /*
 Multi-Query Attention
-Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.” arXiv.org, November 6,
+Shazeer, Noam. "Fast Transformer Decoding: One Write-Head Is All You Need." arXiv.org, November 6,
 2019. https://arxiv.org/abs/1911.02150v1.
 
 */
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
index d4174f9f1a..9ddc541463 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc
@@ -105,7 +105,7 @@ bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config,
     if(!conv.IsSupportedArgument(argument))
     {
         std::cout << "device_conv with the specified compilation parameters does "
-                     "not support this Conv problem — skipping."
+                     "not support this Conv problem - skipping."
                   << std::endl;
 
         return true;
diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
index 1f3ca7ac80..0e9da3e5e3 100644
--- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
+++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc
@@ -93,7 +93,7 @@ bool run_conv_bwd_data(const ExecutionConfig& config,
     if(!conv.IsSupportedArgument(argument))
     {
         std::cout << "device_conv with the specified compilation parameters does "
-                     "not support this Conv problem — skipping."
+                     "not support this Conv problem - skipping."
                   << std::endl;
 
         return true;
diff --git a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
index b7a11baf5a..2bca388deb 100644
--- a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
+++ b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp
@@ -207,7 +207,7 @@ bool run_grouped_conv(bool do_verification,
     if(!conv.IsSupportedArgument(argument))
     {
         std::cout << "The device op with the specified compilation parameters does "
-                     "not support this convolution problem — skipping."
+                     "not support this convolution problem - skipping."
                   << std::endl;
         return true;
     }
diff --git a/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp
index ee78490420..450d1b643f 100644
--- a/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp
+++ b/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp
@@ -6,8 +6,8 @@
 #include "ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp"
 
 // Implementation follows the paper:
-// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. “Who Says Elephants Can’t Run:
-// Bringing Large Scale MoE Models into Cloud Scale Production.” arXiv, November 17, 2022.
+// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. "Who Says Elephants Can’t Run:
+// Bringing Large Scale MoE Models into Cloud Scale Production." arXiv, November 17, 2022.
 // https://doi.org/10.48550/arXiv.2211.10017. Assume weight (Matrix B) is add preprocess to
 // unsigned.
 
diff --git a/example/66_complex_contraction_bilinear/common_instances.hpp b/example/66_complex_contraction_bilinear/common_instances.hpp
index 3ae168cb72..98bf7bace6 100644
--- a/example/66_complex_contraction_bilinear/common_instances.hpp
+++ b/example/66_complex_contraction_bilinear/common_instances.hpp
@@ -197,7 +197,7 @@ using DeviceOpInstanceMN_FP64 = ck::tensor_operation::device::
 
 // Macro to instantiate all four layout variants of DeviceOpInstance.
 //
-// BASE:   Generic (for fp16/bf16/fp32) or FP64 (for fp64 — different tile sizes)
+// BASE:   Generic (for fp16/bf16/fp32) or FP64 (for fp64 - different tile sizes)
 // SUFFIX: NN for bilinear (DsDataType = Tuple<DDataType>),
 //         N  for scale    (DsDataType = Tuple<>)
 //
diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp
index 98e2df2e1e..7a7e07d903 100644
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -677,7 +677,7 @@ struct fmha_batch_prefill_args
 //   GLOBAL_LOAD_LDS: required when (a) the page is smaller than one K/V tile
 //     so per-page SRD is impossible, AND (b) the total KV-pool byte size
 //     exceeds INT32_MAX so SRD's 32-bit byte offset cannot address it.
-//   BUFFER_LOAD: every other case — the SGPR-resident SRD path is fastest.
+//   BUFFER_LOAD: every other case - the SGPR-resident SRD path is fastest.
 // Inputs are taken as plain integers so the helper has no template parameter
 // and can be called from each codegen-emitted dispatcher arm with the arm's
 // compile-time kN0 / element_bytes substituted as constants.
@@ -691,7 +691,7 @@ fmha_batch_prefill_select_kv_load_mode(ck_tile::index_t page_block_size,
     // Promote every operand to long_index_t so overflow is impossible regardless
     // of multiplication order. A bare `static_cast<long_index_t>(num_total_pages)
     // * batch_stride_k * element_bytes` only works because of left-to-right
-    // associativity — a future reorder of the operands would silently truncate.
+    // associativity - a future reorder of the operands would silently truncate.
     const auto kv_pool_bytes = static_cast<ck_tile::long_index_t>(num_total_pages) *
                                static_cast<ck_tile::long_index_t>(batch_stride_k) *
                                static_cast<ck_tile::long_index_t>(element_bytes);
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp
index fa6cf38cf0..39228511d5 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_streamk.cpp
@@ -59,7 +59,7 @@ int main(int argc, char* argv[])
         const std::string reduction = arg_parser.get_str("streamk_reduction");
         const bool persistent       = arg_parser.get_int("streamk_persistent") != 0;
 
-        // Dispatch on reduction strategy × persistent DP
+        // Dispatch on reduction strategy x persistent DP
         if(reduction == "linear" && !persistent)
         {
             using Invoker = GroupedConvolutionBackwardWeightInvoker<
diff --git a/example/ck_tile/42_mx_gemm/run_mx_gemm.inc b/example/ck_tile/42_mx_gemm/run_mx_gemm.inc
index 7ccd4e4273..9a375d2810 100644
--- a/example/ck_tile/42_mx_gemm/run_mx_gemm.inc
+++ b/example/ck_tile/42_mx_gemm/run_mx_gemm.inc
@@ -175,7 +175,7 @@ int run_mx_gemm_with_layouts(int argc, char* argv[], ALayout, BLayout, CLayout)
     constexpr ck_tile::index_t NXdlPackEff = (NIterPerWarp_ >= 2 && NIterPerWarp_ % 2 == 0) ? 2 : 1;
     constexpr ck_tile::index_t KXdlPackEff = (KIterPerWarp_ >= 2 && KIterPerWarp_ % 2 == 0) ? 2 : 1;
 
-    // Pack scales: [M, K/32] e8m0_t → [M/MXdlPackEff, K/32/KXdlPackEff] int32_t
+    // Pack scales: [M, K/32] e8m0_t -> [M/MXdlPackEff, K/32/KXdlPackEff] int32_t
     // Original unpacked tensors are kept for CPU reference validation
     constexpr ck_tile::index_t XdlMNThread = GemmConfig::M_Warp_Tile;
     constexpr ck_tile::index_t XdlKThread  = 64 / XdlMNThread;
diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp
index a90e338c18..60b104add5 100644
--- a/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp
+++ b/experimental/builder/include/ck_tile/builder/factory/conv_algorithms.hpp
@@ -63,7 +63,7 @@ concept TileAlgorithm = ConvAlgorithmDescriptor<T> && SpecifiesTileThreadBlock<T
                         SpecifiesTileTransfer<T> && SpecifiesTileConvSpecialization<T> &&
                         SpecifiesTileBlockGemm<T> && SpecifiesTileOptimizations<T>;
 
-// Depthwise tile-based algorithm concept (no GEMM — direct spatial pipeline)
+// Depthwise tile-based algorithm concept (no GEMM - direct spatial pipeline)
 template <typename T>
 concept DepthwiseAlgorithm = ConvAlgorithmDescriptor<T> && SpecifiesDepthwiseConvParams<T>;
 
diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp
index 9ee2953647..bb5a6268de 100644
--- a/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp
+++ b/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp
@@ -116,7 +116,7 @@ constexpr auto make_conv_instance()
     {
         return typename ReferenceFactory<SIGNATURE, ALGORITHM, VERSION>::Instance{};
     }
-    // Depthwise tile algorithm — direct spatial pipeline, no GEMM
+    // Depthwise tile algorithm - direct spatial pipeline, no GEMM
     else if constexpr(DepthwiseAlgorithm<AlgoType>)
     {
         return typename ConvDepthwiseTileFactory<SIGNATURE, ALGORITHM, VERSION>::Instance{};
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp
index ea292d8bc4..2670555401 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp
@@ -66,10 +66,10 @@ struct InstanceTraits<ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedCon
     static constexpr int kMPerBlock = TilePartitioner_::MPerBlock;
     static constexpr int kNPerBlock = TilePartitioner_::NPerBlock;
     static constexpr int kKPerBlock = TilePartitioner_::KPerBlock;
-    // Partitioner — detect StreamK by checking for PERSISTENT member
+    // Partitioner - detect StreamK by checking for PERSISTENT member
     static constexpr bool kIsStreamK = requires { TilePartitioner_::PERSISTENT; };
 
-    // Warp configuration — sourced from pipeline's BlockGemmShape (works for both
+    // Warp configuration - sourced from pipeline's BlockGemmShape (works for both
     // GemmSpatiallyLocalTilePartitioner and StreamKTilePartitioner).
     using BlockGemmShape_           = typename GemmPipeline_::BlockGemmShape;
     static constexpr int kMWarp     = BlockGemmShape_::BlockWarps::at(number<0>{});
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp
index 28ba99b033..931db03afd 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp
@@ -31,7 +31,7 @@ namespace reflect {
 
 namespace detail {
 
-// Guards access to TilePartitioner members — primary template is depthwise (void partitioner).
+// Guards access to TilePartitioner members - primary template is depthwise (void partitioner).
 template <typename TilePartitioner, bool IsDepthwise>
 struct TilePartitionerFields
 {
@@ -62,7 +62,7 @@ struct TilePartitionerFields<TilePartitioner, false>
     static constexpr int kKWarpTile = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
 };
 
-// Guards access to GemmPipeline scheduling members — primary template is depthwise.
+// Guards access to GemmPipeline scheduling members - primary template is depthwise.
 template <typename GemmPipeline, bool IsDepthwise>
 struct GemmPipelineFields
 {
@@ -99,7 +99,7 @@ struct InstanceTraits<ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraits
     static constexpr ck_tile::ConvolutionSpecialization ConvSpecialization =
         GroupedConvTraitsType_::ConvSpecialization;
 
-    // Layout types (void for depthwise — access guarded in instance_string())
+    // Layout types (void for depthwise - access guarded in instance_string())
     using InLayout  = typename GroupedConvTraitsType_::InLayout;
     using WeiLayout = typename GroupedConvTraitsType_::WeiLayout;
     using DsLayout  = typename GroupedConvTraitsType_::DsLayout;
@@ -112,7 +112,7 @@ struct InstanceTraits<ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraits
     static constexpr bool kEnableSplitImage = GroupedConvTraitsType_::EnableSplitImage;
     static constexpr int kExplicitGemm      = GroupedConvTraitsType_::ExplicitGemm;
 
-    // TilePartitioner fields — safe for both GEMM and depthwise (void) partitioners
+    // TilePartitioner fields - safe for both GEMM and depthwise (void) partitioners
     using TPF                       = detail::TilePartitionerFields<TilePartitioner_, kIsDepthwise>;
     static constexpr int kMPerBlock = TPF::kMPerBlock;
     static constexpr int kNPerBlock = TPF::kNPerBlock;
@@ -128,7 +128,7 @@ struct InstanceTraits<ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraits
     using ADataType = typename GemmPipeline_::ADataType;
     using BDataType = typename GemmPipeline_::BDataType;
 
-    // GemmPipeline scheduling fields — safe for both paths
+    // GemmPipeline scheduling fields - safe for both paths
     using GPF          = detail::GemmPipelineFields<GemmPipeline_, kIsDepthwise>;
     using GemmPipeline = GemmPipeline_;
     static constexpr ck_tile::GemmPipelineScheduler kPipelineScheduler = GPF::kPipelineScheduler;
diff --git a/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp b/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp
index fa4bc73bd2..ff0410f658 100644
--- a/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp
+++ b/experimental/builder/test/conv/ck/unit_instance_to_conv_traits_instances.cpp
@@ -46,7 +46,7 @@ using ::ck_tile::builder::PipelineVersion;
 // ============================================================================
 // Comprehensive Transformation Tests - Per Device Class Template
 // ============================================================================
-// These tests verify the complete InstanceTraits → ConvTraits transformation
+// These tests verify the complete InstanceTraits -> ConvTraits transformation
 // for each forward convolution Device class template.
 // ============================================================================
 
diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp
index e1f6891c58..3dd8273928 100644
--- a/experimental/builder/test/impl/conv_algorithm_types.hpp
+++ b/experimental/builder/test/impl/conv_algorithm_types.hpp
@@ -744,7 +744,7 @@ using ConvAlgorithm_Tile_GroupedConvolutionKernel_StreamK =
                           TileOptimizations_,
                           TileStreamK_>;
 
-// CK Tile depthwise convolution algorithm (no GEMM — direct spatial pipeline)
+// CK Tile depthwise convolution algorithm (no GEMM - direct spatial pipeline)
 using ConvAlgorithm_Tile_DepthwiseConvolutionKernel =
     ConvAlgorithmTemplate<TileDepthwiseConvParams_>;
 
diff --git a/experimental/builder/test/testing_utils.cpp b/experimental/builder/test/testing_utils.cpp
index e9677e5940..1ee5e76f74 100644
--- a/experimental/builder/test/testing_utils.cpp
+++ b/experimental/builder/test/testing_utils.cpp
@@ -21,7 +21,7 @@ namespace ck_tile::test {
 // Wagner-Fischer Algorithm for Computing Edit Distance and Inline Diff
 //
 // OUTPUT FORMAT: [expected|actual] for differences, plain text for matches
-// Example: "hello world" vs "hello earth" → "hello [world|earth]"
+// Example: "hello world" vs "hello earth" -> "hello [world|earth]"
 //
 // This function implements the Wagner-Fischer algorithm (1974), which is the classic
 // dynamic programming solution for computing the minimum edit distance (Levenshtein distance)
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp
index c98e47f4f7..7170a40eb6 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle_conv_v3.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates
 // SPDX-License-Identifier: MIT
 
 #pragma once
@@ -26,7 +26,7 @@ namespace ck {
 //   - Math waves (threads 0..TileMath-1): LDS read + MFMA + CShuffle epilogue
 //
 // The conv-to-GEMM descriptor transforms (which generate heavy VALU ops) execute only on load
-// waves, while math waves see simple LDS layouts — eliminating the VALU/MFMA slot conflict.
+// waves, while math waves see simple LDS layouts - eliminating the VALU/MFMA slot conflict.
 
 template <typename ALayout,
           typename BLayout,
@@ -177,14 +177,14 @@ struct GridwiseGemm_xdl_waveletmodel_cshuffle_conv_v3
     static constexpr auto BlockSize       = TileMathThreadGroupSize;
     static constexpr auto LaunchBlockSize = TileLoadThreadGroupSize + TileMathThreadGroupSize;
 
-    // DirectLoad is not supported — wavelet model requires LDS as the sync boundary
+    // DirectLoad is not supported, wavelet model requires LDS as the sync boundary
     static constexpr bool DirectLoadEnabled = false;
 
     // =========================================================================
     // Wave specialization thread groups
     // =========================================================================
 
-    // Math threads are 0..TileMath-1 (must be first — BlockwiseGemmXdlops_v1 uses
+    // Math threads are 0..TileMath-1 (must be first, BlockwiseGemmXdlops_v1 uses
     // get_thread_local_1d_id() directly for wave index computation)
     struct TileMathThreadGroup
     {
@@ -448,7 +448,7 @@ struct GridwiseGemm_xdl_waveletmodel_cshuffle_conv_v3
         constexpr auto data_lds_bytes = a_block_space_size_aligned * sizeof(ADataType) +
                                         b_block_space_size_aligned * sizeof(BDataType);
 
-        // C shuffle LDS (reuses same memory after GEMM loop — mutually exclusive)
+        // C shuffle LDS (reuses same memory after GEMM loop, mutually exclusive)
         constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
             Base::GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DeviceArch{});
 
@@ -653,7 +653,7 @@ struct GridwiseGemm_xdl_waveletmodel_cshuffle_conv_v3
                 num_k_block_main_loop);
 
             // Match epilogue LDS syncs: RunEpilogue calls block_sync_lds() twice per
-            // SFC access iteration (once before VGPR→LDS, once before LDS→global).
+            // SFC access iteration (once before VGPR->LDS, once before LDS->global).
             // For !TransposeC && !IsMxGemm, the SFC access count equals
             // (MXdlPerWave / CShuffleMXdlPerWavePerShuffle) * (NXdlPerWave /
             // CShuffleNXdlPerWavePerShuffle) because the M2/M4/N2 SFC dimensions have equal
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index aac6d39647..7f1c264bbf 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -1449,21 +1449,21 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
 // Bypasses the SRD's 32-bit offset limit; required when the KV cache exceeds
 // INT32_MAX (2GB) byte offset on the SRD voffset path.
 //
-// !!! M0 PRECONDITION — IMPLICIT INPUT NOT VISIBLE IN OPERAND LIST !!!
+// !!! M0 PRECONDITION - IMPLICIT INPUT NOT VISIBLE IN OPERAND LIST !!!
 //
 //   The LDS destination address is taken from M0 (per AMD CDNA3 ISA §10.3:
 //   `LDS_ADDR = LDSbase + LDSoffset(M0[17:2] * 4) + INST.OFFSET + ThreadID*4`).
 //   M0 does NOT appear as an operand of these instructions or of the inline
-//   asm below — the compiler cannot see the dependency. Caller must:
+//   asm below - the compiler cannot see the dependency. Caller must:
 //
 //     1. Initialize M0 once before the load loop:
 //          `m0_set_with_memory(amd_wave_read_first_lane(lds_byte_offset));`
-//        M0 is SALU-only — `m0_set_with_memory` uses an "s" constraint to
+//        M0 is SALU-only - `m0_set_with_memory` uses an "s" constraint to
 //        enforce this. Direct VALU writes to M0 are illegal.
 //
 //     2. Advance M0 between successive issues:
 //          `m0_inc_with_memory(size_per_issue);`
-//        `size_per_issue` MUST be a multiple of 4 — GLOBAL/FLAT LDS path
+//        `size_per_issue` MUST be a multiple of 4 - GLOBAL/FLAT LDS path
 //        only honors M0[17:2]*4 (dword-aligned), so low 2 bits are silently
 //        dropped (NOTE: this differs from MUBUF buffer_load_lds which uses
 //        M0[15:0] as a raw byte offset).
@@ -1479,7 +1479,7 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
 //
 // Verified instruction emission (HIP 6.4 / clang 19, gfx942 + gfx950):
 //   `global_load_lds_dwordx4` is a single instruction (encoding 0xDDF48000
-//   0x007F0000), NOT software-expanded into 4× dword. Same encoding on both
+//   0x007F0000), NOT software-expanded into 4x dword. Same encoding on both
 //   arches. The opcode is undocumented in CDNA3 ISA spec §13.6.2 but
 //   supported by the LLVM AMDGPU backend.
 //
@@ -1501,7 +1501,7 @@ async_global_load_lds_dwordxn(void* smem, const void* global_addr, bool_constant
 
 // Inline asm: only the global address is an explicit operand. The LDS
 // destination is implicit via M0 (see contract above). `"=r"(smem)` is a
-// SSA scheduling anchor only — `smem` is NOT written by this asm; the
+// SSA scheduling anchor only - `smem` is NOT written by this asm; the
 // load goes to LDS at `M0[17:2]*4 + offset:0 + ThreadID*4`.
 #define CK_TILE_GLOBAL_LOAD_LDS_INSTR(instr)                                 \
     if constexpr(pre_nop)                                                    \
diff --git a/include/ck_tile/core/arch/amd_cluster_load.hpp b/include/ck_tile/core/arch/amd_cluster_load.hpp
index 1af1b6b2ec..19ca7690a6 100644
--- a/include/ck_tile/core/arch/amd_cluster_load.hpp
+++ b/include/ck_tile/core/arch/amd_cluster_load.hpp
@@ -32,7 +32,7 @@ CK_TILE_DEVICE __attribute__((address_space(3))) T* to_lds(T* ptr)
 #endif // __gfx1250__
 
 // Struct specializations for CLUSTER_LOAD_B32/B64/B128.
-// Primary template intentionally undefined — compile error for unsupported sizes.
+// Primary template intentionally undefined - compile error for unsupported sizes.
 template <index_t bytes>
 struct cluster_load;
 
@@ -104,7 +104,7 @@ CK_TILE_DEVICE T cluster_multicast_load(const T* addr, int mask)
 }
 
 // ---------------------------------------------------------------------------
-// CLUSTER_LOAD_ASYNC_TO_LDS_B* — async global→LDS multicast (gfx1250 only)
+// CLUSTER_LOAD_ASYNC_TO_LDS_B* - async global->LDS multicast (gfx1250 only)
 // ---------------------------------------------------------------------------
 // Unlike CLUSTER_LOAD_B*, data lands in LDS (not VGPRs) and is tracked by
 // ASYNCcnt. Wait with s_wait_asynccnt(0) on the requesting wave, then use
@@ -120,7 +120,7 @@ CK_TILE_DEVICE T cluster_multicast_load(const T* addr, int mask)
 // by the hardware instruction (default 0).
 
 // Struct specializations for CLUSTER_LOAD_ASYNC_TO_LDS_B32/B64/B128.
-// Primary template intentionally undefined — compile error for unsupported sizes.
+// Primary template intentionally undefined - compile error for unsupported sizes.
 template <index_t bytes, index_t inst_offset = 0>
 struct cluster_load_async_to_lds;
 
@@ -184,10 +184,10 @@ struct cluster_load_async_to_lds<16, inst_offset>
 };
 
 // Generic wrapper: issues CLUSTER_LOAD_ASYNC_TO_LDS_B* sized to T.
-// `src`         — global source pointer (generic address space; cast to global internally)
-// `lds_dst`     — per-lane LDS destination pointer (must be address_space(3))
-// `mask`        — M0[15:0] WGP participation mask; M0[16] sets early-timeout
-// `inst_offset` — compile-time immediate byte offset added to lds_dst by the hardware
+// `src`         - global source pointer (generic address space; cast to global internally)
+// `lds_dst`     - per-lane LDS destination pointer (must be address_space(3))
+// `mask`        - M0[15:0] WGP participation mask; M0[16] sets early-timeout
+// `inst_offset` - compile-time immediate byte offset added to lds_dst by the hardware
 template <typename T, index_t inst_offset = 0>
 CK_TILE_DEVICE void cluster_multicast_load_async_to_lds(const T* src,
                                                         __attribute__((address_space(3)))
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index 4e94d6e902..87e55183df 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -544,7 +544,7 @@ struct sequence_sort_helper<sequence<Vs...>, Compare, sequence<Idx...>>
     {
         constexpr index_t n = sizeof...(Vs);
         sort_result r{{{Vs...}}, {{Idx...}}};
-        // insertion sort — O(N^2) constexpr steps, O(1) template depth
+        // insertion sort - O(N^2) constexpr steps, O(1) template depth
         for(index_t i = 1; i < n; ++i)
         {
             for(index_t j = i; j > 0 && Compare{}(r.values[j], r.values[j - 1]); --j)
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index 63072ab559..bd69cedbb6 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -467,7 +467,7 @@ struct buffer_view<address_space_enum::global,
     }
 
     // i is offset of T, not X. i should be aligned to X.
-    // mask — M0[15:0] WGP participation mask; M0[16] sets early-timeout.
+    // mask - M0[15:0] WGP participation mask; M0[16] sets early-timeout.
     template <typename X,
               index_t inst_offset = 0,
               typename std::enable_if<
@@ -489,8 +489,8 @@ struct buffer_view<address_space_enum::global,
         const remove_cvref_t<X>* g_src =
             reinterpret_cast<const remove_cvref_t<X>*>(p_uniform_ptr + i + linear_offset);
 
-        // reinterpret_cast changes only the element type (generic→generic, no address-space
-        // change). to_lds then converts generic→address_space(3) using a pragma-guarded
+        // reinterpret_cast changes only the element type (generic->generic, no address-space
+        // change). to_lds then converts generic->address_space(3) using a pragma-guarded
         // C-style cast, matching the pattern used by the rest of the codebase.
         auto* lds_ptr = to_lds(reinterpret_cast<remove_cvref_t<X>*>(smem));
 
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index 14b8b2842a..e996334bfd 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -45,7 +45,7 @@ CK_TILE_DEVICE auto load_tile(const TileWindow_& tile_window,
  *
  * @note This function is a modification of the existing load function.
  *       It has been extended with two additional parameters: it takes a tuple as input
- *       and an elementwise function. For each A = A0, A1… AN, the elementwise function
+ *       and an elementwise function. For each A = A0, A1... AN, the elementwise function
  *       is additionally applied during a single read.
  */
 template <typename... TileWindow_,
diff --git a/include/ck_tile/core/tensor/load_tile_transpose.hpp b/include/ck_tile/core/tensor/load_tile_transpose.hpp
index feb2546d36..eded23ab19 100644
--- a/include/ck_tile/core/tensor/load_tile_transpose.hpp
+++ b/include/ck_tile/core/tensor/load_tile_transpose.hpp
@@ -224,7 +224,7 @@ struct DefaultTranspose
         static constexpr bool suffix_valid_dim1 =
             util::is_sequence_suffix_v<decltype(quad_hs[I1]), decltype(input_hs[I1])>;
 
-        // 3. PS→RHS mapping constraints
+        // 3. PS->RHS mapping constraints
         static constexpr auto input_ps_major = InDstrEncode::ps_to_rhss_major_;
         static constexpr auto input_ps_minor = InDstrEncode::ps_to_rhss_minor_;
 
@@ -249,7 +249,7 @@ struct DefaultTranspose
             util::is_sequence_suffix_v<decltype(shifted_quad_ps_minor0),
                                        decltype(input_ps_minor_last)>;
 
-        // 4. YS→RHS mapping constraints
+        // 4. YS->RHS mapping constraints
         static constexpr auto input_ys_major = InDstrEncode::ys_to_rhs_major_;
         static constexpr auto input_ys_minor = InDstrEncode::ys_to_rhs_minor_;
         static constexpr auto quad_ys_major  = QuadEncoding::ys_to_rhs_major_;
@@ -354,7 +354,7 @@ struct TransposeTileDistributionTraits
         },
         number<InDstrEncode::NDimX>{});
 
-    // for PS→RHS mapping(both major and minor), we need to modify the last element (which is for
+    // for PS->RHS mapping(both major and minor), we need to modify the last element (which is for
     // thread distr) of the major sequence
     static constexpr auto dst_ps_to_rhss_major = generate_tuple(
         // for major because of dst_out_hs_lengthss is reversed, this index also need to be reversed
diff --git a/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk b/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk
index 5f73d4934a..2757760d75 100644
--- a/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk
+++ b/include/ck_tile/core/tensor/load_tile_transpose.hpp.bk
@@ -132,7 +132,7 @@ struct DefaultTranspose
         static constexpr bool suffix_valid_dim1 =
             util::is_sequence_suffix_v<decltype(quad_hs[I1]), decltype(input_hs[I1])>;
 
-        // 3. PS→RHS mapping constraints
+        // 3. PS->RHS mapping constraints
         static constexpr auto input_ps_major = InDstrEncode::ps_to_rhss_major_;
         static constexpr auto input_ps_minor = InDstrEncode::ps_to_rhss_minor_;
 
@@ -157,7 +157,7 @@ struct DefaultTranspose
             util::is_sequence_suffix_v<decltype(shifted_quad_ps_minor0),
                                        decltype(input_ps_minor_last)>;
 
-        // 4. YS→RHS mapping constraints
+        // 4. YS->RHS mapping constraints
         static constexpr auto input_ys_major = InDstrEncode::ys_to_rhs_major_;
         static constexpr auto input_ys_minor = InDstrEncode::ys_to_rhs_minor_;
         static constexpr auto quad_ys_major  = QuadEncoding::ys_to_rhs_major_;
@@ -264,7 +264,7 @@ struct TransposeTileDistributionTraits
         },
         number<InDstrEncode::NDimX>{});
 
-    // for PS→RHS mapping(both major and minor), we need to modify the last element (which is for
+    // for PS->RHS mapping(both major and minor), we need to modify the last element (which is for
     // thread distr) of the major sequence
     static constexpr auto dst_ps_to_rhss_major = generate_tuple(
         // for major because of dst_out_hs_lengthss is reversed, this index also need to be reversed
diff --git a/include/ck_tile/core/tensor/tile_scatter_gather.hpp b/include/ck_tile/core/tensor/tile_scatter_gather.hpp
index 45131abb97..1f34261526 100644
--- a/include/ck_tile/core/tensor/tile_scatter_gather.hpp
+++ b/include/ck_tile/core/tensor/tile_scatter_gather.hpp
@@ -405,7 +405,7 @@ struct tile_scatter_gather
         // register window to be reused as scratch and scattered the SRD writes
         // across two conditional branches, which gfx950's packed
         // buffer_load_dwordx4 issue window doesn't tolerate (gfx942 absorbs it
-        // via per-tile single-dword loads). __builtin_assume is hint-only —
+        // via per-tile single-dword loads). __builtin_assume is hint-only -
         // no branch, no scratch SGPRs, no codegen impact.
         __builtin_assume(size > 0);
         using BufType                         = remove_cvref_t<decltype(bottom_tensor_view_.buf_)>;
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index 7e86af6f2a..d3dd6596e5 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -190,12 +190,12 @@ struct tile_window_with_static_distribution
     /**
      * @brief Load tile with elementwise function
      *
-     * @note Load tile with elementwise — during value loading, an
-     *       elementwise function is executed for each A0, A1, … AN.
-     *       The values A0, A1, … AN are read by the same thread. In this way, we
+     * @note Load tile with elementwise - during value loading, an
+     *       elementwise function is executed for each A0, A1, ... AN.
+     *       The values A0, A1, ... AN are read by the same thread. In this way, we
      *       reduce the amount of information loaded into the registers.
      *       The same thread, during vectorized reading, accesses the same set of
-     *       data from A0, A1, A2, … AN.
+     *       data from A0, A1, A2, ... AN.
      */
     template <typename... TileWindow_,
               typename ElementWise_,
diff --git a/include/ck_tile/core/tensor/tile_window.hpp.bk b/include/ck_tile/core/tensor/tile_window.hpp.bk
index 6272375a07..33c7f9ae78 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp.bk
+++ b/include/ck_tile/core/tensor/tile_window.hpp.bk
@@ -184,12 +184,12 @@ struct tile_window_with_static_distribution
     /**
      * @brief Load tile with elementwise function
      *
-     * @note Load tile with elementwise — during value loading, an
-     *       elementwise function is executed for each A0, A1, … AN.
-     *       The values A0, A1, … AN are read by the same thread. In this way, we
+     * @note Load tile with elementwise - during value loading, an
+     *       elementwise function is executed for each A0, A1, ... AN.
+     *       The values A0, A1, ... AN are read by the same thread. In this way, we
      *       reduce the amount of information loaded into the registers.
      *       The same thread, during vectorized reading, accesses the same set of
-     *       data from A0, A1, A2, … AN.
+     *       data from A0, A1, A2, ... AN.
      */
     template <typename... TileWindow_,
               typename ElementWise_,
diff --git a/include/ck_tile/core/utility/debug.hpp b/include/ck_tile/core/utility/debug.hpp
index 0da08271eb..23c6215a5e 100644
--- a/include/ck_tile/core/utility/debug.hpp
+++ b/include/ck_tile/core/utility/debug.hpp
@@ -149,7 +149,7 @@ struct CK_PRINTF_WARP0 : public CK_PRINTF<Ts...>
  *   {
  *       [[maybe_unused]] AsmScopeMarker marker;   // Emits CK_ASM_SCOPE_START
  *       // ... code you want to delimit in assembly ...
- *   } // marker goes out of scope → Emits CK_ASM_SCOPE_END
+ *   } // marker goes out of scope -> Emits CK_ASM_SCOPE_END
  *
  */
 struct AsmScopeMarker
diff --git a/include/ck_tile/core/utility/type_traits.hpp b/include/ck_tile/core/utility/type_traits.hpp
index a9584bd592..1273062612 100644
--- a/include/ck_tile/core/utility/type_traits.hpp
+++ b/include/ck_tile/core/utility/type_traits.hpp
@@ -12,7 +12,7 @@
 
 namespace ck_tile {
 
-// `always_false_v<T...>` — a value-template that is always `false` but whose
+// `always_false_v<T...>` - a value-template that is always `false` but whose
 // evaluation is deferred until template instantiation. The canonical use is
 // inside the `else` arm of an `if constexpr` chain or under an arch-gated
 // `#if` to fire a `static_assert` ONLY when the offending instantiation is
diff --git a/include/ck_tile/host/reference/reference_topk.hpp b/include/ck_tile/host/reference/reference_topk.hpp
index 8c9c87e3ee..d51373b1b8 100644
--- a/include/ck_tile/host/reference/reference_topk.hpp
+++ b/include/ck_tile/host/reference/reference_topk.hpp
@@ -16,7 +16,7 @@ namespace ck_tile {
 /*
     similiar to torch.topk()
     x (Tensor) – the input tensor.
-    k (int) – the k in “top-k”
+    k (int) – the k in "top-k"
     dim (int, optional) – the dimension to sort along
     largest (bool, optional) – largest or smallest elements
     sorted (bool, optional) – elements in sorted order or not
diff --git a/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp b/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp
index 988daf1c27..bb0d1f7bb3 100644
--- a/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp
+++ b/include/ck_tile/ops/batched_contraction/kernel/batched_contraction_kernel.hpp
@@ -33,7 +33,7 @@
  * **E[G₀,G₁,...,M₀,M₁,...,N₀,N₁,...] = epilogue_op(C, D₀, D₁, D₂, ...)**
  *
  * Where:
- * **C[G₀,G₁,...,M₀,M₁,...,N₀,N₁,...] = Σ_{K₀,K₁,...} A[G₀,G₁,...,M₀,M₁,...,K₀,K₁,...] ×
+ * **C[G₀,G₁,...,M₀,M₁,...,N₀,N₁,...] = Σ_{K₀,K₁,...} A[G₀,G₁,...,M₀,M₁,...,K₀,K₁,...] x
  * B[G₀,G₁,...,N₀,N₁,...,K₀,K₁,...]**
  *
  * Where:
@@ -50,16 +50,16 @@
  * to the dot product computation in matrix multiplication.
  *
  * **Dimension Flattening Strategy**:
- * - **M dimensions** (from tensor A) → Flattened into matrix rows (M_total)
- * - **N dimensions** (from tensor B) → Flattened into matrix columns (N_total)
- * - **K dimensions** (contraction dims) → Flattened into inner dimension (K_total)
- * - **G dimensions** (batch dims) → Handled through batch processing
+ * - **M dimensions** (from tensor A) -> Flattened into matrix rows (M_total)
+ * - **N dimensions** (from tensor B) -> Flattened into matrix columns (N_total)
+ * - **K dimensions** (contraction dims) -> Flattened into inner dimension (K_total)
+ * - **G dimensions** (batch dims) -> Handled through batch processing
  *
  * **Mathematical Transformation**:
  * ```
- * Original: E[g,m₀,m₁,n₀,n₁] = Σ_{k₀,k₁} A[g,m₀,m₁,k₀,k₁] × B[g,n₀,n₁,k₀,k₁]
- * Flattened: E[g,M,N] = Σ_K A[g,M,K] × B[g,N,K]  (where M=m₀×m₁, N=n₀×n₁, K=k₀×k₁)
- * GEMM Form: E = A × Bᵀ
+ * Original: E[g,m₀,m₁,n₀,n₁] = Σ_{k₀,k₁} A[g,m₀,m₁,k₀,k₁] x B[g,n₀,n₁,k₀,k₁]
+ * Flattened: E[g,M,N] = Σ_K A[g,M,K] x B[g,N,K]  (where M=m₀xm₁, N=n₀xn₁, K=k₀xk₁)
+ * GEMM Form: E = A x Bᵀ
  *
  * **Why This Approach Is Optimal**:
  * Rather than implementing tensor contraction from scratch, this kernel leverages the highly
diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp
index d291244aa4..f4d4c44811 100644
--- a/include/ck_tile/ops/common/streamk_common.hpp
+++ b/include/ck_tile/ops/common/streamk_common.hpp
@@ -256,8 +256,8 @@ struct StreamKReductionOps
 ///
 /// @tparam TilePartitioner_ Partitioner type (persistent or non-persistent specialization).
 /// @param tile_partitioner The partitioner instance from kernel args.
-/// @param dp_tile_func     Callable(index_t tile_idx) — processes one full DP tile.
-/// @param sk_func          Callable(index_t sk_cta_idx) — runs the StreamK loop for this CTA.
+/// @param dp_tile_func     Callable(index_t tile_idx) - processes one full DP tile.
+/// @param sk_func          Callable(index_t sk_cta_idx) - runs the StreamK loop for this CTA.
 template <typename TilePartitioner_, typename DPTileFunc, typename SKFunc>
 CK_TILE_DEVICE void
 StreamKDispatch(const TilePartitioner_& tile_partitioner, DPTileFunc dp_tile_func, SKFunc sk_func)
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
index 3f5e32383d..a6b3f86b94 100644
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -796,7 +796,7 @@ struct PassThrough
         {
             y = ck_tile::type_convert<raw_t<Y>>(x);
         }
-        /*  otherwise (r-value or const)     → do nothing  */
+        /*  otherwise (r-value or const)     -> do nothing  */
     }
 
     template <typename Y, typename X>
diff --git a/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp b/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp
index 683bfe7377..c9351a783f 100644
--- a/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp
+++ b/include/ck_tile/ops/epilogue/chainer/cshuffle_epilogue_schedule.hpp
@@ -13,17 +13,19 @@ namespace ck_tile {
 /// @par Purpose
 ///     Each tag corresponds to a pre-built schedule, these are used to select a schedule
 
-/// Standard epilogue schedule: Slice → CastStore → Load → ApplyD → Store → Move
+/// Standard epilogue schedule: Slice -> CastStore -> Load -> ApplyD -> Store -> Move
 struct DefaultScheduleTag
 {
 };
 
-/// RowCol quantization schedule: Slice → ScaleWindow → CastStore → Load → ApplyD → Store → Move
+/// RowCol quantization schedule: Slice -> ScaleWindow -> CastStore -> Load -> ApplyD -> Store ->
+/// Move
 struct RowColQuantScheduleTag
 {
 };
 
-/// Tensor quantization schedule: Slice → ScaleScalar → CastStore → Load → ApplyD → Store → Move
+/// Tensor quantization schedule: Slice -> ScaleScalar -> CastStore -> Load -> ApplyD -> Store ->
+/// Move
 struct TensorQuantScheduleTag
 {
 };
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
index 7aff21530d..07220f4457 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp
@@ -33,17 +33,17 @@ struct FmhaBwdWorkspaceManager
     // CPU workspace (prepared by host, read-only for kernels):
 
     // index_t nsplits[batch or 1]
-    //   — per-batch nsplits array (batch element in deterministic group mode)
+    //   - per-batch nsplits array (batch element in deterministic group mode)
 
     // [OPTIONAL, only for deterministic group mode]
     // long_index_t dq_acc_offsets[batch]
-    //   — per-batch offset array
+    //   - per-batch offset array
 
     // GPU WORKSPACE BELOW (read & written by kernels):
 
     // [OPTIONAL, only for !kUseQrQtrDorPipeline]
     // AccDataType dq_acc[total_elements]
-    //   — dq_acc compact buffer (zeroed if necessary)
+    //   - dq_acc compact buffer (zeroed if necessary)
     //   - total_elements = sum_i(nhead * nsplits_i * seqq_i) * hdim_q
     //   - Layout within each batch: [nhead, nsplits_i, seqq_i, hdim_q]
     //   - note: use physical (including padding) length for seqq_i for group mode
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp
index 3f284c5c91..444fdef69b 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp
@@ -162,7 +162,7 @@ CK_TILE_HOST_DEVICE void kv_offset_array_transform(const IndexArrayType& physica
     const index_t& thread_coord_start   = coord_vec[kCoordAxis];
     constexpr index_t kInPageOffsetMask = (1 << kLog2PageSize) - 1;
 
-    // Addressing strategy — four cases controlled by (kPageBlockSize vs kN0, kUseGlobalLoad_):
+    // Addressing strategy - four cases controlled by (kPageBlockSize vs kN0, kUseGlobalLoad_):
     //
     //   Case 1: kPageBlockSize >= kN0
     //     SRD is rebased per-tile to the page base (rebase_{k,v}_window in caller).
@@ -733,24 +733,24 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync
         // The decomposition pattern differs by memory layout:
         //
         // VECTORIZED_LAYOUT (ColumnMajor, custom distribution):
-        //   3D decomposition: K = K2 × K0 × K1
+        //   3D decomposition: K = K2 x K0 x K1
         //   - K2 (V_KIterOuter): Outer iteration count
         //   - K0 (V_KLanes):     Lanes for K dimension (matches GEMM kABKLane)
         //   - K1 (V_KIterInner): Vector load size (matches GEMM kKPerThread)
         //   - hs_lengthss_[I1] = {K2, K0, K1}, size = 3 (or {K0, K1} size = 2 if no outer iter)
         //
         // LINEAR_LAYOUT ColumnMajor (base class distribution):
-        //   2D decomposition: K = K0 × K1
+        //   2D decomposition: K = K0 x K1
         //   - K0: Lanes for K dimension (may not match GEMM kABKLane)
         //   - K1: Vector load size
         //   - hs_lengthss_[I1] = {K0, K1}, size = 2
         //
         // LINEAR_LAYOUT RowMajor (base class distribution):
-        //   4D decomposition: K = K0 × K1 × K2 × K3 (uses shuffle_tile for GEMM alignment)
-        //   3D decomposition: K = K0 × K1 × K2 (fallback case)
+        //   4D decomposition: K = K0 x K1 x K2 x K3 (uses shuffle_tile for GEMM alignment)
+        //   3D decomposition: K = K0 x K1 x K2 (fallback case)
         //   - Page lookup uses Y-space's last dimension only (inner iteration)
         //
-        // V_PageIdxRepeat = total number of page lookups per thread = V_KIterOuter × V_KIterInner
+        // V_PageIdxRepeat = total number of page lookups per thread = V_KIterOuter x V_KIterInner
         constexpr index_t V_KIterInner = VDstrEncode::hs_lengthss_[I1].back();
 
         // Compute V_KIterOuter and V_KLanes based on memory layout and K decomposition
@@ -938,7 +938,7 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync
                     v_physical_pages, stride_v, page_stride_v, v_coord, v_offsets, current_seq_k);
             }
 
-            // v_offsets semantics — see the four-case addressing-strategy block above
+            // v_offsets semantics - see the four-case addressing-strategy block above
             // kNeedFullOffset in kv_offset_array_transform. Three cases reach this lambda:
             //   Case 1 (kPageBlockSize >= kN0):           within-page offset; page base in SRD.
             //   Case 2 (page_size < kN0, kUseGlobalLoad): within-page offset; page base computed
@@ -970,7 +970,7 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync
 
         // Initial V SRD rebase. Single source of truth: rebase_v_window's own
         // `if constexpr(kPageBlockSize >= kN0)` makes this a no-op for case 2/3.
-        // Do not re-add an outer guard here — it would duplicate the inner check
+        // Do not re-add an outer guard here - it would duplicate the inner check
         // and drift if the lambda's gating condition ever changes.
         rebase_v_window(v_dram_window, v_physical_pages[number<0>{}]);
 
@@ -1015,12 +1015,12 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync
             // To support smaller page sizes (cross-page tiles), need:
             //
             // 1. K descale: Load per-token k_descale_vec[NRepeat] based on k_physical_pages[k0]
-            //    - After GEMM0 (S = Q × K^T), apply column-wise scaling: S[:,j] *= k_descale[j]
+            //    - After GEMM0 (S = Q x K^T), apply column-wise scaling: S[:,j] *= k_descale[j]
             //    - Requires modifying s_acc_element_func to accept column index
             //
             // 2. V descale: Load per-token v_descale_vec[V_PageIdxRepeat] based on
             // v_physical_pages[k0]
-            //    - Before GEMM1 (O = P × V), apply row-wise scaling to P: P[i,j] *= v_descale[j]
+            //    - Before GEMM1 (O = P x V), apply row-wise scaling to P: P[i,j] *= v_descale[j]
             //    - Or pre-scale V in LDS (more complex)
             //
             // 3. K and V may be on different pages for the same token index, so need separate
@@ -1554,7 +1554,7 @@ struct BlockFmhaBatchPrefillPipelineQRKSVSAsync
                     k_dram_window.update_physical_pages(k_physical_pages);
                 rebase_k_window(k_dram_window, k_physical_pages[number<0>{}]);
 
-                // After sink→window transition (i_total_loops == num_sink_loop), V window
+                // After sink->window transition (i_total_loops == num_sink_loop), V window
                 // was advanced by kN0 (one normal iter), but current_seq_k jumped by k_advance
                 // = seqlen_k_start - sink_seq_end + kN0 > kN0.  Re-init V to current_seq_k.
                 if constexpr(kHasSink)
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp
index ac868ce4b8..fbe2bd819e 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_v3_pipeline.hpp
@@ -33,7 +33,7 @@ namespace ck_tile {
 //   warp gemm calls = MIterPerWarp * NIterPerWarp * KIterPerWarp
 //   MFMAs per call  = WarpGemm::kK / WarpGemm::WarpGemmAttribute::Impl::kK  (kKIter)
 //
-// For bf16/fp16 kKIter=1; for fp8 kKIter=2 (K=32 warp gemm wraps 2× K=16 MFMA).
+// For bf16/fp16 kKIter=1; for fp8 kKIter=2 (K=32 warp gemm wraps 2x K=16 MFMA).
 // ---------------------------------------------------------------------------
 template <typename BlockGemm>
 static constexpr ck_tile::index_t block_gemm_mfma_count_v =
@@ -65,7 +65,7 @@ struct CoreLoopSchedulerDefaultBase
 {
     using Params = CoreLoopSchedulingParams<PipelineProblem>;
 
-    // Phase helper: GEMM0 compute (QK matmul) — MFMA interleaved with TRANS + VALU
+    // Phase helper: GEMM0 compute (QK matmul) - MFMA interleaved with TRANS + VALU
     CK_TILE_DEVICE static constexpr void schedule_gemm0_compute()
     {
         static_for<0, Params::kMfmaPerWarpGemm0, 1>{}([&](auto) {
@@ -75,7 +75,7 @@ struct CoreLoopSchedulerDefaultBase
         });
     }
 
-    // Phase helper: GEMM1 compute (PV matmul) — optional packed-FP32 preamble + MFMA/VALU
+    // Phase helper: GEMM1 compute (PV matmul) - optional packed-FP32 preamble + MFMA/VALU
     CK_TILE_DEVICE static constexpr void schedule_gemm1_compute()
     {
 #if !CK_TILE_DISABLE_PACKED_FP32
@@ -87,7 +87,7 @@ struct CoreLoopSchedulerDefaultBase
         });
     }
 
-    // Phase helper: load phase (memory/LDS loads) — VALU + SALU
+    // Phase helper: load phase (memory/LDS loads) - VALU + SALU
     CK_TILE_DEVICE static constexpr void schedule_load_phase()
     {
         __builtin_amdgcn_sched_group_barrier(LLVMSchedGroupMask::VALU, 2, 0);
@@ -119,21 +119,21 @@ struct CoreLoopSchedulerDefaultBase
 template <typename PipelineProblem, typename QDataType, typename KDataType, typename VDataType>
 struct CoreLoopSchedulerImpl;
 
-// bf16 — uses default base
+// bf16 - uses default base
 template <typename PipelineProblem>
 struct CoreLoopSchedulerImpl<PipelineProblem, ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
     : CoreLoopSchedulerDefaultBase<PipelineProblem>
 {
 };
 
-// fp16 — uses default base
+// fp16 - uses default base
 template <typename PipelineProblem>
 struct CoreLoopSchedulerImpl<PipelineProblem, ck_tile::half_t, ck_tile::half_t, ck_tile::half_t>
     : CoreLoopSchedulerDefaultBase<PipelineProblem>
 {
 };
 
-// fp8 — asymmetric GEMM0 scheduling for 2× K iterations
+// fp8 - asymmetric GEMM0 scheduling for 2x K iterations
 //
 // FP8 GEMM0 has 16 MFMAs (kKIter=2) but the same TRANS work as bf16/fp16 (softmax
 // exp count is dtype-independent).  The uniform (MFMA:1, TRANS:2, VALU:2) pattern
@@ -141,8 +141,8 @@ struct CoreLoopSchedulerImpl<PipelineProblem, ck_tile::half_t, ck_tile::half_t,
 // with nothing to interleave (7 back-to-back MFMAs).
 //
 // Fix: split into two halves matching the natural K iteration boundary:
-//   K iter 0 (MFMAs 1-8):  TRANS-heavy — softmax exp + add reduction chain
-//   K iter 1 (MFMAs 9-16): VALU-heavy  — P scale + cvt_pk_fp8 + o_acc rescale
+//   K iter 0 (MFMAs 1-8):  TRANS-heavy - softmax exp + add reduction chain
+//   K iter 1 (MFMAs 9-16): VALU-heavy  - P scale + cvt_pk_fp8 + o_acc rescale
 template <typename PipelineProblem>
 struct CoreLoopSchedulerImpl<PipelineProblem, ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::fp8_t>
     : CoreLoopSchedulerDefaultBase<PipelineProblem>
@@ -165,12 +165,12 @@ struct CoreLoopSchedulerImpl<PipelineProblem, ck_tile::fp8_t, ck_tile::fp8_t, ck
         });
     }
 
-    // Phase helper: GEMM1 compute (PV matmul) — asymmetric for fmha_alu0 data dependency
+    // Phase helper: GEMM1 compute (PV matmul) - asymmetric for fmha_alu0 data dependency
     //
     // fmha_alu0 runs during PV GEMM on the OTHER sp buffer:
     //   v_perm (byte packing) + v_max3 (row max) + permlane + v_fma (sp_delta)
     //
-    // The v_fma chain depends on the serial max3→permlane→max→mul chain, creating
+    // The v_fma chain depends on the serial max3->permlane->max->mul chain, creating
     // a data dependency gap around MFMAs 8-11.  Use a looser VALU constraint for the
     // second half to give the scheduler freedom to place v_fma where available.
     CK_TILE_DEVICE static constexpr void schedule_gemm1_compute()
@@ -190,7 +190,7 @@ struct CoreLoopSchedulerImpl<PipelineProblem, ck_tile::fp8_t, ck_tile::fp8_t, ck
         });
     }
 
-    // Must override schedule() — static methods have no virtual dispatch
+    // Must override schedule() - static methods have no virtual dispatch
     template <ck_tile::index_t WaveGroup, ck_tile::index_t Phase>
     CK_TILE_DEVICE static constexpr void schedule(ck_tile::number<WaveGroup>,
                                                   ck_tile::number<Phase>)
diff --git a/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp b/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp
index b03046d192..48abbd3594 100644
--- a/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp
+++ b/include/ck_tile/ops/gemm/block/block_wp_asmem_breg_creg.hpp
@@ -104,7 +104,7 @@ struct BlockWeightPreshuffleASmemBRegCReg
     {
         constexpr auto a_load_dstr = make_static_tile_distribution(MakeABlockDistributionEncode());
 
-        // create MIterPerWarp × KIterPerWarp window
+        // create MIterPerWarp x KIterPerWarp window
         return generate_tuple(
             [&](auto kIter) {
                 return generate_tuple(
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index 8405606ddf..cdfbafc6c0 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -496,10 +496,10 @@ struct GemmClusterTilePartitioner
 
 
     EXAMPLE CONFIGURATION:
-    - Cluster dimensions: ClusterM = 2, ClusterN = 2 (2×2 cluster)
-    - Grid dimensions:    GridM = 6, GridN = 4 (6×4 output tiles)
-    - Number of clusters: (6/2) × (4/2) = 3 × 2 = 6 clusters
-    - Blocks per cluster: 2 × 2 = 4 blocks
+    - Cluster dimensions: ClusterM = 2, ClusterN = 2 (2x2 cluster)
+    - Grid dimensions:    GridM = 6, GridN = 4 (6x4 output tiles)
+    - Number of clusters: (6/2) x (4/2) = 3 x 2 = 6 clusters
+    - Blocks per cluster: 2 x 2 = 4 blocks
 
     The tables below show which BLOCK (identified by its flattened cluster_id) processes
     each output TILE position (tile_m, tile_n). Values 0-5 represent the 6
@@ -515,7 +515,7 @@ struct GemmClusterTilePartitioner
 
     TILE ASSIGNMENT (each cell shows which cluster processes that tile):
 
-             N→ 0    1    2    3
+             N-> 0    1    2    3
            ┌────────────────────────┐
       M  0 │ │  0  │  0  │  3  │  3  │
            │ ├────────────────────┤
@@ -548,7 +548,7 @@ struct GemmClusterTilePartitioner
 
     TILE ASSIGNMENT (interleaved along both M and N):
 
-             N→ 0    1    2    3
+             N-> 0    1    2    3
            ┌────────────────────────┐
       M  0 │ │  0  │  3  │  0  │  3  │
            │ ├────────────────────┤
@@ -581,7 +581,7 @@ struct GemmClusterTilePartitioner
 
     TILE ASSIGNMENT (interleaved along M, contiguous along N):
 
-             N→ 0    1    2    3
+             N-> 0    1    2    3
            ┌────────────────────────┐
       M  0 │ │  0  │  0  │  3  │  3  │
            │ ├────────────────────┤
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index 2d757b0b54..8ebc8fb822 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -504,21 +504,21 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
             // initialize C
             tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
 
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             auto elementwise_As_res =
                 load_tile_with_elementwise(a_copy_dram_window, a_element_func);
 
-            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each A - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
 
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             auto elementwise_Bs_res =
                 load_tile_with_elementwise(b_copy_dram_window, b_element_func);
 
-            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each B - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index b67cda5618..c1d5f6e74a 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -373,11 +373,11 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                 },
                 number<AsLayout::size()>{});
 
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             auto elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func);
 
-            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each A - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(a_tile_windows, a_dram_tile_window_step);
 
@@ -392,11 +392,11 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                 },
                 number<AsLayout::size()>{});
 
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             auto elementwise_Bs_res = load_tile_with_elementwise(b_tile_windows, b_element_func);
 
-            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each B - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(b_tile_windows, b_dram_tile_window_step);
 
@@ -502,8 +502,8 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                 Base::LocalPrefill(b_copy_lds_window1, elementwise_Bs_res);
             }
 
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             elementwise_As_res = load_tile_with_elementwise(a_tile_windows, a_element_func);
             move_tile_window(a_tile_windows, a_dram_tile_window_step);
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
index 6d4b6a7758..a1f1cfce50 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -274,19 +274,19 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
             auto MemoryOpsStep = [&](auto idx) {
                 // Memory read half here.
 
-                // Load tile — during value loading, an elementwise function is executed for each
-                // A0, A1, … AN. The values A0, A1, … AN are read by the same thread.
+                // Load tile - during value loading, an elementwise function is executed for each
+                // A0, A1, ... AN. The values A0, A1, ... AN are read by the same thread.
                 elementwise_As_res = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
 
-                // Move each A — the enhanced function move_tile_window is executed, which takes a
+                // Move each A - the enhanced function move_tile_window is executed, which takes a
                 // tuple as input.
                 move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
 
-                // Load tile — during value loading, an elementwise function is executed for each
-                // B0, B1, … BN. The values B0, B1, … BN are read by the same thread.
+                // Load tile - during value loading, an elementwise function is executed for each
+                // B0, B1, ... BN. The values B0, B1, ... BN are read by the same thread.
                 elementwise_Bs_res = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
 
-                // Move each B — the enhanced function move_tile_window is executed, which takes a
+                // Move each B - the enhanced function move_tile_window is executed, which takes a
                 // tuple as input.
                 move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index 1f87ec6bff..a55caf9615 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -358,19 +358,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
 
             // prefetch
             // global read 0
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             a_block_tiles.at(I0{}) = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
 
-            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each A - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
 
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             b_block_tiles.at(I0{}) = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
 
-            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each B - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
@@ -670,19 +670,19 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
             // prefetch
             // global read 0
 
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             a_block_tiles.at(I0{}) = load_tile_with_elementwise(a_copy_dram_window, a_element_func);
 
-            // Move each A — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each A - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
 
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             b_block_tiles.at(I0{}) = load_tile_with_elementwise(b_copy_dram_window, b_element_func);
 
-            // Move each B — the enhanced function move_tile_window is executed, which takes a tuple
+            // Move each B - the enhanced function move_tile_window is executed, which takes a tuple
             // as input.
             move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index 105ead63dc..8efb7b7ac1 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -252,22 +252,22 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1<Pr
 
             // prefetch
             // global read 0
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             auto elementwise_As_res =
                 load_tile_with_elementwise(as_copy_dram_window, a_element_func);
 
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             auto elementwise_Bs_res =
                 load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
             {
                 // move to 1
-                // Move each A — the enhanced function move_tile_window is executed, which takes a
+                // Move each A - the enhanced function move_tile_window is executed, which takes a
                 // tuple as input.
                 move_tile_window(as_copy_dram_window, a_dram_tile_window_step);
-                // Move each B — the enhanced function move_tile_window is executed, which takes a
+                // Move each B - the enhanced function move_tile_window is executed, which takes a
                 // tuple as input.
                 move_tile_window(bs_copy_dram_window, b_dram_tile_window_step);
 
@@ -465,22 +465,22 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1<Pr
 
             // prefetch
             // global read 0
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             auto elementwise_As_res =
                 load_tile_with_elementwise(as_copy_dram_window, a_element_func);
 
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             auto elementwise_Bs_res =
                 load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
             {
                 // move to 1
-                // Move each A — the enhanced function move_tile_window is executed, which takes a
+                // Move each A - the enhanced function move_tile_window is executed, which takes a
                 // tuple as input.
                 move_tile_window(as_copy_dram_window, a_dram_tile_window_step);
-                // Move each B — the enhanced function move_tile_window is executed, which takes a
+                // Move each B - the enhanced function move_tile_window is executed, which takes a
                 // tuple as input.
                 move_tile_window(bs_copy_dram_window, b_dram_tile_window_step);
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index dd17acb9e9..218bb7cd40 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -216,12 +216,12 @@ struct GemmPipelineAGmemBGmemCRegV2 : public BaseGemmPipelineAGmemBGmemCRegV2<Pr
 
             // prefetch
             // global read 0
-            // Load tile — during value loading, an elementwise function is executed for each A0,
-            // A1, … AN. The values A0, A1, … AN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each A0,
+            // A1, ... AN. The values A0, A1, ... AN are read by the same thread.
             auto elementwise_As_res =
                 load_tile_with_elementwise(as_copy_dram_window, a_element_func);
-            // Load tile — during value loading, an elementwise function is executed for each B0,
-            // B1, … BN. The values B0, B1, … BN are read by the same thread.
+            // Load tile - during value loading, an elementwise function is executed for each B0,
+            // B1, ... BN. The values B0, B1, ... BN are read by the same thread.
             auto elementwise_Bs_res =
                 load_tile_with_elementwise(bs_copy_dram_window, b_element_func);
 
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_params.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_params.hpp
index 7e98096063..ee9f7427c0 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_params.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_params.hpp
@@ -1,4 +1,4 @@
-// Copyright © Advanced Micro Devices, Inc. or its affiliates.
+// Copyright (c) Advanced Micro Devices, Inc. or its affiliates.
 // SPDX-License-Identifier: MIT
 
 #pragma once
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
index af47b9bc67..8190c1e14c 100644
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -419,7 +419,7 @@ struct AQuantBlockUniversalGemmAsBsCr
             // Track which KRepeat chunk is currently loaded
             index_t current_k_repeat_loaded = -1;
 
-            // Restructured loop: M → N → QScale → KIterPerQScale
+            // Restructured loop: M -> N -> QScale -> KIterPerQScale
             static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                 static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                     // Iterate over quantization groups
diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
index 00e50af5d8..b277780d6a 100644
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -999,7 +999,7 @@ struct QuantGemmKernel
                     // Number of K-dimension quantization groups per block
                     constexpr auto bqk_per_block = TilePartitioner::KPerBlock / BQuantGroupSize::kK;
 
-                    // The pre-shuffled layout flattens warp_n ×
+                    // The pre-shuffled layout flattens warp_n x
                     // bqk_per_block scales per row, Padded up to warp_size
                     // to ensure coalesced memory access.
                     constexpr auto tile_window_width =
@@ -1007,7 +1007,7 @@ struct QuantGemmKernel
 
                     // Adapts based on fine vs coarse quantization granularity:
                     //   - Fine-grained (BQuantGroupSize::kN < warp_n):
-                    //       Multiple quant groups per warp → fewer rows needed per block.
+                    //       Multiple quant groups per warp -> fewer rows needed per block.
                     //       height = block_n / warp_per_group
                     //
                     //   - Coarse-grained (BQuantGroupSize::kN >= warp_n):
@@ -1329,7 +1329,7 @@ struct QuantGemmKernel
 
         if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
         {
-            // For RowMajor C, M is the row dimension — check M alignment here because
+            // For RowMajor C, M is the row dimension - check M alignment here because
             // ALayout=RowMajor does not check M (it only checks K), leaving a gap for
             // the RowMajorA + RowMajorC combination.
             if(kargs.M % TilePartitioner::MPerBlock != 0 && GemmPipeline::kPadM == false &&
diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
index 48c27945b3..513c106de3 100644
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
@@ -240,7 +240,7 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding
                 // Each warp processes multiple scales: WarpGemm::kN / NPerQ scales per warp.
                 //
                 // Example: NPerQ=8, WarpGemm::kN=16, KPerQ=128, BlockGemmShape::kK=256
-                //          → 2 scales per warp in N, 2 K-groups per block
+                //          -> 2 scales per warp in N, 2 K-groups per block
 
                 // N1: Number of K-dimension quantization groups per block,
                 //      Each K-group of KPerQ elements shares the same scale.
@@ -280,7 +280,7 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding
                 // Some warps share the same scale (KR > 1 creates warp grouping).
                 //
                 // Example: NPerQ=32, WarpGemm::kN=16, NWarps=4
-                //          → KR=2 (2 warps share same scale), K1=2 (2 unique scale groups)
+                //          -> KR=2 (2 warps share same scale), K1=2 (2 unique scale groups)
 
                 // KR: Number of warps sharing the same scale
                 // K1: Number of distinct warp groups (unique scales)
@@ -318,7 +318,7 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding
                 // All warps share the same scale value for their N-tiles.
                 //
                 // Example: NPerQ=128, WarpGemm::kN=16, NWarps=4
-                //          → 128 >= 16*4=64, so all 4 warps use the same scale
+                //          -> 128 >= 16*4=64, so all 4 warps use the same scale
 
                 // N1: K-dimension quantization groups
                 // N0: Minimal (1) since scale is shared across N
@@ -356,18 +356,18 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding
             ///    - Each warp processes multiple scales (WarpGemm::kN / NPerQ scales per warp)
             ///    - Distribution includes explicit replication factor (XR = NPerQ) for scale
             ///    broadcast
-            ///    - Example: NPerQ=8, WarpGemm::kN=16, NWarps=4 → 2 scales per warp
+            ///    - Example: NPerQ=8, WarpGemm::kN=16, NWarps=4 -> 2 scales per warp
             ///
             /// 2. Medium-grained quantization (WarpGemm::kN <= NPerQ <= WarpGemm::kN * NWarps):
             ///    - Each warp handles exactly one quantization scale
             ///    - Scales are distributed across warps with replication factor XR = NPerQ /
             ///    WarpGemm::kN
-            ///    - Example: NPerQ=64, WarpGemm::kN=16, NWarps=4 → 1 scale per warp, XR=4
+            ///    - Example: NPerQ=64, WarpGemm::kN=16, NWarps=4 -> 1 scale per warp, XR=4
             ///
             /// 3. Coarse-grained quantization (NPerQ > WarpGemm::kN * NWarps):
             ///    - Quantization group spans multiple warps
             ///    - All warps share the same scale value
-            ///    - Example: NPerQ=128, WarpGemm::kN=16, NWarps=4 → all warps use same scale
+            ///    - Example: NPerQ=128, WarpGemm::kN=16, NWarps=4 -> all warps use same scale
             ///
             /// @return A static tile distribution encoding for the BQ scale tensor
             if constexpr(NPerQ < WarpGemm::kN)
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
index e353dc8b54..f818d3b757 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
@@ -460,7 +460,7 @@ struct GroupedConvBwdDataKernelArgs
     long_index_t group_stride_c;
 
     // Split-N support fields - initialize to safe defaults
-    index_t n_splits            = 1; // Number of batch splits (e.g., 2 for 128→64×2)
+    index_t n_splits            = 1; // Number of batch splits (e.g., 2 for 128->64x2)
     index_t n_per_split         = 1; // Batches per split (N_ from transformer)
     index_t original_n          = 1; // Original batch size before splitting
     index_t input_batch_stride  = 0; // Stride to next batch in input tensor
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index db9ac89763..babeae77d1 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -370,7 +370,7 @@ struct GroupedConvBwdWeightKernelArgs
 
     void* workspace_ptr = nullptr;
 
-    // StreamK tile partitioner — stored directly when TilePartitioner_ is a real type,
+    // StreamK tile partitioner - stored directly when TilePartitioner_ is a real type,
     // empty struct when void (Split-K path). Constructed with dummy values here;
     // properly initialized in MakeKernelArgs before device-side use.
     struct EmptyPartitioner
@@ -651,7 +651,7 @@ struct GroupedConvolutionBackwardWeightKernel
                 return false;
             }
         }
-        // Runtime arch check — complements the static_assert in operator().
+        // Runtime arch check - complements the static_assert in operator().
         // Both are needed: this check runs on the host (where get_compiler_target()
         // isn't available since HIP's host pass doesn't define __gfx*__ macros),
         // while the static_assert in operator() catches misuse at device compile time.
@@ -1075,7 +1075,7 @@ struct GroupedConvolutionBackwardWeightKernel
 
     CK_TILE_DEVICE void RunStreamK(GroupedConvBwdWeightKernelArgsSpecialized& kargs) const
     {
-        // Device-side compile-time arch check — complements the runtime check in
+        // Device-side compile-time arch check - complements the runtime check in
         // IsSupportedArgument(). Both are needed: the runtime check runs on the host
         // (where get_compiler_target() isn't available since HIP's host pass doesn't
         // define __gfx*__ macros), while this catches misuse at device compile time.
@@ -1258,7 +1258,7 @@ struct GroupedConvolutionBackwardWeightKernel
                         amd_wave_read_first_lane(partner_start_iter < tile_iter_end);
 
                     // If the partner of the tile-starter is not in this tile,
-                    // then all partials are accumulated — write final result.
+                    // then all partials are accumulated - write final result.
                     if(tile_started && !partner_in_tile)
                     {
                         auto c_block_window_out =
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index 5b1803224f..14c4356aa1 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -411,7 +411,7 @@ struct GroupedConvFwdKernelArgs
     long_index_t group_stride_c;
 
     // Split-N support fields - initialize to safe defaults
-    index_t n_splits            = 1; // Number of batch splits (e.g., 2 for 128→64×2)
+    index_t n_splits            = 1; // Number of batch splits (e.g., 2 for 128->64x2)
     index_t n_per_split         = 1; // Batches per split (N_ from transformer)
     index_t original_n          = 1; // Original batch size before splitting
     index_t input_batch_stride  = 0; // Stride to next batch in input tensor
diff --git a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
index 46e3033ef1..8e9d808d4c 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp
@@ -131,7 +131,7 @@ struct TransformConvFwdToGemm
 
     // Calculate split-image factors AFTER considering split-N
     // Returns: should_split flag and optimal split factors for D, H, W dimensions
-    // Strategy: Hierarchical splitting with priority order D → H → W
+    // Strategy: Hierarchical splitting with priority order D -> H -> W
     // Dynamically increases split factors until memory fits below threshold
     //
     // NOTE: Layout validation should be done at the invoker level before calling this function
@@ -188,7 +188,7 @@ struct TransformConvFwdToGemm
             return info;
         }
 
-        // Split-image is needed - use hierarchical priority: D → H → W
+        // Split-image is needed - use hierarchical priority: D -> H -> W
         info.should_split = true;
 
         // Hierarchical splitting strategy:
@@ -253,7 +253,7 @@ struct TransformConvFwdToGemm
         // Use maximum allowed split as best effort (capped at 64 total pieces)
         info.num_d_pieces = (D_out < 4) ? D_out : 4; // Cap at 4
         info.num_h_pieces = (H_out < 4) ? H_out : 4; // Cap at 4
-        info.num_w_pieces = (W_out < 4) ? W_out : 4; // Cap at 4 (max 4×4×4=64)
+        info.num_w_pieces = (W_out < 4) ? W_out : 4; // Cap at 4 (max 4x4x4=64)
 
         return info;
     }
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
index a14f103eb6..f4245aec62 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -177,7 +177,7 @@ struct BlockReduce2d
     }
 
     // uniform_sequence_gen_t<NSize, Value> generates sequence of NSize elements filled with Value
-    // e.g., uniform_sequence_gen_t<2, 1> → {1, 1} and uniform_sequence_gen_t<3, 4> → {4, 4, 4}
+    // e.g., uniform_sequence_gen_t<2, 1> -> {1, 1} and uniform_sequence_gen_t<3, 4> -> {4, 4, 4}
     template <typename XDistributedTensor_,
               typename ReduceFunc,
               typename ReducePacksPerXDim = uniform_sequence_gen_t<2, 1>>
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
index de5cf4e1cc..057ea19cbd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
@@ -167,7 +167,7 @@ using device_grouped_conv_bwd_data_xdl_f16_noshuffle_instances =
         // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
         // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-        // f16_f16_f32_f16 — noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1)
+        // f16_f16_f32_f16 - noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1)
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         1,            1,            1,     S<1, 32, 1, 8>,                1>,
 
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F16,   F16,     F32,      F16, Empty_Tuple,   F16,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,            1,            1,     S<1, 32, 1, 8>,                1>,
@@ -333,9 +333,9 @@ using device_grouped_conv_bwd_data_xdl_bf16_instances = std::tuple<
     // clang-format on
     >;
 
-// bf16_bf16_f32_bf16 — noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1)
+// bf16_bf16_f32_bf16 - noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1)
 // Same tile shapes as bf16_instances but with ScalarPerVector=1, enabling the no-shuffle fast path
-// (VGPR → Global direct write, 0 LDS barriers) instead of CShuffle (VGPR → LDS → Global, 8
+// (VGPR -> Global direct write, 0 LDS barriers) instead of CShuffle (VGPR -> LDS -> Global, 8
 // barriers).
 template <index_t NDimSpatial,
           typename ALayout,
@@ -532,7 +532,7 @@ using device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances =
          // ##############################################|    Spatial|        |        |            |        |  Type|  Type|    Type| DataType|        Type|  Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
          // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
          // ##############################################|           |        |        |            |        |      |      |        |         |            |      |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-        // f32_f32_f32_f32 — noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1)
+        // f32_f32_f32_f32 - noshuffle epilogue (CDEBlockTransferScalarPerVector_NPerBlock = 1)
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,    64,   128,    32,   8,   8,   32,   32,       1,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              4,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         1,            1,            1,     S<1, 32, 1, 8>,                1>,
 
         DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< NDimSpatial, ALayout, BLayout,    DsLayout, ELayout,   F32,   F32,     F32,      F32, Empty_Tuple,   F32,  PassThrough,  PassThrough,    PassThrough,            ConvSpec,  true,  true,             1,   256,   128,   256,    32,   8,   8,   32,   32,       2,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,            1,            1,     S<1, 32, 1, 8>,                1>,
@@ -552,7 +552,7 @@ using device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances =
         // clang-format on
         >;
 
-// bf16 — BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl
+// bf16 - BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl
 // instances. The key difference from bf16_instances: BBlockTransfer uses S<4, BlockSize/4, 1>
 // thread cluster and S<2, 0, 1> arrange order, which gives full thread utilization for B-matrix
 // loads. These are optimal when opt3 flat descriptor path is active (G=1, 2D convolutions).
@@ -584,7 +584,7 @@ using device_grouped_conv_bwd_data_xdl_bf16_nongrouped_match_instances = std::tu
     // clang-format on
     >;
 
-// f16 — BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl
+// f16 - BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl
 // instances.
 template <index_t NDimSpatial,
           typename ALayout,
@@ -610,7 +610,7 @@ using device_grouped_conv_bwd_data_xdl_f16_nongrouped_match_instances = std::tup
     // clang-format on
     >;
 
-// f32 — BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl
+// f32 - BBlockTransfer parameters matching the non-grouped DeviceConvNdBwdDataNwcKxcNwk_Xdl
 // instances. F32 uses K1=4, KPerBlock=16, and smaller scalar-per-vector values.
 template <index_t NDimSpatial,
           typename ALayout,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wavelet_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wavelet_xdl_instance.hpp
index b0090dadbf..c451fa08e1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wavelet_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wavelet_xdl_instance.hpp
@@ -49,11 +49,11 @@ using device_grouped_conv_bwd_weight_wavelet_xdl_c_shuffle_f16_instances = std::
     //                                                                                                                                                                 | Stage  | Thrds  | Thrds  |    |    |   |  | XDL | XDL| /Wv| /Wv| A cluster     |A ArrangeOrd|A SrcAccOrd | AVDim|ASSca|ADSca|AExM| B cluster     |B ArrangeOrd|B SrcAccOrd | BVDim|BSSca|BDSca|BExN|CShM|CShN|  C cluster       |CSca|
     // M=128 N=128: 2x2 waves, 64 AccVGPRs/wave
     DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec,    1,      256,     256,   128, 128,  32, 8,  32,  32,   2,   2, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    4,    4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    4,    4, false,   1,   1, S<1, 32, 1, 8>,    2>,
-    // M=64 N=64: 2x2 waves, 16 AccVGPRs/wave — smallest tile, targets Group 3 shapes
+    // M=64 N=64: 2x2 waves, 16 AccVGPRs/wave - smallest tile, targets Group 3 shapes
     DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec,    1,      256,     256,    64,  64,  32, 8,  32,  32,   1,   1, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    2,    4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    2,    4, false,   1,   1, S<1, 32, 1, 8>,    2>,
-    // M=128 N=64: 2x2 waves, 32 AccVGPRs/wave — asymmetric M>N
+    // M=128 N=64: 2x2 waves, 32 AccVGPRs/wave - asymmetric M>N
     DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec,    1,      256,     256,   128,  64,  32, 8,  32,  32,   2,   1, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    4,    4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    2,    4, false,   1,   1, S<1, 32, 1, 8>,    2>,
-    // M=64 N=128: 2x2 waves, 32 AccVGPRs/wave — asymmetric N>M
+    // M=64 N=128: 2x2 waves, 32 AccVGPRs/wave - asymmetric N>M
     DeviceGroupedConvBwdWeight_Xdl_WaveletModel_CShuffleV3<NDimSpatial, ALayout, BLayout, ELayout, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvSpec,    1,      256,     256,    64, 128,  32, 8,  32,  32,   1,   2, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    2,    4, false, S<4, 32, 2>, S<2, 0, 1>, S<1, 0, 2>,     1,    4,    4, false,   1,   1, S<1, 32, 1, 8>,    2>
     // clang-format on
     >;
@@ -61,7 +61,7 @@ using device_grouped_conv_bwd_weight_wavelet_xdl_c_shuffle_f16_instances = std::
 // 2-way (4,2) wave-specialized instances for F16.
 // TileLoad=256 (4 load waves) + TileMath=128 (2 math waves) = 384 threads (6 waves).
 // Same load cluster as (4,4): S<4, 32, 2>=256.
-// Math waves halved → each wave handles more MXdl/NXdl work → more AccVGPRs.
+// Math waves halved -> each wave handles more MXdl/NXdl work -> more AccVGPRs.
 template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -80,7 +80,7 @@ using device_grouped_conv_bwd_weight_wavelet_4w2_xdl_c_shuffle_f16_instances = s
     // clang-format on
     >;
 
-// BF16 wavelet instances — same tile configs as F16, with BF16 in/out and F32 compute.
+// BF16 wavelet instances - same tile configs as F16, with BF16 in/out and F32 compute.
 template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
@@ -101,7 +101,7 @@ using device_grouped_conv_bwd_weight_wavelet_xdl_c_shuffle_bf16_instances = std:
     // clang-format on
     >;
 
-// BF16 (4,2) wavelet instances — same tile configs as F16 (4,2).
+// BF16 (4,2) wavelet instances - same tile configs as F16 (4,2).
 template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
diff --git a/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp b/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp
index e9f838107e..b4af2d6325 100644
--- a/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp
+++ b/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp
@@ -21,24 +21,24 @@
 // add_device_* function inside ck::tensor_operation::device::instance.
 //
 // Parameters:
-//   INST_TPL     — instance template (e.g. device_contraction_kk_instance,
+//   INST_TPL     - instance template (e.g. device_contraction_kk_instance,
 //                  device_contraction_f64_kk_instance)
-//   OP_NAME      — lowercase operation name for identifier construction
+//   OP_NAME      - lowercase operation name for identifier construction
 //                  (bilinear or scale)
-//   CDE_OP       — C++ element-wise operation type for template argument
+//   CDE_OP       - C++ element-wise operation type for template argument
 //                  (Bilinear or Scale)
-//   NDIM_VAL     — number of dimensions (2 or 6)
-//   NAME_SUFFIX  — data-type and layout suffix for the generated names
+//   NDIM_VAL     - number of dimensions (2 or 6)
+//   NAME_SUFFIX  - data-type and layout suffix for the generated names
 //                  (e.g. f32_f32_f32_f32_kknn, bf16_bf16_bf16_bf16_compute_f32_knnn)
-//   ADATA        — ADataType
-//   BDATA        — BDataType
-//   ACC          — AccDataType
-//   CSHUFFLE     — CShuffleDataType
-//   DS_TUPLE     — DsDataType (e.g. F32_Tuple, Empty_Tuple)
-//   EDATA        — EDataType
-//   COMPUTE      — ComputeDataType
+//   ADATA        - ADataType
+//   BDATA        - BDataType
+//   ACC          - AccDataType
+//   CSHUFFLE     - CShuffleDataType
+//   DS_TUPLE     - DsDataType (e.g. F32_Tuple, Empty_Tuple)
+//   EDATA        - EDataType
+//   COMPUTE      - ComputeDataType
 //
-// Example — bilinear, F32, kk layout, 2D:
+// Example - bilinear, F32, kk layout, 2D:
 //
 //   CK_CONTRACTION_INSTANCE(device_contraction_kk_instance,
 //       bilinear, Bilinear, 2, f32_f32_f32_f32_kknn,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
index 1c7cd770b9..f4f66d4732 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
@@ -41,7 +41,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
                                                         Empty_Tuple,
                                                         NHWGC,
                                                         ConvBwdDataFilter1x1Stride1Pad0>{});
-    // 3. Default — noshuffle epilogue
+    // 3. Default - noshuffle epilogue
     add_device_operation_instances(
         instances,
         device_grouped_conv_bwd_data_xdl_bf16_noshuffle_instances<2,
@@ -50,7 +50,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
                                                                   Empty_Tuple,
                                                                   NHWGC,
                                                                   ConvBwdDataDefault>{});
-    // 4. Filter1x1Stride1Pad0 — noshuffle epilogue
+    // 4. Filter1x1Stride1Pad0 - noshuffle epilogue
     add_device_operation_instances(instances,
                                    device_grouped_conv_bwd_data_xdl_bf16_noshuffle_instances<
                                        2,
@@ -59,7 +59,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
                                        Empty_Tuple,
                                        NHWGC,
                                        ConvBwdDataFilter1x1Stride1Pad0>{});
-    // 5. Default — nongrouped_match instances
+    // 5. Default - nongrouped_match instances
     add_device_operation_instances(
         instances,
         device_grouped_conv_bwd_data_xdl_bf16_nongrouped_match_instances<2,
@@ -68,7 +68,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
                                                                          Empty_Tuple,
                                                                          NHWGC,
                                                                          ConvBwdDataDefault>{});
-    // 6. Filter1x1Stride1Pad0 — nongrouped_match instances
+    // 6. Filter1x1Stride1Pad0 - nongrouped_match instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_bwd_data_xdl_bf16_nongrouped_match_instances<
                                        2,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
index 3bbd4a37e5..dfcfac2bfa 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -41,7 +41,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
                                                        Empty_Tuple,
                                                        NHWGC,
                                                        ConvBwdDataFilter1x1Stride1Pad0>{});
-    // 3. Default — noshuffle epilogue
+    // 3. Default - noshuffle epilogue
     add_device_operation_instances(
         instances,
         device_grouped_conv_bwd_data_xdl_f16_noshuffle_instances<2,
@@ -50,7 +50,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
                                                                  Empty_Tuple,
                                                                  NHWGC,
                                                                  ConvBwdDataDefault>{});
-    // 4. Filter1x1Stride1Pad0 — noshuffle epilogue
+    // 4. Filter1x1Stride1Pad0 - noshuffle epilogue
     add_device_operation_instances(instances,
                                    device_grouped_conv_bwd_data_xdl_f16_noshuffle_instances<
                                        2,
@@ -59,7 +59,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
                                        Empty_Tuple,
                                        NHWGC,
                                        ConvBwdDataFilter1x1Stride1Pad0>{});
-    // 5. Default — nongrouped_match instances
+    // 5. Default - nongrouped_match instances
     add_device_operation_instances(
         instances,
         device_grouped_conv_bwd_data_xdl_f16_nongrouped_match_instances<2,
@@ -68,7 +68,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
                                                                         Empty_Tuple,
                                                                         NHWGC,
                                                                         ConvBwdDataDefault>{});
-    // 6. Filter1x1Stride1Pad0 — nongrouped_match instances
+    // 6. Filter1x1Stride1Pad0 - nongrouped_match instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_bwd_data_xdl_f16_nongrouped_match_instances<
                                        2,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
index 344c35c5ca..011a9d5cd1 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
@@ -41,7 +41,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
                                                        Empty_Tuple,
                                                        NHWGC,
                                                        ConvBwdDataFilter1x1Stride1Pad0>{});
-    // 3. Default — noshuffle epilogue
+    // 3. Default - noshuffle epilogue
     add_device_operation_instances(
         instances,
         device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances<2,
@@ -50,7 +50,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
                                                                  Empty_Tuple,
                                                                  NHWGC,
                                                                  ConvBwdDataDefault>{});
-    // 4. Filter1x1Stride1Pad0 — noshuffle epilogue
+    // 4. Filter1x1Stride1Pad0 - noshuffle epilogue
     add_device_operation_instances(instances,
                                    device_grouped_conv_bwd_data_xdl_f32_noshuffle_instances<
                                        2,
@@ -59,7 +59,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
                                        Empty_Tuple,
                                        NHWGC,
                                        ConvBwdDataFilter1x1Stride1Pad0>{});
-    // 5. Default — nongrouped_match instances
+    // 5. Default - nongrouped_match instances
     add_device_operation_instances(
         instances,
         device_grouped_conv_bwd_data_xdl_f32_nongrouped_match_instances<2,
@@ -68,7 +68,7 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
                                                                         Empty_Tuple,
                                                                         NHWGC,
                                                                         ConvBwdDataDefault>{});
-    // 6. Filter1x1Stride1Pad0 — nongrouped_match instances
+    // 6. Filter1x1Stride1Pad0 - nongrouped_match instances
     add_device_operation_instances(instances,
                                    device_grouped_conv_bwd_data_xdl_f32_nongrouped_match_instances<
                                        2,
diff --git a/rocm_ck/include/rocm_ck/args.hpp b/rocm_ck/include/rocm_ck/args.hpp
index 2da845e8ea..28f2e07448 100644
--- a/rocm_ck/include/rocm_ck/args.hpp
+++ b/rocm_ck/include/rocm_ck/args.hpp
@@ -1,29 +1,29 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: abi — shared between host and device. Trivially copyable, no CK deps.
+// Role: abi - shared between host and device. Trivially copyable, no CK deps.
 //
 // Args is a hardware buffer for passing data between CPU and GPU during a
-// kernel call. It carries raw pointers, shapes, strides, and scalar values —
+// kernel call. It carries raw pointers, shapes, strides, and scalar values -
 // nothing more. All semantic meaning (which tensor is "A", which scalar is
 // "alpha", input vs output) lives in the Signature, not here.
 //
 // This is deliberately one type for all operations. Per-operation structs
-// (GemmArgs, FmhaArgs, ...) would make the dispatcher a closed set — adding
+// (GemmArgs, FmhaArgs, ...) would make the dispatcher a closed set - adding
 // an operation means adding a type, updating launch code, and changing the
 // kpack format. A generic buffer keeps the dispatcher open.
 //
 // Capacity limits (kMaxRank=6, kMaxTensors=16, kMaxScalars=16) are sized to
 // the most demanding current operation (FMHA backward: ~12 tensors, ~12
 // scalars, rank-6 for grouped 3D conv). If a future operation exceeds these,
-// bump the constants — the layout is not versioned, and the 4KB HSA kernarg
+// bump the constants - the layout is not versioned, and the 4KB HSA kernarg
 // budget has room. Don't over-provision speculatively.
 //
 // Key constraints:
-//   - Trivially copyable, standard layout — required for HSA kernarg passing.
-//   - Fixed-capacity arrays, no heap — sizeof fits the 4KB kernarg budget.
-//   - const void* for all tensor pointers — the entry kernel casts to the
+//   - Trivially copyable, standard layout - required for HSA kernarg passing.
+//   - Fixed-capacity arrays, no heap - sizeof fits the 4KB kernarg budget.
+//   - const void* for all tensor pointers - the entry kernel casts to the
 //     concrete type. Input vs output semantics live in the Signature.
-//   - No runtime type tags on scalars — the Signature declares types at
+//   - No runtime type tags on scalars - the Signature declares types at
 //     compile time. The entry kernel reads the correct union member.
 //   - Slot ordering is the invariant: tensors[i] maps to Signature::tensors[i].
 
@@ -44,11 +44,11 @@ constexpr int kMaxScalars = 16; // FMHA with masking+dropout needs ~12
 struct TensorArg
 {
     const void* ptr;                            //  8 bytes  (offset 0)
-    std::array<index_t, kMaxRank> lengths;      // 24 bytes  (offset 8)   — int32
-    std::array<long_index_t, kMaxRank> strides; // 48 bytes  (offset 32)  — int64
+    std::array<index_t, kMaxRank> lengths;      // 24 bytes  (offset 8)   - int32
+    std::array<long_index_t, kMaxRank> strides; // 48 bytes  (offset 32)  - int64
 };
 
-// FP16/BF16/FP8 scalars use f32 — scalar precision >= tensor precision.
+// FP16/BF16/FP8 scalars use f32 - scalar precision >= tensor precision.
 union ScalarValue
 {
     float f32;
diff --git a/rocm_ck/include/rocm_ck/datatype.hpp b/rocm_ck/include/rocm_ck/datatype.hpp
index 43da2d14a9..e59b71a72f 100644
--- a/rocm_ck/include/rocm_ck/datatype.hpp
+++ b/rocm_ck/include/rocm_ck/datatype.hpp
@@ -1,6 +1,6 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — DataType enum, constexpr queries. No runtime, no CK deps.
+// Role: types - DataType enum, constexpr queries. No runtime, no CK deps.
 
 #pragma once
 
@@ -13,19 +13,19 @@ namespace rocm_ck {
 // FP8 = e4m3, BF8 = e5m2 (CK convention).
 enum class DataType : uint8_t
 {
-    // Floating point — standard widths
+    // Floating point - standard widths
     FP64,
     FP32,
     FP16,
     BF16,
 
-    // FP8 variants — see note below
+    // FP8 variants - see note below
     FP8_FNUZ,
     BF8_FNUZ,
     FP8_OCP,
     BF8_OCP,
 
-    // Integer types — signed and unsigned at each width
+    // Integer types - signed and unsigned at each width
     I4,
     I8,
     I16,
@@ -37,12 +37,12 @@ enum class DataType : uint8_t
     U64
 };
 
-// FP8 variants — FNUZ and OCP are different number formats, not just HW hints.
+// FP8 variants - FNUZ and OCP are different number formats, not just HW hints.
 //   FNUZ: gfx942 native (higher bias, no Inf, max 240)
 //   OCP:  gfx950 native (OCP standard, has Inf, max 448)
 // Non-native formats run in software (slower) and produce different numerical
 // results. Choose based on target GPU and model training format.
-// We keep FNUZ and OCP explicit rather than a generic FP8 — the numerical
+// We keep FNUZ and OCP explicit rather than a generic FP8 - the numerical
 // differences matter for compatibility and schema-driven test coverage.
 // TODO - We may introduce a generic FP8/BF8 that resolves to the hardware-native type.
 // See: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/fp8_numbers.html
diff --git a/rocm_ck/include/rocm_ck/fixed_string.hpp b/rocm_ck/include/rocm_ck/fixed_string.hpp
index 105c354c23..4ca0678e35 100644
--- a/rocm_ck/include/rocm_ck/fixed_string.hpp
+++ b/rocm_ck/include/rocm_ck/fixed_string.hpp
@@ -1,10 +1,10 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — FixedString. No runtime, no CK deps.
+// Role: types - FixedString. No runtime, no CK deps.
 //
 // A compile-time string for use in template parameters (NTTPs).
 //
-// C++20 requires template parameters to be "structural types" — loosely, types
+// C++20 requires template parameters to be "structural types" - loosely, types
 // that are trivially comparable and don't contain pointers or references.
 // std::string and std::string_view fail this requirement (internal pointer).
 //
@@ -16,7 +16,7 @@
 // When to use FixedString vs std::string_view:
 //   - FixedString: the type must be structural (template parameters).
 //   - string_view: consteval-only types that never become template parameters
-//     (e.g., ResolvedTensor — see resolved_tensor.hpp).
+//     (e.g., ResolvedTensor - see resolved_tensor.hpp).
 //
 // The capacity is a template parameter so each use site documents its limit:
 //   FixedString<16> name("bias");   // tensor names: 15 chars max
diff --git a/rocm_ck/include/rocm_ck/gpu_target.hpp b/rocm_ck/include/rocm_ck/gpu_target.hpp
index 0a0ed83803..751d4f136a 100644
--- a/rocm_ck/include/rocm_ck/gpu_target.hpp
+++ b/rocm_ck/include/rocm_ck/gpu_target.hpp
@@ -1,6 +1,6 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — GpuTarget enum. No runtime, no CK deps.
+// Role: types - GpuTarget enum. No runtime, no CK deps.
 
 #pragma once
 
diff --git a/rocm_ck/include/rocm_ck/index_t.hpp b/rocm_ck/include/rocm_ck/index_t.hpp
index 70886d7f5e..1509a355e9 100644
--- a/rocm_ck/include/rocm_ck/index_t.hpp
+++ b/rocm_ck/include/rocm_ck/index_t.hpp
@@ -1,6 +1,6 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — index_t, long_index_t. No runtime, no CK deps.
+// Role: types - index_t, long_index_t. No runtime, no CK deps.
 
 #pragma once
 
diff --git a/rocm_ck/include/rocm_ck/layout.hpp b/rocm_ck/include/rocm_ck/layout.hpp
index f740038e67..d6b18e18e3 100644
--- a/rocm_ck/include/rocm_ck/layout.hpp
+++ b/rocm_ck/include/rocm_ck/layout.hpp
@@ -1,6 +1,6 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — Layout enum, constexpr/consteval helpers. No runtime, no CK deps.
+// Role: types - Layout enum, constexpr/consteval helpers. No runtime, no CK deps.
 
 #pragma once
 
@@ -12,7 +12,7 @@
 
 namespace rocm_ck {
 
-// Auto is a resolve-time placeholder — Signature::resolve() replaces it with
+// Auto is a resolve-time placeholder - Signature::resolve() replaces it with
 // the concrete layout from the operator slot. It never reaches the kernel.
 enum class Layout : uint8_t
 {
diff --git a/rocm_ck/include/rocm_ck/ops.hpp b/rocm_ck/include/rocm_ck/ops.hpp
index ca49cc75d9..a469ab18f6 100644
--- a/rocm_ck/include/rocm_ck/ops.hpp
+++ b/rocm_ck/include/rocm_ck/ops.hpp
@@ -1,6 +1,6 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: meta — operator structs, Op variant. No runtime, no CK deps.
+// Role: meta - operator structs, Op variant. No runtime, no CK deps.
 //
 // Operators are the edges of a Signature's compute graph. Each operator
 // names its tensor slots as string_view labels (e.g., "A", "bias", "query")
@@ -8,13 +8,13 @@
 // owns the tensor definitions; operators just reference them by name.
 //
 // This separation means operators are reusable across different tensor
-// configurations — a GemmOp doesn't care whether its "lhs" is FP16 or BF16,
+// configurations - a GemmOp doesn't care whether its "lhs" is FP16 or BF16,
 // Row or Col. That's resolved later when the Signature is validated.
 //
 // The Op variant is the closed set of supported operator types. Adding a
 // new operator means adding a struct here and a variant alternative.
-// Fused operations (like FMHA) are single operators — not chains of
-// elementwise + GEMM — because CK Tile implements them as monolithic kernels.
+// Fused operations (like FMHA) are single operators - not chains of
+// elementwise + GEMM - because CK Tile implements them as monolithic kernels.
 
 #pragma once
 
@@ -26,7 +26,7 @@
 namespace rocm_ck {
 
 // Matrix multiplication: out = lhs x rhs.
-// acc_dtype is the accumulation type — defaults to FP32, the universal safe
+// acc_dtype is the accumulation type - defaults to FP32, the universal safe
 // choice across all input types.
 struct GemmOp
 {
diff --git a/rocm_ck/include/rocm_ck/physical_tensor.hpp b/rocm_ck/include/rocm_ck/physical_tensor.hpp
index fcec91a51c..8dca7627b1 100644
--- a/rocm_ck/include/rocm_ck/physical_tensor.hpp
+++ b/rocm_ck/include/rocm_ck/physical_tensor.hpp
@@ -1,9 +1,9 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — PhysicalTensor. No runtime, no CK deps.
+// Role: types - PhysicalTensor. No runtime, no CK deps.
 //
 // A PhysicalTensor maps a named tensor from the Signature graph to a slot
-// in the generic Args buffer. Not every tensor in a compute graph is physical —
+// in the generic Args buffer. Not every tensor in a compute graph is physical -
 // intermediate values (e.g., the S matrix in FMHA = Q*K^T) live only in
 // registers and never appear in device memory. The physical tensor table
 // describes exactly what the host needs to pack into Args.
diff --git a/rocm_ck/include/rocm_ck/resolved_tensor.hpp b/rocm_ck/include/rocm_ck/resolved_tensor.hpp
index 4a2ce00672..7390d4b795 100644
--- a/rocm_ck/include/rocm_ck/resolved_tensor.hpp
+++ b/rocm_ck/include/rocm_ck/resolved_tensor.hpp
@@ -1,30 +1,30 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Role: types — ResolvedTensor, ResolvedQuantization. No runtime, no CK deps.
+// Role: types - ResolvedTensor, ResolvedQuantization. No runtime, no CK deps.
 //
 // ResolvedTensor is the intermediate result of consteval resolution. It exists
-// only at compile time — produced by Signature::resolve() and consumed by
+// only at compile time - produced by Signature::resolve() and consumed by
 // makeSpec(), both consteval. It never appears in compiled code.
 //
 // In the user-facing Signature, tensors can have Layout::Auto (inherit from
 // operator slot) and omit fields with sensible defaults. After resolution,
 // every field is concrete. The base fields (name, dtype, rank, layout)
-// describe a plain dense tensor — enough for most operands (GEMM inputs,
+// describe a plain dense tensor - enough for most operands (GEMM inputs,
 // outputs, bias vectors). Some tensors carry additional metadata beyond the
 // dense description. Block-quantized tensors (e.g., INT4 weights) need a
 // scale tensor and group size. We use optional sub-structs for these
 // extensions, keeping the common case clean without bloating every instance.
 //
 // Why std::string_view instead of FixedString?
-//   ResolvedTensor is consteval-only — produced and consumed entirely at
+//   ResolvedTensor is consteval-only - produced and consumed entirely at
 //   compile time. No library loading, no runtime lifetime concerns. The
 //   string_views point to string literals from user code (e.g.,
-//   GemmOp{.lhs = "A"}), which have static storage duration — no dangling.
+//   GemmOp{.lhs = "A"}), which have static storage duration - no dangling.
 //   FixedString is required for PhysicalTensor because it IS used as a
 //   template parameter (NTTP), which requires structural types (no pointers).
 //   ResolvedTensor is never a template parameter.
 //
-// Plain aggregate — no methods, no validation. Resolution validates; this
+// Plain aggregate - no methods, no validation. Resolution validates; this
 // type just carries the result to makeSpec().
 
 #pragma once
diff --git a/rocm_ck/tests/unit/unit_args.cpp b/rocm_ck/tests/unit/unit_args.cpp
index c57b93dce2..766281a1bf 100644
--- a/rocm_ck/tests/unit/unit_args.cpp
+++ b/rocm_ck/tests/unit/unit_args.cpp
@@ -125,7 +125,7 @@ TEST(ScalarValue, StoresAndRetrievesUInt32)
 }
 
 // ============================================================================
-// Args field coverage — batch_strides and workspace_ptr
+// Args field coverage - batch_strides and workspace_ptr
 // ============================================================================
 
 TEST(Args, BatchStridesFieldExists)
diff --git a/script/dependency-parser/src/selective_test_filter.py b/script/dependency-parser/src/selective_test_filter.py
index 551ed06eb0..427abcffbd 100644
--- a/script/dependency-parser/src/selective_test_filter.py
+++ b/script/dependency-parser/src/selective_test_filter.py
@@ -252,13 +252,20 @@ def main():
         # Extract basenames for regex (e.g., bin/test_gemm -> test_gemm)
         test_names = [os.path.basename(t) for t in tests]
 
+        # Anchor each name with ^...$ and escape regex metacharacters so that
+        # `ctest -R` does exact-name matching rather than substring matching
+        # (otherwise e.g. 'test_grouped_convnd_bwd_weight' would substring-match
+        # 'test_grouped_convnd_bwd_weight_bilinear' and try to run an
+        # executable that was never built).
+        anchored = [f"^{re.escape(n)}$" for n in test_names]
+
         # Split into chunks
-        for i in range(0, len(test_names), chunk_size):
-            chunk = test_names[i:i + chunk_size]
+        for i in range(0, len(anchored), chunk_size):
+            chunk = anchored[i:i + chunk_size]
             regex_chunks.append("|".join(chunk))
 
         # Keep single regex for backward compatibility (but may be too long)
-        regex = "|".join(test_names)
+        regex = "|".join(anchored)
     else:
         regex = ""
 
diff --git a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp
index a9adeba7d7..6a984d6332 100644
--- a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp
+++ b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_scale_mma.cpp
@@ -63,17 +63,17 @@ void ScaleMfmaGfx950Specialization_impl()
 
 TEST(ScaleMMATrait, ScaleMfmaGfx950Specialization)
 {
-    // Test fp8 → fp32 scale MFMA for GFX950 (16x16x128)
+    // Test fp8 -> fp32 scale MFMA for GFX950 (16x16x128)
     ScaleMfmaGfx950Specialization_impl<fp8_t, fp8_t, fp32_t, 16u, 16u, 128u>();
-    // Test bf8 → fp32 scale MFMA for GFX950 (16x16x128)
+    // Test bf8 -> fp32 scale MFMA for GFX950 (16x16x128)
     ScaleMfmaGfx950Specialization_impl<bf8_t, bf8_t, fp32_t, 16u, 16u, 128u>();
-    // Test fp4 → fp32 scale MFMA for GFX950 (16x16x128)
+    // Test fp4 -> fp32 scale MFMA for GFX950 (16x16x128)
     ScaleMfmaGfx950Specialization_impl<pk_fp4_t, pk_fp4_t, fp32_t, 16u, 16u, 128u>();
-    // Test fp8 → fp32 scale MFMA for GFX950 (32x32x64)
+    // Test fp8 -> fp32 scale MFMA for GFX950 (32x32x64)
     ScaleMfmaGfx950Specialization_impl<fp8_t, fp8_t, fp32_t, 32u, 32u, 64u>();
-    // Test bf8 → fp32 scale MFMA for GFX950 (32x32x64)
+    // Test bf8 -> fp32 scale MFMA for GFX950 (32x32x64)
     ScaleMfmaGfx950Specialization_impl<bf8_t, bf8_t, fp32_t, 32u, 32u, 64u>();
-    // Test fp4 → fp32 scale MFMA for GFX950 (32x32x64)
+    // Test fp4 -> fp32 scale MFMA for GFX950 (32x32x64)
     ScaleMfmaGfx950Specialization_impl<pk_fp4_t, pk_fp4_t, fp32_t, 32u, 32u, 64u>();
 
     std::cout << "GFX950 scale MFMA specialization is correct" << std::endl;
diff --git a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp
index be631f0659..86e363aabc 100644
--- a/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp
+++ b/test/ck_tile/core/arch/mma/pipeline/test_amdgcn_sparse_mma.cpp
@@ -30,7 +30,7 @@ using CompilerTargetGfx950 = decltype(make_amdgcn_gfx9_target<amdgcn_target_id::
 
 TEST(SparseMMATrait, SparseMfmaGfx950Specialization)
 {
-    // Test fp16 → fp32 sparse MFMA for GFX950 (16x16x32)
+    // Test fp16 -> fp32 sparse MFMA for GFX950 (16x16x32)
     using TestSparseMfma16x16 = amdgcn_mma<fp16_t,
                                            fp16_t,
                                            fp32_t,
@@ -218,7 +218,7 @@ TEST(SparseMMATrait, MmaSelector_Sparse_F16_F16_F32_16x16x32_Real)
                                  MmaPipelineTest<>::WaveTileK><<<1, waveSize>>>(a, b, c, out);
     };
     // Initialize A with 2:4 structured sparsity pattern: {1, 0, 1, 0, ...}
-    // This ensures the sparse compression transform is actually exercised —
+    // This ensures the sparse compression transform is actually exercised -
     // a no-op or broken compression would pass zeros through, causing incorrect results.
     const std::function<fp16_t(size_t)> sparseAInit = [](size_t i) -> fp16_t {
         return (i % 2 == 0) ? type_convert<fp16_t>(1) : type_convert<fp16_t>(0);
@@ -317,7 +317,7 @@ void sparse_transform_verify(const std::vector<Type>& input,
 }
 
 // Helper: build expected index from a per-group 4-bit pattern, repeated for all groups.
-// Each group of 4 input elements contributes 2 compressed elements → 2 x 2-bit index fields = 4
+// Each group of 4 input elements contributes 2 compressed elements -> 2 x 2-bit index fields = 4
 // bits.
 static int32_t build_repeated_group_idx(int num_groups, int32_t group_bits_4)
 {
@@ -337,8 +337,8 @@ static int32_t build_alternating_group_idx(int num_groups, int32_t even_bits_4,
 }
 
 // 1. Basic correctness: valid divisible sizes
-// Input pattern: {1, 0, 3, 0, 5, 0, 7, 0, ...} → non-zeros at slots 0,2
-// Group idx pattern: field0=0b00 (slot 0), field1=0b10 (slot 2) → 0b1000
+// Input pattern: {1, 0, 3, 0, 5, 0, 7, 0, ...} -> non-zeros at slots 0,2
+// Group idx pattern: field0=0b00 (slot 0), field1=0b10 (slot 2) -> 0b1000
 template <int NUM, int RATIO, typename Type>
 void sparse_transform_test_case()
 {
@@ -392,7 +392,7 @@ TEST(SparseTransformsTest, AllZeroInput)
 // nonzero_elems initializes to {a_vec[slot2]=0, a_vec[slot3]=V}.
 // Only j=3 triggers: nonzero_elems[0]=V, field0=0b11, pos becomes 1.
 // nonzero_elems[1] keeps its init V. Output: {V, V}.
-// Group idx pattern: field0=0b11, field1=0b10 (default) → 0b1011
+// Group idx pattern: field0=0b11, field1=0b10 (default) -> 0b1011
 template <int NUM>
 void sparse_transform_single_nonzero()
 {
@@ -421,7 +421,7 @@ TEST(SparseTransformsTest, SingleNonZeroPerGroup)
 
 // Non-zeros at slots 1 and 3 in each group.
 // Input: {0, a, 0, b, ...}. Output: {a, b, ...}.
-// Group idx pattern: field0=0b01 (slot 1), field1=0b11 (slot 3) → 0b1101
+// Group idx pattern: field0=0b01 (slot 1), field1=0b11 (slot 3) -> 0b1101
 template <int NUM>
 void sparse_transform_slots_1_and_3()
 {
@@ -452,7 +452,7 @@ TEST(SparseTransformsTest, NonZerosAtSlots1And3)
 
 // Non-zeros at slots 0 and 3 in each group (non-adjacent).
 // Input: {a, 0, 0, b, ...}. Output: {a, b, ...}.
-// Group idx pattern: field0=0b00 (slot 0), field1=0b11 (slot 3) → 0b1100
+// Group idx pattern: field0=0b00 (slot 0), field1=0b11 (slot 3) -> 0b1100
 template <int NUM>
 void sparse_transform_slots_0_and_3()
 {
@@ -482,8 +482,8 @@ TEST(SparseTransformsTest, NonZerosAtSlots0And3)
 }
 
 // Mixed sparsity pattern: even groups have non-zeros at slots 0,2; odd groups at slots 1,3.
-// Even group idx: field0=0b00, field1=0b10 → 0b1000
-// Odd  group idx: field0=0b01, field1=0b11 → 0b1101
+// Even group idx: field0=0b00, field1=0b10 -> 0b1000
+// Odd  group idx: field0=0b01, field1=0b11 -> 0b1101
 template <int NUM>
 void sparse_transform_mixed()
 {
diff --git a/test/ck_tile/core/container/unit_sequence.cpp b/test/ck_tile/core/container/unit_sequence.cpp
index 2ce0d0f7e8..f0c97e1abc 100644
--- a/test/ck_tile/core/container/unit_sequence.cpp
+++ b/test/ck_tile/core/container/unit_sequence.cpp
@@ -399,7 +399,7 @@ TEST(SequenceSort, SortedMapWithDuplicates)
     using ExpectedSorted = sequence<1, 1, 3, 3>;
     EXPECT_TRUE((std::is_same<Sorted, ExpectedSorted>::value));
     // Verify round-trip: original[map[i]] == sorted[i] for all i
-    // (don't assert specific index order for duplicates — sort stability may vary)
+    // (don't assert specific index order for duplicates - sort stability may vary)
     EXPECT_EQ(Seq::at(Map::at(0)), Sorted::at(0));
     EXPECT_EQ(Seq::at(Map::at(1)), Sorted::at(1));
     EXPECT_EQ(Seq::at(Map::at(2)), Sorted::at(2));
diff --git a/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp b/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp
index c4adb3e2da..1f738f9379 100644
--- a/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp
+++ b/test/ck_tile/flatmm/test_mx_flatmm_fixtures.hpp
@@ -12,7 +12,7 @@ using FP6  = ck_tile::pk_fp6x16_t;
 using FP8  = ck_tile::fp8_t;
 using FP16 = ck_tile::fp16_t;
 
-// Concrete test fixture — inherits all logic from TestMXFlatmmBase.
+// Concrete test fixture - inherits all logic from TestMXFlatmmBase.
 // Tuple layout: <ADataType, BDataType, CDataType, MXFlatmmArchTraits>
 template <typename Tuple>
 class TestMXFlatmm : public TestMXFlatmmBase<Tuple>
diff --git a/test/ck_tile/fmha/test_fmha_fwd.cpp b/test/ck_tile/fmha/test_fmha_fwd.cpp
index daf239fea9..6ae33da30f 100644
--- a/test/ck_tile/fmha/test_fmha_fwd.cpp
+++ b/test/ck_tile/fmha/test_fmha_fwd.cpp
@@ -605,7 +605,7 @@ TEST_P(Dropout, DataTypeConfig)
     if constexpr(std::is_same_v<DataTypeConfig, FmhaFwdFp16>)
     {
         if(hdim_q > 128 && mode == mode_enum::batch)
-            GTEST_SKIP() << "Skipped: fp16 dropout d256 batch — compiler bug (ROCm >= 7.12)";
+            GTEST_SKIP() << "Skipped: fp16 dropout d256 batch - compiler bug (ROCm >= 7.12)";
     }
 #endif
 
diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt
index 21d34f7b34..676f01e355 100644
--- a/test/ck_tile/gemm_block_scale/CMakeLists.txt
+++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt
@@ -14,21 +14,11 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     # AQuant tests - split into 10 files
 
     # AQuant Memory Pipeline tests
-    add_gtest_executable(test_tile_gemm_quant_aquant_mem_prefill_interwave
-        test_gemm_quant_aquant_mem_prefill_interwave.cpp
-    )
-    target_compile_options(test_tile_gemm_quant_aquant_mem_prefill_interwave PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
-
     add_gtest_executable(test_tile_gemm_quant_aquant_mem_decode_intrawave
         test_gemm_quant_aquant_mem_decode_intrawave.cpp
     )
     target_compile_options(test_tile_gemm_quant_aquant_mem_decode_intrawave PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
 
-    add_gtest_executable(test_tile_gemm_quant_aquant_mem_decode_interwave
-        test_gemm_quant_aquant_mem_decode_interwave.cpp
-    )
-    target_compile_options(test_tile_gemm_quant_aquant_mem_decode_interwave PRIVATE ${TEST_GEMM_COMPILE_OPTIONS})
-
     add_gtest_executable(test_tile_gemm_quant_aquant_base_rcr 
         test_gemm_quant_aquant_base_rcr.cpp
     )
@@ -267,9 +257,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     # Collect all test targets for umbrella label
     set(CK_TILE_GEMM_BLOCK_SCALE_TEST_TARGETS
         # AQuant tests
-        test_tile_gemm_quant_aquant_mem_prefill_interwave
         test_tile_gemm_quant_aquant_mem_decode_intrawave
-        test_tile_gemm_quant_aquant_mem_decode_interwave
         test_tile_gemm_quant_aquant_base_rcr
         test_tile_gemm_quant_aquant_base_rrr_crr
         test_tile_gemm_quant_aquant_base_ccr
@@ -324,9 +312,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     # Target to build only AQuant memory pipeline tests
     add_custom_target(test_tile_gemm_aquant_mem_all)
     add_dependencies(test_tile_gemm_aquant_mem_all
-        test_tile_gemm_quant_aquant_mem_prefill_interwave
         test_tile_gemm_quant_aquant_mem_decode_intrawave
-        test_tile_gemm_quant_aquant_mem_decode_interwave
     )
 
     # Umbrella target to build all gemm quant tests
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp
deleted file mode 100644
index 1ef57716c9..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_decode_interwave.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "test_gemm_quant_common.hpp"
-
-using AQuantGrouped = std::integral_constant<ck_tile::QuantType, ck_tile::QuantType::AQuantGrouped>;
-
-// Type combinations for AQuant tests - Mem Decode Interwave Configuration
-// Tuple format: <ALayout, BLayout, CLayout, AQLayout, ADataType, BDataType, QDataType, CDataType,
-// QuantType, GemmConfig, QuantGroupSize>
-// clang-format off
-using AQuantMemDecodeInterwaveTypes = ::testing::Types<
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, AQuantGrouped, GemmConfigDecodeInterwave, GroupSize1D_128>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, BF8, BF8, float, Half, AQuantGrouped, GemmConfigDecodeInterwave, GroupSize1D_128>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, PkInt4, FP8, FP8, Half, AQuantGrouped, GemmConfigDecodeInterwave, GroupSize1D_128>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, PkInt4, BF8, BF8, Half, AQuantGrouped, GemmConfigDecodeInterwave, GroupSize1D_128>
->;
-// clang-format on
-
-// Test suite for AQuant Mem Decode Interwave
-TYPED_TEST_SUITE(TestCkTileGemmAQuantMem, AQuantMemDecodeInterwaveTypes);
-
-// AQuant tests
-TYPED_TEST(TestCkTileGemmAQuantMem, AQuantMemDecodeInterwaveTest)
-{
-    this->run_test_with_validation(16, 64, 512);
-}
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp
deleted file mode 100644
index fde3ec977b..0000000000
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_aquant_mem_prefill_interwave.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT
-
-#include "test_gemm_quant_common.hpp"
-
-using AQuantGrouped = std::integral_constant<ck_tile::QuantType, ck_tile::QuantType::AQuantGrouped>;
-
-// Type combinations for AQuant tests - Mem Prefill Interwave Configuration
-// Tuple format: <ALayout, BLayout, CLayout, AQLayout, ADataType, BDataType, QDataType, CDataType,
-// QuantType, GemmConfig, QuantGroupSize>
-// clang-format off
-using AQuantMemPrefillInterwaveTypes = ::testing::Types<
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, FP8, FP8, float, Half, AQuantGrouped, GemmConfigPrefillInterwave, GroupSize1D_128>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, BF8, BF8, float, Half, AQuantGrouped, GemmConfigPrefillInterwave, GroupSize1D_128>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, PkInt4, FP8, FP8, Half, AQuantGrouped, GemmConfigPrefillInterwave, GroupSize1D_128>,
-    std::tuple<RowMajor, ColumnMajor, RowMajor, RowMajor, PkInt4, BF8, BF8, Half, AQuantGrouped, GemmConfigPrefillInterwave, GroupSize1D_128>
->;
-// clang-format on
-
-// Test suite for AQuant Mem Prefill Interwave
-TYPED_TEST_SUITE(TestCkTileGemmAQuantMem, AQuantMemPrefillInterwaveTypes);
-
-// AQuant tests
-TYPED_TEST(TestCkTileGemmAQuantMem, AQuantMemPrefillInterwaveTest)
-{
-    this->run_test_with_validation(1024, 1024, 1024);
-}
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp
index 7ab7d22dc7..04f6b3d483 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_decode.cpp
@@ -23,25 +23,25 @@ TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantSplitKDecodeTypes);
 // BQuant split-K tests
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK2Test)
 {
-    // K=1024 for split_k=2: 1024/2=512=4×128 ✓
+    // K=1024 for split_k=2: 1024/2=512=4x128
     this->run_test_with_validation(32, 128, 1024, 2);
 }
 
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK3Test)
 {
-    // K=3072 for split_k=3: 3072/3=1024=8×128 ✓
+    // K=3072 for split_k=3: 3072/3=1024=8x128
     this->run_test_with_validation(32, 128, 3072, 3);
 }
 
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK4Test)
 {
-    // K=2048 for split_k=4: 2048/4=512=4×128 ✓
+    // K=2048 for split_k=4: 2048/4=512=4x128
     this->run_test_with_validation(32, 128, 2048, 4);
 }
 
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK5Test)
 {
-    // K=2560 for split_k=5: 2560/5=512=4×128 ✓
+    // K=2560 for split_k=5: 2560/5=512=4x128
     // Also K must be divisible by K_Tile(256)*split_k(5)=1280
     this->run_test_with_validation(32, 128, 2560, 5);
 }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp
index c076f89e59..5de4e64639 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_bquant_splitk_prefill.cpp
@@ -23,28 +23,28 @@ TYPED_TEST_SUITE(TestCkTileGemmBQuant, BQuantSplitKPrefillTypes);
 // BQuant split-K tests
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK2Test)
 {
-    // K=1024 for split_k=2: 1024/2=512=4×128 ✓
+    // K=1024 for split_k=2: 1024/2=512=4x128
     // K must be divisible by K_Tile(128)*split_k(2)=256
     this->run_test_with_validation(128, 128, 1024, 2);
 }
 
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK3Test)
 {
-    // K=3072 for split_k=3: 3072/3=1024=8×128 ✓
+    // K=3072 for split_k=3: 3072/3=1024=8x128
     // K must be divisible by K_Tile(128)*split_k(3)=384
     this->run_test_with_validation(128, 128, 3072, 3);
 }
 
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK4Test)
 {
-    // K=2048 for split_k=4: 2048/4=512=4×128 ✓
+    // K=2048 for split_k=4: 2048/4=512=4x128
     // K must be divisible by K_Tile(128)*split_k(4)=512
     this->run_test_with_validation(128, 128, 2048, 4);
 }
 
 TYPED_TEST(TestCkTileGemmBQuant, BQuantGroupedSplitK5Test)
 {
-    // K=1920 for split_k=5: 1920/5=384=3×128 ✓
+    // K=1920 for split_k=5: 1920/5=384=3x128
     // K must be divisible by K_Tile(128)*split_k(5)=640
     this->run_test_with_validation(128, 128, 1920, 5);
 }
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
index abe84bac2e..42d850c6bb 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
@@ -68,28 +68,20 @@ struct GemmConfigPrefillIntrawave : public GemmConfigBase
     static constexpr auto Scheduler          = ck_tile::GemmPipelineScheduler::Intrawave;
 };
 
-struct GemmConfigPrefillInterwave : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128;
-    static constexpr auto Scheduler          = ck_tile::GemmPipelineScheduler::Interwave;
-};
-
 struct GemmConfigDecodeIntrawave : public GemmConfigBase
 {
     static constexpr ck_tile::index_t M_Tile = 16;
     static constexpr ck_tile::index_t N_Tile = 64;
     static constexpr ck_tile::index_t K_Tile = 256;
-    static constexpr auto Scheduler          = ck_tile::GemmPipelineScheduler::Intrawave;
-};
-
-struct GemmConfigDecodeInterwave : public GemmConfigBase
-{
-    static constexpr ck_tile::index_t M_Tile = 16;
-    static constexpr ck_tile::index_t N_Tile = 64;
-    static constexpr ck_tile::index_t K_Tile = 256;
-    static constexpr auto Scheduler          = ck_tile::GemmPipelineScheduler::Interwave;
+    // Workaround for ROCm 7.13 compiler codegen regression on gfx1201 (RDNA4).
+    // AQuantGemmPipelineAgBgCrMem always uses the Intrawave pipeline implementation
+    // regardless of this value, but the Scheduler enum changes the Problem type identity,
+    // causing a different template instantiation. The Intrawave instantiation triggers
+    // incorrect ISA generation (wrong global instruction scheduling in the hot loop),
+    // producing ~3% wrong values for FP8/BF8 AQuant GEMM with K > K_Tile.
+    // Setting Interwave creates a different instantiation that gets correct codegen.
+    // Revert to Intrawave once the compiler is fixed.
+    static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Interwave;
 };
 
 struct GemmConfigMx : public GemmConfigBase
diff --git a/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp b/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp
index 8491e7abe2..0e2a37c892 100644
--- a/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp
+++ b/test/ck_tile/gemm_mx/test_mx_gemm_pipeline_wmma_base.hpp
@@ -56,11 +56,11 @@ class TestCkTileMxGemmPipelineWmmaBase : public TestCkTileMxGemmPipeline<Tuple,
         if constexpr(!b_is_f4 && !b_scale_e8m0)
             return false;
 
-        // Both E8M0 → always valid
+        // Both E8M0 -> always valid
         if constexpr(a_scale_e8m0 && b_scale_e8m0)
             return true;
 
-        // Both non-E8M0 → must match (both are F4 by rule 1)
+        // Both non-E8M0 -> must match (both are F4 by rule 1)
         if constexpr(!a_scale_e8m0 && !b_scale_e8m0)
             return std::is_same_v<AScaleDataType, BScaleDataType>;
 
diff --git a/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp b/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp
index 0e59d22f2c..60cc947253 100644
--- a/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp
+++ b/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight_streamk.cpp
@@ -458,8 +458,8 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_MultiGroup)
         2, 4, 128, 128, 3, 3, 16, 16, 4, 1)));
 }
 
-// Stride > 1 — shrinks Ho/Wo, changing the K/tile ratio and DP/SK split.
-// Hi=16, Wi=16, 3x3 filter, stride=2, pad=1 → Ho=Wo=8, GemmK=N*64
+// Stride > 1 - shrinks Ho/Wo, changing the K/tile ratio and DP/SK split.
+// Hi=16, Wi=16, 3x3 filter, stride=2, pad=1 -> Ho=Wo=8, GemmK=N*64
 TEST(StreamKConvBwdWeight, Linear_EndToEnd_Stride2)
 {
     EXPECT_TRUE((run_streamk_vs_splitk_test<TestKernel<LinearPartitioner>>(1,
@@ -504,8 +504,8 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_Stride2)
                                                                          1)));
 }
 
-// Pure DP — num_tiles evenly divides grid, so sk_ctas=0.
-// K=256, C=128, 3x3 → GemmM=256, GemmN=1152 → tiles=2*9=18, grid=3*1=3, 18%3=0
+// Pure DP - num_tiles evenly divides grid, so sk_ctas=0.
+// K=256, C=128, 3x3 -> GemmM=256, GemmN=1152 -> tiles=2*9=18, grid=3*1=3, 18%3=0
 TEST(StreamKConvBwdWeight, Linear_EndToEnd_PureDP)
 {
     EXPECT_TRUE((run_streamk_vs_splitk_test<TestKernel<LinearPartitioner>>(
@@ -518,8 +518,8 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_PureDP)
         1, 4, 256, 128, 3, 3, 16, 16, 3, 1)));
 }
 
-// Single output tile — all work is SK, zero DP tiles.
-// K=128, C=128, 1x1 filter, stride=1, pad=0 → GemmM=128, GemmN=128, tiles=1
+// Single output tile - all work is SK, zero DP tiles.
+// K=128, C=128, 1x1 filter, stride=1, pad=0 -> GemmM=128, GemmN=128, tiles=1
 TEST(StreamKConvBwdWeight, Linear_EndToEnd_SingleTile)
 {
     EXPECT_TRUE((run_streamk_vs_splitk_test<TestKernel<LinearPartitioner>>(1,
@@ -564,7 +564,7 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_SingleTile)
                                                                          0)));
 }
 
-// Large N — GemmK = 32*16*16 = 8192, many K iterations per tile.
+// Large N - GemmK = 32*16*16 = 8192, many K iterations per tile.
 TEST(StreamKConvBwdWeight, Linear_EndToEnd_LargeN)
 {
     EXPECT_TRUE((run_streamk_vs_splitk_test<TestKernel<LinearPartitioner>>(
@@ -577,7 +577,7 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_LargeN)
         1, 32, 128, 128, 3, 3, 16, 16, 4, 1)));
 }
 
-// Higher occupancy — doubles the grid, more SK CTAs share tiles.
+// Higher occupancy - doubles the grid, more SK CTAs share tiles.
 TEST(StreamKConvBwdWeight, Linear_EndToEnd_HigherOccupancy)
 {
     EXPECT_TRUE((run_streamk_vs_splitk_test<TestKernel<LinearPartitioner>>(
@@ -590,7 +590,7 @@ TEST(StreamKConvBwdWeight, Tree_EndToEnd_HigherOccupancy)
         1, 4, 128, 128, 3, 3, 16, 16, 4, 2)));
 }
 
-// Persistent DP — workgroups loop over DP tiles, then do SK work.
+// Persistent DP - workgroups loop over DP tiles, then do SK work.
 TEST(StreamKConvBwdWeight, LinearPersistent_EndToEnd_SmallShape)
 {
     EXPECT_TRUE((run_streamk_vs_splitk_test<TestKernel<LinearPersistentPartitioner>>(
@@ -619,7 +619,7 @@ TEST(StreamKConvBwdWeight, TreePersistent_EndToEnd_MultiGroup)
 // Negative tests: IsSupportedArgument should reject invalid shapes
 // ============================================================================
 
-// C not divisible by VectorSizeB (=8) → rejected
+// C not divisible by VectorSizeB (=8) -> rejected
 TEST(StreamKConvBwdWeight, IsSupportedArgument_RejectsUnalignedC)
 {
     using Kernel = TestKernel<LinearPartitioner>;
@@ -630,7 +630,7 @@ TEST(StreamKConvBwdWeight, IsSupportedArgument_RejectsUnalignedC)
     EXPECT_FALSE(Kernel::IsSupportedArgument(kargs));
 }
 
-// K not divisible by VectorSizeA (=4) → rejected
+// K not divisible by VectorSizeA (=4) -> rejected
 TEST(StreamKConvBwdWeight, IsSupportedArgument_RejectsUnalignedK)
 {
     using Kernel = TestKernel<TreePartitioner>;
diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
index 1c955ff98a..16c7dfcb29 100644
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
@@ -140,7 +140,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
         EXPECT_TRUE(Kernel::IsSupportedArgument(kargs));
 
         // Use the filtered kargs (zero-dim groups are excluded by MakeKargs) to derive
-        // the correct grid size and group count — not the raw gemm_descs vector.
+        // the correct grid size and group count - not the raw gemm_descs vector.
         const dim3 blocks = Kernel::BlockSize();
         if(kargs.empty())
             return;
@@ -468,7 +468,7 @@ class TestCkTileGroupedGemm : public ::testing::Test
         bool pass{true};
         for(int i = 0; i < group_count; ++i)
         {
-            // Groups with M=0 or N=0 produce no output — skip validation.
+            // Groups with M=0 or N=0 produce no output - skip validation.
             // K=0 groups do produce output (all zeros) and are validated normally.
             if(Ms[i] == 0 || Ns[i] == 0)
                 continue;
diff --git a/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp b/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp
index 2d75e0dac9..1519597219 100644
--- a/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp
+++ b/test/ck_tile/multicast_load/test_cluster_load_async_to_lds.cpp
@@ -1,28 +1,28 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
-// Unit test suite for cluster_multicast_load_async_to_lds<T> — the CK Tile wrapper
+// Unit test suite for cluster_multicast_load_async_to_lds<T> - the CK Tile wrapper
 // around CLUSTER_LOAD_ASYNC_TO_LDS_B* (gfx1250 only).
 //
 // Complements test_cluster_load_multicast.cpp (CLUSTER_LOAD_B, VGPR destination)
 // by testing behaviors unique to the async LDS path:
 //
-//   Group 1:  SingleWGP baseline — B32/B64/B128, mask=0x1 and mask=0x0
-//   Group 2:  LDSVisibility — non-requesting waves read LDS after barrier
-//   Group 3:  LDS address layout — per-lane VDST strided addressing
-//   Group 4:  MultiWGP broadcast — async LDS delivery at cluster scale (1D and 2D cluster dims)
-//   Group 5:  ASYNCcnt ordering — CLUSTER and GLOBAL async loads share one counter
-//   Group 6:  PartialBroadcast — non-contiguous mask, mixed instruction types
-//   Group 8:  MultiWGP + LDSVisibility — canonical GEMM tile-load pattern
-//   Group 10: ConcurrentGroups — LDS routing isolation between independent groups
-//   Group 11: BufferViewAsyncGet — cluster_async_get() through buffer_view,
+//   Group 1:  SingleWGP baseline - B32/B64/B128, mask=0x1 and mask=0x0
+//   Group 2:  LDSVisibility - non-requesting waves read LDS after barrier
+//   Group 3:  LDS address layout - per-lane VDST strided addressing
+//   Group 4:  MultiWGP broadcast - async LDS delivery at cluster scale (1D and 2D cluster dims)
+//   Group 5:  ASYNCcnt ordering - CLUSTER and GLOBAL async loads share one counter
+//   Group 6:  PartialBroadcast - non-contiguous mask, mixed instruction types
+//   Group 8:  MultiWGP + LDSVisibility - canonical GEMM tile-load pattern
+//   Group 10: ConcurrentGroups - LDS routing isolation between independent groups
+//   Group 11: BufferViewAsyncGet - cluster_async_get() through buffer_view,
 //             including ISA-specified INST_OFFSET behaviour
 //
 // Synchronization primitives used:
-//   s_wait_asynccnt<0>() — wait for all pending async LDS writes to complete.
+//   s_wait_asynccnt<0>() - wait for all pending async LDS writes to complete.
 //     ASYNCcnt decrements only when the LDS write is committed and visible to
 //     subsequent DS reads on the same wave.
-//   block_sync_lds_direct_load<0>() — s_wait_asynccnt<0> + s_barrier_signal/wait.
+//   block_sync_lds_direct_load<0>() - s_wait_asynccnt<0> + s_barrier_signal/wait.
 //     Used when multiple waves in a WG must synchronize after an async LDS load.
 
 #include "gtest/gtest.h"
@@ -109,7 +109,7 @@ void run_async_lds_test(const std::vector<T>& h_src, int mask, const char* test_
 }
 
 // ---------------------------------------------------------------------------
-// Group 1: SingleWGP — B32, B64, B128, mask=0x1
+// Group 1: SingleWGP - B32, B64, B128, mask=0x1
 // ---------------------------------------------------------------------------
 
 TEST(AsyncLDS, B32_SingleWGP)
@@ -149,7 +149,7 @@ TEST(AsyncLDS, B128_SingleWGP)
 }
 
 // ---------------------------------------------------------------------------
-// Group 2: LDSVisibility — cross-wave LDS sharing after async load
+// Group 2: LDSVisibility - cross-wave LDS sharing after async load
 // ---------------------------------------------------------------------------
 // 4 waves per WG (128 threads). Wave 0 issues the async cluster load into
 // LDS[0..31], then all waves synchronize via block_sync_lds_direct_load
@@ -267,7 +267,7 @@ TEST(LDSVisibility, B128_FourWaves)
 }
 
 // ---------------------------------------------------------------------------
-// Group 3: LDSAddressLayout — per-lane VDST addressing (strided)
+// Group 3: LDSAddressLayout - per-lane VDST addressing (strided)
 // ---------------------------------------------------------------------------
 // CLUSTER_LOAD_ASYNC_TO_LDS_B* supplies the LDS destination address via a
 // per-lane VGPR (VDST). Each lane independently specifies where in LDS its
@@ -276,7 +276,7 @@ TEST(LDSVisibility, B128_FourWaves)
 //
 // Each lane writes to lds_buf[lane_id * kStride], leaving kStride-1 unused
 // slots between lanes. The strided slots are zero-initialized before the
-// async load so that any unwritten slot reads back 0 — which cannot
+// async load so that any unwritten slot reads back 0 - which cannot
 // collide with src[i] = 1000 + i. If the hardware ignores VDST and
 // writes to lds_buf[lane_id] instead, lanes 1..31 read from their strided
 // slots and find zeros, causing a FAIL.
@@ -363,10 +363,10 @@ TEST(LDSAddressLayout, B32_Strided_SingleWGP)
 }
 
 // ---------------------------------------------------------------------------
-// Group 4: MultiWGP Broadcast — async LDS delivery at cluster scale
+// Group 4: MultiWGP Broadcast - async LDS delivery at cluster scale
 // ---------------------------------------------------------------------------
 // All WGPs in a cluster load from the same single source value (true broadcast).
-// Each lane within a WGP loads from shared_src → lds_buf[lane_id] via per-lane
+// Each lane within a WGP loads from shared_src -> lds_buf[lane_id] via per-lane
 // VDST, so every LDS slot in every WGP ends up holding the broadcast value.
 //
 // Wave 0 of each WGP issues the async cluster load; s_wait_asynccnt<0> ensures
@@ -569,11 +569,11 @@ TEST(MultiWGPBroadcast, B32_2x2Cluster)
 }
 
 // ---------------------------------------------------------------------------
-// Group 7: ZeroMask — mask=0x0 degrades to non-multicast async load
+// Group 7: ZeroMask - mask=0x0 degrades to non-multicast async load
 // ---------------------------------------------------------------------------
 // ISA spec: "If M0[15:0] == 0, this is treated as a non-Cluster-multicast load:
 // return only to the requesting WGP (it is not treated as 'do not return to
-// any wave')." Data still lands in the requesting WGP's LDS — no deadlock,
+// any wave')." Data still lands in the requesting WGP's LDS - no deadlock,
 // no lost load.
 
 TEST(AsyncLDS, B32_ZeroMask)
@@ -589,7 +589,7 @@ TEST(AsyncLDS, B32_ZeroMask)
 }
 
 // ---------------------------------------------------------------------------
-// Group 5: ASYNCcnt Ordering — CLUSTER_LOAD_ASYNC_TO_LDS + GLOBAL_LOAD_ASYNC_TO_LDS
+// Group 5: ASYNCcnt Ordering - CLUSTER_LOAD_ASYNC_TO_LDS + GLOBAL_LOAD_ASYNC_TO_LDS
 // ---------------------------------------------------------------------------
 // Both instructions share a single ASYNCcnt on gfx1250, so one
 // s_wait_asynccnt<0>() is sufficient to guarantee both async LDS writes
@@ -598,7 +598,7 @@ TEST(AsyncLDS, B32_ZeroMask)
 // A single wave issues both instructions back-to-back with no wait between:
 //   1. cluster_multicast_load_async_to_lds(src_a + lane) -> lds_a[lane]
 //   2. global_load_async_to_lds_b32(src_b + lane)        -> lds_b[lane]
-//   3. s_wait_asynccnt<0>()   — one wait must drain both
+//   3. s_wait_asynccnt<0>()   - one wait must drain both
 //   4. Read lds_a[lane] -> dst_a[lane], lds_b[lane] -> dst_b[lane]
 //
 // If s_wait_asynccnt only drained one instruction type, the wave would read
@@ -628,7 +628,7 @@ struct ASYNCcntOrderingKernel
         __builtin_amdgcn_global_load_async_to_lds_b32(
             ck_tile::to_global(src_b + lane_id), ck_tile::to_lds(lds_b + lane_id), 0, 0);
 
-        // Step 3: Single wait — must drain both async loads.
+        // Step 3: Single wait - must drain both async loads.
         ck_tile::s_wait_asynccnt<0>();
 
         // Step 4: Read both LDS slots. Correct data in both confirms shared
@@ -705,7 +705,7 @@ TEST(ASYNCcntOrdering, MixedAsyncLoads_B32_SingleWGP)
 }
 
 // ---------------------------------------------------------------------------
-// Group 6: PartialBroadcast — non-contiguous mask, mixed instruction types
+// Group 6: PartialBroadcast - non-contiguous mask, mixed instruction types
 // ---------------------------------------------------------------------------
 // 4 WGPs, mask = 0x5 (binary 0101). WGPs 0 and 2 participate in the cluster
 // multicast; WGPs 1 and 3 do not.
@@ -719,7 +719,7 @@ TEST(ASYNCcntOrdering, MixedAsyncLoads_B32_SingleWGP)
 //   Expected LDS: 5000 + lane in every slot
 //
 // This simultaneously verifies:
-//   1. Multicast data is delivered only to WGPs whose bits are set in M0 —
+//   1. Multicast data is delivered only to WGPs whose bits are set in M0 -
 //      non-participating WGPs do not receive the broadcast value.
 //   2. Both async instruction types coexist in the same cluster on the same
 //      ASYNCcnt without cross-contaminating each other's LDS destinations.
@@ -823,7 +823,7 @@ TEST(PartialBroadcast, B32_4WGP_Mask0x5)
 }
 
 // ---------------------------------------------------------------------------
-// Group 8: MultiWGP + LDSVisibility Combined — the canonical GEMM tile-load
+// Group 8: MultiWGP + LDSVisibility Combined - the canonical GEMM tile-load
 // ---------------------------------------------------------------------------
 // 4 WGPs in a cluster, 4 waves per WG (128 threads). Wave 0 of each WG issues
 // cluster_multicast_load_async_to_lds (true broadcast: all lanes load from the
@@ -966,7 +966,7 @@ TEST(MultiWGPLDSVisibility, B128_4WGP_4Waves)
 }
 
 // ---------------------------------------------------------------------------
-// Group 10: ConcurrentGroups — LDS routing isolation between independent groups
+// Group 10: ConcurrentGroups - LDS routing isolation between independent groups
 // ---------------------------------------------------------------------------
 // 4 WGPs in one cluster, two independent broadcast groups:
 //   WGPs 0/1: mask = 0x3, load val_a into LDS
@@ -977,7 +977,7 @@ TEST(MultiWGPLDSVisibility, B128_4WGP_4Waves)
 //
 // For CLUSTER_LOAD_B (VGPR destination), misdirected data would land in a
 // per-thread VGPR that is private to one wave and physically unreadable by
-// another WG — so VGPR tests cannot detect LDS routing bugs. Here, if the
+// another WG - so VGPR tests cannot detect LDS routing bugs. Here, if the
 // hardware routes val_a to WGPs 2/3's LDS (or vice versa), the host
 // verification catches it. This is the only test that can expose such a bug.
 
@@ -1100,7 +1100,7 @@ TEST(ConcurrentGroupsLDS, B128_4WGP_TwoGroups)
 }
 
 // ---------------------------------------------------------------------------
-// Group 11: BufferViewAsyncGet — cluster_async_get() through buffer_view
+// Group 11: BufferViewAsyncGet - cluster_async_get() through buffer_view
 // ---------------------------------------------------------------------------
 // Tests the buffer_view::cluster_async_get() interface, which wraps
 // cluster_multicast_load_async_to_lds and handles global pointer arithmetic
@@ -1126,7 +1126,7 @@ using TestBufView = ck_tile::buffer_view<ck_tile::address_space_enum::global,
                                          true,
                                          ck_tile::amd_buffer_coherence_enum::coherence_default>;
 
-// Kernel 1: basic load — each lane loads src[lane_id] into lds_buf[lane_id]
+// Kernel 1: basic load - each lane loads src[lane_id] into lds_buf[lane_id]
 // via buffer_view::cluster_async_get.
 struct BufferViewBasicKernel
 {
@@ -1189,9 +1189,9 @@ TEST(BufferViewAsyncGet, B32_BasicLoad)
         EXPECT_EQ(h_dst[i], h_src[i]) << "B32_BasicLoad: mismatch at lane " << i;
 }
 
-// Kernel 2: inst_offset=4 — each lane supplies VDST = &lds_buf[lane*2].
+// Kernel 2: inst_offset=4 - each lane supplies VDST = &lds_buf[lane*2].
 // ISA applies inst_offset to both VADDR and VDST:
-//   LDS[VDST+4] = GLOBAL[VADDR+4]  →  lds_buf[lane*2+1] = src[lane+1]
+//   LDS[VDST+4] = GLOBAL[VADDR+4]  ->  lds_buf[lane*2+1] = src[lane+1]
 // src is allocated with NUM_LANES+1 elements so lane 31 reads src[32] safely.
 struct BufferViewInstOffsetKernel
 {
@@ -1263,7 +1263,7 @@ TEST(BufferViewAsyncGet, B32_InstOffset)
     const int sentinel = 0xDEADBEEF;
     for(int i = 0; i < NUM_LANES; i++)
     {
-        // Even slot (VDST): inst_offset skips this — sentinel must remain.
+        // Even slot (VDST): inst_offset skips this - sentinel must remain.
         EXPECT_EQ(h_dst[i * 2], sentinel)
             << "lane " << i << " even slot: expected sentinel, got " << h_dst[i * 2];
 
diff --git a/test/ck_tile/utility/test_sequence.cpp b/test/ck_tile/utility/test_sequence.cpp
index 9e75411e64..9f468bc6a1 100644
--- a/test/ck_tile/utility/test_sequence.cpp
+++ b/test/ck_tile/utility/test_sequence.cpp
@@ -178,7 +178,7 @@ TEST(CkTileSequence, UniformSequenceGenLarger)
 }
 
 // ============================================================================
-// sequence_reverse_inclusive_scan tests — runtime value verification
+// sequence_reverse_inclusive_scan tests - runtime value verification
 // ============================================================================
 
 TEST(CkTileSequence, ReverseInclusiveScanProduct)
@@ -217,7 +217,7 @@ TEST(CkTileSequence, ReverseInclusiveScanEmpty)
 }
 
 // ============================================================================
-// sequence_inclusive_scan (forward) tests — runtime value verification
+// sequence_inclusive_scan (forward) tests - runtime value verification
 // ============================================================================
 
 TEST(CkTileSequence, ForwardInclusiveScanSum)
@@ -273,7 +273,7 @@ TEST(CkTileSequence, ForwardInclusiveScanEmpty)
 }
 
 // ============================================================================
-// sequence_map_inverse tests — runtime round-trip verification
+// sequence_map_inverse tests - runtime round-trip verification
 // ============================================================================
 
 TEST(CkTileSequence, MapInverseIdentity)
diff --git a/test/ck_tile/utility/test_static_ford.cpp b/test/ck_tile/utility/test_static_ford.cpp
index 7337471647..e9b2e8c197 100644
--- a/test/ck_tile/utility/test_static_ford.cpp
+++ b/test/ck_tile/utility/test_static_ford.cpp
@@ -10,7 +10,7 @@
 using namespace ck_tile;
 
 // ============================================================================
-// static_ford Tests — Identity Order (default)
+// static_ford Tests - Identity Order (default)
 // ============================================================================
 
 TEST(CkTileStaticFord, Identity2D)
@@ -122,7 +122,7 @@ TEST(CkTileStaticFord, IdentityWithUnitDim)
 }
 
 // ============================================================================
-// static_ford Tests — Non-Identity Order (primary template with decompose_reordered)
+// static_ford Tests - Non-Identity Order (primary template with decompose_reordered)
 // ============================================================================
 
 TEST(CkTileStaticFord, ReversedOrder2D)
diff --git a/test/cluster_launch/test_cluster_launch.cpp b/test/cluster_launch/test_cluster_launch.cpp
index 0686fed2c8..648c530524 100644
--- a/test/cluster_launch/test_cluster_launch.cpp
+++ b/test/cluster_launch/test_cluster_launch.cpp
@@ -131,7 +131,7 @@ TEST(ClusterLaunch, ClusterBuiltins)
     out_mem.FromDevice(out_host.data());
 
     // cluster_id_x = blockIdx.x / clusterSize
-    // Blocks 0,1 → cluster 0; Blocks 2,3 → cluster 1
+    // Blocks 0,1 -> cluster 0; Blocks 2,3 -> cluster 1
     for(int block = 0; block < numBlocks; ++block)
     {
         int expected_cluster_id = block / clusterSize;
diff --git a/test/cluster_load/test_cluster_load.cpp b/test/cluster_load/test_cluster_load.cpp
index 5a5d4cf646..b0be4be2fc 100644
--- a/test/cluster_load/test_cluster_load.cpp
+++ b/test/cluster_load/test_cluster_load.cpp
@@ -18,7 +18,7 @@ using ::ck::DeviceMem;
 constexpr int kTileSize = 32;
 
 //
-// cluster_load: Global → VGPRs with a WGP participation mask.
+// cluster_load: Global -> VGPRs with a WGP participation mask.
 // Templated kernels covering 4-byte (int), 8-byte (int2), and 16-byte (int4) loads.
 //
 
diff --git a/test/cluster_load/test_cluster_load_async.cpp b/test/cluster_load/test_cluster_load_async.cpp
index a18a9c8291..0a1113a3c3 100644
--- a/test/cluster_load/test_cluster_load_async.cpp
+++ b/test/cluster_load/test_cluster_load_async.cpp
@@ -18,7 +18,7 @@ using ::ck::DeviceMem;
 constexpr int kTileSize = 32;
 
 //
-// cluster_load_async: Global → LDS with a WGP participation mask.
+// cluster_load_async: Global -> LDS with a WGP participation mask.
 // Templated kernels covering 1-byte (char), 4-byte (int), 8-byte (int2),
 // and 16-byte (int4) async loads.
 //
@@ -28,7 +28,7 @@ extern __shared__ char shared_lds[];
 
 // --- Templated kernels ----------------------------------------------------
 
-// Single WGP, async load global → LDS, copy LDS → output. mask = 0x1.
+// Single WGP, async load global -> LDS, copy LDS -> output. mask = 0x1.
 template <typename T>
 __global__ void
 cluster_load_async_single_wgp_kernel(const T* __restrict__ in, T* __restrict__ out, int n)
@@ -131,7 +131,7 @@ __global__ void cluster_load_async_partial_mask_kernel(
     }
 }
 
-// LDS bounds check — sentinel region adjacent to loaded tile remains zero.
+// LDS bounds check - sentinel region adjacent to loaded tile remains zero.
 template <typename T>
 __global__ void
 cluster_load_async_bounds_check_kernel(const T* __restrict__ in, T* __restrict__ out, int n)
diff --git a/test/data_type/test_mx_bf6_pk4scale.cpp b/test/data_type/test_mx_bf6_pk4scale.cpp
index 9820549730..bed773538a 100644
--- a/test/data_type/test_mx_bf6_pk4scale.cpp
+++ b/test/data_type/test_mx_bf6_pk4scale.cpp
@@ -95,7 +95,7 @@ TYPED_TEST(MXBF6Pk4ScaleTypedTest, DeviceWavewiseBlock32)
     constexpr float Val = 2.0f;
     std::vector<T> out(M * N, -1.0f);
     std::vector<uint32_t> scale(2 * M);
-    // Test scale variations: 16 rows × 128 columns
+    // Test scale variations: 16 rows x 128 columns
     // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4)
     // - Within a row, every 16 consecutive columns share the same scale factor
     for(int m = 0; m < M; m++)
@@ -237,7 +237,7 @@ TYPED_TEST(MXBF6Pk4ScaleTypedTest, DeviceWavewiseBlock16)
     constexpr float Val = 2.0f;
     std::vector<T> out(M * N, -1.0f);
     std::vector<uint32_t> scale(2 * M);
-    // Test scale variations: 16 rows × 128 columns
+    // Test scale variations: 16 rows x 128 columns
     // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4)
     // - Within a row, every 16 consecutive columns share the same scale factor
     for(int m = 0; m < M; m++)
diff --git a/test/data_type/test_mx_fp6_pk4scale.cpp b/test/data_type/test_mx_fp6_pk4scale.cpp
index e3e99e49aa..3fba00d68c 100644
--- a/test/data_type/test_mx_fp6_pk4scale.cpp
+++ b/test/data_type/test_mx_fp6_pk4scale.cpp
@@ -95,7 +95,7 @@ TYPED_TEST(MXFP6Pk4ScaleTypedTest, DeviceWavewiseBlock32)
     constexpr float Val = 2.0f;
     std::vector<T> out(M * N, -1.0f);
     std::vector<uint32_t> scale(2 * M);
-    // Test scale variations: 16 rows × 128 columns
+    // Test scale variations: 16 rows x 128 columns
     // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4)
     // - Within a row, every 16 consecutive columns share the same scale factor
     for(int m = 0; m < M; m++)
@@ -238,7 +238,7 @@ TYPED_TEST(MXFP6Pk4ScaleTypedTest, DeviceWavewiseBlock16)
     constexpr float Val = 2.0f;
     std::vector<T> out(M * N, -1.0f);
     std::vector<uint32_t> scale(2 * M);
-    // Test scale variations: 16 rows × 128 columns
+    // Test scale variations: 16 rows x 128 columns
     // - Each row has different 8 scale factor (scale[m]-packed 4 and scale[m+16]-packed4)
     // - Within a row, every 16 consecutive columns share the same scale factor
     for(int m = 0; m < M; m++)
diff --git a/test/gemm/test_gemm_vgpr.cpp b/test/gemm/test_gemm_vgpr.cpp
index 46fabf74f3..45a6ef7861 100644
--- a/test/gemm/test_gemm_vgpr.cpp
+++ b/test/gemm/test_gemm_vgpr.cpp
@@ -1,4 +1,4 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
 /**
diff --git a/test/mx_wmma_op/mx_wmma_op.hpp b/test/mx_wmma_op/mx_wmma_op.hpp
index 242111f273..31c551d445 100644
--- a/test/mx_wmma_op/mx_wmma_op.hpp
+++ b/test/mx_wmma_op/mx_wmma_op.hpp
@@ -853,7 +853,7 @@ struct TestMXWMMA
                 std::vector<std::size_t> idx({i, j});
                 if constexpr(is_same_v<DataType, f4x2_pk_t>)
                 {
-                    // f4x2_pk_t packs two f4 values — print both
+                    // f4x2_pk_t packs two f4 values - print both
                     auto pack = mat(idx);
                     std::cout << ck::type_convert<float>(f4_t(pack.template unpack<>(Number<0>{})))
                               << "/" // lo/hi separator within a packed element
@@ -863,7 +863,7 @@ struct TestMXWMMA
                 else if constexpr(is_same_v<DataType, f6x16_pk_t> ||
                                   is_same_v<DataType, f6x32_pk_t>)
                 {
-                    // f6_pk_t packs packed_size f6_t values — print all
+                    // f6_pk_t packs packed_size f6_t values - print all
                     auto pack = mat(idx);
                     for(index_t k = 0; k < DataType::packed_size; ++k)
                     {
@@ -876,7 +876,7 @@ struct TestMXWMMA
                 else if constexpr(is_same_v<DataType, bf6x16_pk_t> ||
                                   is_same_v<DataType, bf6x32_pk_t>)
                 {
-                    // bf6_pk_t packs packed_size bf6_t values — print all
+                    // bf6_pk_t packs packed_size bf6_t values - print all
                     auto pack = mat(idx);
                     for(index_t k = 0; k < DataType::packed_size; ++k)
                     {
diff --git a/test/synchronization/monitor_mwait.cpp b/test/synchronization/monitor_mwait.cpp
index 8fa418cc5a..b3490bb08a 100644
--- a/test/synchronization/monitor_mwait.cpp
+++ b/test/synchronization/monitor_mwait.cpp
@@ -1,4 +1,4 @@
-// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier:  MIT
 
 #include "gtest/gtest.h"
diff --git a/tile_engine/ops/pooling/pooling_benchmark_single.cpp b/tile_engine/ops/pooling/pooling_benchmark_single.cpp
index 0d872a9f51..ff71081883 100644
--- a/tile_engine/ops/pooling/pooling_benchmark_single.cpp
+++ b/tile_engine/ops/pooling/pooling_benchmark_single.cpp
@@ -30,7 +30,7 @@
 // The kernel header is included via compile command line with -include flag.
 
 // --------------------------------------------------------------------------
-// Benchmark implementation — templated on pooling dimension so that only
+// Benchmark implementation - templated on pooling dimension so that only
 // the matching branch is instantiated (2D or 3D).
 // --------------------------------------------------------------------------