diff --git a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt index c59363fc37..bbd34caa61 100644 --- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt +++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt @@ -10,11 +10,11 @@ if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95") add_executable(${EXE_NAME} EXCLUDE_FROM_ALL gemm_quant.cpp gemm_aquant_quantgrouped.cpp - gemm_bquant_quantgourped_prefill_bf8i4.cpp - gemm_bquant_quantgourped_prefill_fp8i4.cpp - gemm_bquant_quantgourped_prefill_bf8.cpp - gemm_bquant_quantgourped_prefill_fp8.cpp - gemm_bquant_quantgourped_preshuffleb_prefill.cpp + gemm_bquant_quantgrouped_prefill_bf8i4.cpp + gemm_bquant_quantgrouped_prefill_fp8i4.cpp + gemm_bquant_quantgrouped_prefill_bf8.cpp + gemm_bquant_quantgrouped_prefill_fp8.cpp + gemm_bquant_quantgrouped_preshuffleb_prefill.cpp ) target_compile_options(${EXE_NAME} PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) else() diff --git a/example/ck_tile/38_block_scale_gemm/README.md b/example/ck_tile/38_block_scale_gemm/README.md index 496697ca32..55150e0599 100644 --- a/example/ck_tile/38_block_scale_gemm/README.md +++ b/example/ck_tile/38_block_scale_gemm/README.md @@ -40,23 +40,31 @@ This will result in an executable `build/bin/tile_example_gemm_quant_basic` ## example ``` args: - -b batch size (default:1) - -m m dimension (default:1024) - -n n dimension (default:2048) - -k k dimension (default:64) - -a_layout Tensor A data layout (default: R) - -b_layout Tensor B data layout (default: C) - -c_layout Tensor C data layout (default: R) - -stride_a Tensor A stride (default:0) - -stride_b Tensor B stride (default:0) - -stride_c Tensor C stride (default:0) - -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:1) - -e Absolute error tolerance (default:1e-5) - -prec data type. fp8/bf8/i4fp8/i4bf8/i4f32fp8/i4f32bf8 (default:fp8) - -warmup number of iterations before benchmark the kernel (default:10) - -repeat number of iterations to benchmark the kernel (default:100) - -timer gpu:gpu timer, cpu:cpu timer (default:gpu) - -quant_mode Which quant method to use (aquant, bquant, tensor, rowcol) + -h Print help message (default:false) + -m m dimension (default:3840) + -n n dimension (default:4096) + -k k dimension (default:2048) + -a_layout A tensor data layout - Row or Column (default:R) + -b_layout B tensor data layout - Row or Column (default:C) + -bq_layout Bq tensor data layout - Row or Column (default:C) + -c_layout C tensor data layout - Row or Column (default:R) + -stride_a Tensor A stride (default:0) + -stride_q Tensor AQ stride (default:0) + -stride_b Tensor B stride (default:0) + -stride_c Tensor C stride (default:0) + -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:1) + -prec Data type. For AQuant: fp8/bf8/i4fp8/i4bf8, For Bquant: fp8/bf8/fp8i4/bf8i4 (default:fp8) + -warmup Number of iterations before benchmark the kernel (default:50) + -repeat Number of iterations to benchmark the kernel (default:1000) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) + -split_k SplitK value (default:1) + -device Device id that will be used to run the kernel (default:0) + -init 0:random, 1:linear, 2:constant(1) (default:0) + -flush_cache Flush cache before running the kernel (default:true) +-rotating_count Rotating count (default:1000) + -quant_mode Choose aquant, bquant, tensor or rowcol (default:bquant) + -preshuffleb Enable preshuffle of tensor B (default:false) + -group_size Quantization group size as MxNxK, e.g., 1x1x128, 1x32x128, 1x64x128 (default:1x1x128) ``` User need to select correct mapping of config for each quant mode: diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_quantgrouped.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_quantgrouped.cpp index 5e83963d20..0141d7b581 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_quantgrouped.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_quantgrouped.cpp @@ -1,10 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #include "run_gemm_quant_example.inc" diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_bf8.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8.cpp similarity index 85% rename from example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_bf8.cpp rename to example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8.cpp index 11f5374ed7..ede217b9b8 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_bf8.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8.cpp @@ -1,10 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #include "run_gemm_quant_example.inc" diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_bf8i4.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8i4.cpp similarity index 86% rename from example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_bf8i4.cpp rename to example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8i4.cpp index 94f24d2f5f..8632d97c89 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_bf8i4.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8i4.cpp @@ -1,10 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #include "run_gemm_quant_example.inc" diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_fp8.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8.cpp similarity index 85% rename from example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_fp8.cpp rename to example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8.cpp index 505cda1233..8c77a517a5 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_fp8.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8.cpp @@ -1,10 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #include "run_gemm_quant_example.inc" diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_fp8i4.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8i4.cpp similarity index 86% rename from example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_fp8i4.cpp rename to example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8i4.cpp index 9687feec04..cd8bc56356 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_prefill_fp8i4.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8i4.cpp @@ -1,10 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #include "run_gemm_quant_example.inc" diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_preshuffleb_prefill.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_preshuffleb_prefill.cpp similarity index 100% rename from example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgourped_preshuffleb_prefill.cpp rename to example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_preshuffleb_prefill.cpp diff --git a/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp b/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp index a4dc550b26..a650152502 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp @@ -1,15 +1,9 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #include #include #include -#include #include #include @@ -22,14 +16,14 @@ auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("h", "false", "print help message") + arg_parser.insert("h", "false", "Print help message") .insert("m", "3840", "m dimension") .insert("n", "4096", "n dimension") .insert("k", "2048", "k dimension") - .insert("a_layout", "R", "A tensor data layout - Row by default") - .insert("b_layout", "C", "B tensor data layout - Column by default") - .insert("bq_layout", "C", "Bq tensor data layout - Column by default") - .insert("c_layout", "R", "C tensor data layout - Row by default") + .insert("a_layout", "R", "A tensor data layout - Row or Column") + .insert("b_layout", "C", "B tensor data layout - Row or Column") + .insert("bq_layout", "C", "Bq tensor data layout - Row or Column") + .insert("c_layout", "R", "C tensor data layout - Row or Column") .insert("stride_a", "0", "Tensor A stride") .insert("stride_q", "0", "Tensor AQ stride") .insert("stride_b", "0", "Tensor B stride") @@ -37,17 +31,17 @@ auto create_args(int argc, char* argv[]) .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") .insert("prec", "fp8", - "data type. For AQuant: fp8/bf8/i4fp8/i4bf8, For Bquant: fp8/bf8/fp8i4/bf8i4") - .insert("warmup", "50", "number of iterations before benchmark the kernel") - .insert("repeat", "1000", "number of iterations to benchmark the kernel") + "Data type. For AQuant: fp8/bf8/i4fp8/i4bf8, For Bquant: fp8/bf8/fp8i4/bf8i4") + .insert("warmup", "50", "Number of iterations before benchmark the kernel") + .insert("repeat", "1000", "Number of iterations to benchmark the kernel") .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") - .insert("split_k", "1", "splitK value") - .insert("device", "0", "device id that will be used to run the kernel, default 0") + .insert("split_k", "1", "SplitK value") + .insert("device", "0", "Device id that will be used to run the kernel") .insert("init", "0", "0:random, 1:linear, 2:constant(1)") - .insert("flush_cache", "true", "flush cache before running the kernel, default to true") - .insert("rotating_count", "1000", "rotating count, defaults to 1") - .insert("quant_mode", "bquant", "Choose aquant (default), bquant, tensor or rowcol") - .insert("preshuffleb", "false", "Enable preshuffle tensor B, default false") + .insert("flush_cache", "true", "Flush cache before running the kernel") + .insert("rotating_count", "1000", "Rotating count") + .insert("quant_mode", "bquant", "Choose aquant, bquant, tensor or rowcol") + .insert("preshuffleb", "false", "Enable preshuffle of tensor B") .insert("group_size", "1x1x128", "Quantization group size as MxNxK, e.g., 1x1x128, 1x32x128, 1x64x128"); diff --git a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp index c09d7d6b87..c0c424b76c 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -17,6 +17,8 @@ inline size_t hash_multiple_strings(const std::vector& inputs) size_t combined_hash = 0; for(const auto& str : inputs) { + // Hash combine using golden ratio constant and bit shifts for good distribution and + // order-dependent mixing combined_hash ^= hasher(str) + 0x9e3779b9 + (combined_hash << 6) + (combined_hash >> 2); } return combined_hash; diff --git a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc index 6a1c1a7a53..5089a6ea9a 100644 --- a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc +++ b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c), Advanced Micro Devices, Inc. All rights reserved. #pragma once #include @@ -7,7 +7,6 @@ #include #include #include -#include #include #include