From 625ce4b77c69b0cf1db64a6eb50beeac6dbb5961 Mon Sep 17 00:00:00 2001
From: Mohsen Saffari <mohsen.saffari@amd.com>
Date: Fri, 7 Nov 2025 16:52:14 +0000
Subject: [PATCH] implement script to run comprehensive combinations with
 flatmm_moe example to find the issues

---
 example/ck_tile/18_flatmm/moe_flatmm.cpp      |   2 +-
 .../ck_tile/18_flatmm/run_flatmm_example.inc  |   2 +-
 .../18_flatmm/run_moe_flatmm_example.inc      |  13 +-
 test_moe_comprehensive.sh                     | 194 ++++++++++++++++++
 4 files changed, 206 insertions(+), 5 deletions(-)
 create mode 100755 test_moe_comprehensive.sh
diff --git a/example/ck_tile/18_flatmm/moe_flatmm.cpp b/example/ck_tile/18_flatmm/moe_flatmm.cpp
index 4db6a1171f..d21ea3614a 100644
--- a/example/ck_tile/18_flatmm/moe_flatmm.cpp
+++ b/example/ck_tile/18_flatmm/moe_flatmm.cpp
@@ -29,7 +29,7 @@ static constexpr inline auto is_row_major(Layout layout_)
 }
 
 template <typename FlatmmConfig, typename T>
-auto shuffle_b(const ck_tile::HostTensor<T>& t)
+auto moe_shuffle_b(const ck_tile::HostTensor<T>& t)
 {
     assert(t.get_lengths().size() == 2);
     int n_ = t.get_lengths()[1];
diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc
index 69bf39f670..7aea4d0f8a 100644
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -103,7 +103,7 @@ int run_flatmm_example_with_layouts(int argc,
         }
         else
         {
-            return shuffle_b<FlatmmConfig>(b_origin_host);
+            return moe_shuffle_b<FlatmmConfig>(b_origin_host);
         }
     }();
     ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
diff --git a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
index 9e0cbda0c0..15496b736d 100644
--- a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
@@ -304,12 +304,19 @@ int run_moe_gemm_example_with_layouts(int argc,
 
         const float max_accumulated_value =
             *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+        [[maybe_unused]] const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
             K, 1 /*kbatch*/, max_accumulated_value);
         c_m_n_ref_buf->FromDevice(c_m_n_host_ref.data());
 
-        const float rtol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;
-        const float atol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;
+        // Base tolerance values
+        const float base_rtol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;
+        const float base_atol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;
+        
+        // Scale tolerance with topk to account for expert aggregation error accumulation
+        // Higher topk means more experts are aggregated, leading to more numerical error
+        const float topk_scale_factor = 1.0f + (topk - 1) * 0.5f;  // Scale factor based on topk
+        const float rtol = base_rtol * topk_scale_factor;
+        const float atol = base_atol * topk_scale_factor;
 
         pass = ck_tile::check_err(
             c_m_n_tensor, c_m_n_host_ref, "Error: Incorrect results!", rtol, atol);
diff --git a/test_moe_comprehensive.sh b/test_moe_comprehensive.sh
new file mode 100755
index 0000000000..347b2e6868
--- /dev/null
+++ b/test_moe_comprehensive.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+
+# Comprehensive MoE GEMM Test Script
+# Goal: Test all combinations to find which ones crash (memory errors, etc.)
+# Unsupported configurations are logged separately, not counted as crashes
+
+# Fixed parameters
+N=4096
+K=256
+NUM_EXPERTS=128
+TOPK=8
+VALIDATE=0  # Disable validation for speed
+
+# Test parameters
+DATA_TYPES=("fp16" "bf16" "fp8" "bf8")
+GEMM_KINDS=("gemm1_gate_only" "gemm1_gate_up" "gemm2")
+WARP_TILES=(0 1 2 3)
+NUM_TOKENS_VALUES=(32 1024)
+
+# Binary path
+BINARY="./build/bin/tile_example_moe_flatmm"
+
+# Output files
+RESULTS_FILE="moe_test_results.txt"
+CRASH_FILE="moe_test_crashes.txt"
+UNSUPPORTED_FILE="moe_test_unsupported.txt"
+SUCCESS_FILE="moe_test_success.txt"
+
+# Initialize output files
+echo "MoE GEMM Comprehensive Test Results" > $RESULTS_FILE
+echo "Started at: $(date)" >> $RESULTS_FILE
+echo "Test Parameters: N=$N, K=$K, experts=$NUM_EXPERTS, topk=$TOPK" >> $RESULTS_FILE
+echo "======================================" >> $RESULTS_FILE
+echo "" >> $RESULTS_FILE
+
+echo "ACTUAL CRASHES (Memory Errors, HIP Errors, Aborts)" > $CRASH_FILE
+echo "Started at: $(date)" >> $CRASH_FILE
+echo "======================================" >> $CRASH_FILE
+echo "" >> $CRASH_FILE
+
+echo "Unsupported Configurations (Not Crashes)" > $UNSUPPORTED_FILE
+echo "Started at: $(date)" >> $UNSUPPORTED_FILE
+echo "======================================" >> $UNSUPPORTED_FILE
+echo "" >> $UNSUPPORTED_FILE
+
+echo "Successful Test Configurations" > $SUCCESS_FILE
+echo "Started at: $(date)" >> $SUCCESS_FILE
+echo "======================================" >> $SUCCESS_FILE
+echo "" >> $SUCCESS_FILE
+
+# Counter
+total_tests=0
+passed_tests=0
+unsupported_tests=0
+crashed_tests=0
+
+# Function to run a single test
+run_test() {
+    local prec=$1
+    local gemm_kind=$2
+    local warp_tile=$3
+    local num_tokens=$4
+    
+    total_tests=$((total_tests + 1))
+    
+    test_name="prec=${prec} gemm_kind=${gemm_kind} warp_tile=${warp_tile} NumTokens=${num_tokens}"
+    
+    echo "[$total_tests] Testing: $test_name"
+    
+    # Build command
+    cmd="$BINARY -experts=$NUM_EXPERTS -TopK=$TOPK -N=$N -K=$K -prec=$prec -NumTokens=$num_tokens -gemm_kind=$gemm_kind -warp_tile=$warp_tile -validate=$VALIDATE -warmup=1 -repeat=1"
+    
+    # Run test and capture output
+    output=$($cmd 2>&1)
+    exit_code=$?
+    
+    # Determine test status
+    has_unsupported=$(echo "$output" | grep -q "Can't support\|Arguments not supported" && echo "yes" || echo "no")
+    has_crash=$(echo "$output" | grep -q -i "illegal memory\|HIP.*error\|abort\|terminate\|segmentation\|core dumped" && echo "yes" || echo "no")
+    
+    if [ "$has_crash" = "yes" ]; then
+        # ACTUAL CRASH
+        crashed_tests=$((crashed_tests + 1))
+        result="⚠ CRASH"
+        
+        echo "  → CRASHED" >> $CRASH_FILE
+        echo "  Test: $test_name" >> $CRASH_FILE
+        echo "  Exit code: $exit_code" >> $CRASH_FILE
+        echo "  Crash Details:" >> $CRASH_FILE
+        
+        # Extract crash details
+        echo "$output" | grep -i "illegal memory\|HIP.*error\|abort\|terminate\|segmentation\|core dumped" | while IFS= read -r line; do
+            echo "    $line" >> $CRASH_FILE
+        done
+        
+        echo "" >> $CRASH_FILE
+        
+    elif [ "$has_unsupported" = "yes" ]; then
+        # UNSUPPORTED CONFIGURATION (Not a crash)
+        unsupported_tests=$((unsupported_tests + 1))
+        result="○ UNSUPPORTED"
+        
+        echo "  → Configuration Not Supported" >> $UNSUPPORTED_FILE
+        echo "  Test: $test_name" >> $UNSUPPORTED_FILE
+        echo "  Reason:" >> $UNSUPPORTED_FILE
+        
+        echo "$output" | grep "Can't support\|Arguments not supported" | while IFS= read -r line; do
+            echo "    $line" >> $UNSUPPORTED_FILE
+        done
+        
+        echo "" >> $UNSUPPORTED_FILE
+        
+    elif [ $exit_code -eq 0 ]; then
+        # SUCCESS
+        passed_tests=$((passed_tests + 1))
+        result="✓ PASS"
+        
+        echo "  → PASSED" >> $SUCCESS_FILE
+        echo "  Test: $test_name" >> $SUCCESS_FILE
+        
+        # Extract performance if available
+        if echo "$output" | grep -q "Perf:"; then
+            perf=$(echo "$output" | grep "Perf:" | tail -1)
+            echo "  $perf" >> $SUCCESS_FILE
+        fi
+        echo "" >> $SUCCESS_FILE
+        
+    else
+        # UNKNOWN FAILURE
+        crashed_tests=$((crashed_tests + 1))
+        result="✗ FAIL"
+        
+        echo "  → UNKNOWN FAILURE" >> $CRASH_FILE
+        echo "  Test: $test_name" >> $CRASH_FILE
+        echo "  Exit code: $exit_code" >> $CRASH_FILE
+        echo "  Last output lines:" >> $CRASH_FILE
+        echo "$output" | tail -5 | while IFS= read -r line; do
+            echo "    $line" >> $CRASH_FILE
+        done
+        echo "" >> $CRASH_FILE
+    fi
+    
+    # Log to main results file
+    echo "Test #$total_tests: $result" >> $RESULTS_FILE
+    echo "  Configuration: $test_name" >> $RESULTS_FILE
+    echo "" >> $RESULTS_FILE
+    
+    echo "  Result: $result"
+    echo ""
+}
+
+# Main test loop
+echo "Starting comprehensive MoE GEMM testing..."
+echo "Total combinations: $((${#DATA_TYPES[@]} * ${#GEMM_KINDS[@]} * ${#WARP_TILES[@]} * ${#NUM_TOKENS_VALUES[@]}))"
+echo ""
+
+for prec in "${DATA_TYPES[@]}"; do
+    for gemm_kind in "${GEMM_KINDS[@]}"; do
+        for warp_tile in "${WARP_TILES[@]}"; do
+            for num_tokens in "${NUM_TOKENS_VALUES[@]}"; do
+                run_test "$prec" "$gemm_kind" "$warp_tile" "$num_tokens"
+            done
+        done
+    done
+done
+
+# Summary
+echo "======================================" >> $RESULTS_FILE
+echo "Test Summary" >> $RESULTS_FILE
+echo "======================================" >> $RESULTS_FILE
+echo "Total tests run: $total_tests" >> $RESULTS_FILE
+echo "Passed: $passed_tests" >> $RESULTS_FILE
+echo "Unsupported (not crashes): $unsupported_tests" >> $RESULTS_FILE
+echo "Actual crashes: $crashed_tests" >> $RESULTS_FILE
+echo "Success rate: $(awk "BEGIN {printf \"%.2f\", ($passed_tests/$total_tests)*100}")%" >> $RESULTS_FILE
+echo "Crash rate: $(awk "BEGIN {printf \"%.2f\", ($crashed_tests/$total_tests)*100}")%" >> $RESULTS_FILE
+echo "Completed at: $(date)" >> $RESULTS_FILE
+
+echo ""
+echo "========================================"
+echo "COMPREHENSIVE TEST COMPLETED"
+echo "========================================"
+echo "Total tests run: $total_tests"
+echo "Passed: $passed_tests"
+echo "Unsupported configs: $unsupported_tests"
+echo "Actual crashes: $crashed_tests"
+echo "Success rate: $(awk "BEGIN {printf \"%.2f\", ($passed_tests/$total_tests)*100}")%"
+echo "Crash rate: $(awk "BEGIN {printf \"%.2f\", ($crashed_tests/$total_tests)*100}")%"
+echo ""
+echo "Results saved to:"
+echo "  - Full results: $RESULTS_FILE"
+echo "  - Actual crashes: $CRASH_FILE"
+echo "  - Unsupported configs: $UNSUPPORTED_FILE"
+echo "  - Successful runs: $SUCCESS_FILE"