mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
* docs(README): update readme with new build instructions
* feat(grouped_gemm): add support back for non persistent kernel
* refactor(grouped_gemm): simplify tensor creation
* refactor(grouped_gemm): Persistence is now a GemmConfig value for easier management
* chore(grouped_gemm): add print statements to ease debugging
* WIP(grouped_gemm): add grouped_gemm_preshuffle example and update CMake configuration
* fix(tile_gemm_traits): change default value of Preshuffle_ from 0 to false for clarity
* WIP(grouped_gemm): add dummy variables to compile the preshuffle pipelines
* chore(grouped_gemm): add print statements and variables to debug numerical error with preshuffle
* style: clang format work so far
* BUG!(grouped_gemm_kernel.hpp): figured out a potential bug causing numerical errors in preshuffle pipeline
* fix(grouped_gemm_kernel): add function in the kernel code to dynamically calculate tail_number resolving numerical errors
* refactor(gemm_preshuffle): make preshuffle pipeline v2 compatible with operator() calls from grouped gemm
* chore(grouped_gemm): add/remove debug comments and debug print statements
* feat(grouped_gemm): integrate preshuffle pipeline v2 into grouped gemm for all supported shapes
* chore(gemm_profile): add new argument combinations
* fix: branch cleanup, formatting, refactoring
* fix: branch cleanup, formatting, refactoring
* chore(changelog): update changelog to reflect new feature
* address review comments & nit
[ROCm/composable_kernel commit: e279e9420e]
113 lines
3.1 KiB
Bash
Executable File
113 lines
3.1 KiB
Bash
Executable File
#!/bin/bash
#
# Profile the weight-preshuffle GEMM example over a fixed list of (m, n, k)
# problem sizes and record the parsed results in a CSV file.
#
# The benchmark binary is addressed by a relative path, so run this script
# from the directory that contains ./bin. For every problem size the binary
# is invoked once; the pipeline name, time, TFlops, GB/s and verification
# result are parsed from its stdout, echoed to the terminal and appended to
# the CSV file named by OUTPUT_FILE.

# Catch typos in variable names. `set -e` is deliberately not used: the
# grep/sed extractions below legitimately return non-zero when a line is absent.
set -u

readonly BIN=./bin/tile_example_gemm_weight_preshuffle
readonly PREC=fp8
readonly VERBOSITY=2

# List of all (m, n, k) triplets
readonly ARGS_LIST=(
  "1 2048 5120"
  "1 5120 1024"
  "2 2048 5120"
  "2 5120 1024"
  "3 2048 5120"
  "3 5120 1024"
  "4 2048 5120"
  "4 5120 1024"
  "5 2048 5120"
  "5 5120 1024"
  "6 2048 5120"
  "6 5120 1024"
  "7 2048 5120"
  "7 5120 1024"
  "8 2048 5120"
  "8 5120 1024"
  "9 2048 5120"
  "9 5120 1024"
  "10 2048 5120"
  "10 5120 1024"
  "11 2048 5120"
  "11 5120 1024"
  "12 2048 5120"
  "12 5120 1024"
  "13 2048 5120"
  "13 5120 1024"
  "14 2048 5120"
  "14 5120 1024"
  "15 2048 5120"
  "15 5120 1024"
  "16 64 128"
  "16 64 256"
  "16 2048 5120"
  "16 5120 1024"
  "512 768 640"
  "1024 1792 896"
  "1536 2816 1152"
  "2048 5120 1024"
  "2048 5120 8192"
  "2048 7168 8192"
  "2048 8192 3584"
  "16384 7168 8192"
  "16384 8192 3584"
)

# Output file
readonly OUTPUT_FILE="gemm_profile_results.csv"

#######################################
# Print the first decimal number that directly precedes " <unit>" in a line.
# Arguments: $1 - text to search
#            $2 - unit label, e.g. "ms", "TFlops" or "GB/s"
# Outputs:   the number on stdout; nothing when no such value is present
# Returns:   0 in both cases
#######################################
extract_metric() {
  local line=$1 unit=$2
  local re='([0-9]+\.[0-9]+) '"$unit"
  if [[ "$line" =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
}

# Fail fast (and before truncating an existing results file) when the
# benchmark binary is missing, instead of recording one parse error per size.
if [[ ! -x "$BIN" ]]; then
  echo "ERROR: benchmark binary not found or not executable: $BIN" >&2
  exit 1
fi

# Output header
echo "m,n,k,Pipeline,Time_ms,TFlops,GBps,Verification" > "$OUTPUT_FILE"

failed=0

# Loop over each argument set
for args in "${ARGS_LIST[@]}"; do
  read -r m n k <<< "$args"

  echo "Testing: m=$m, n=$n, k=$k"

  # Only stdout is parsed; stderr is intentionally hidden to keep the terminal
  # log compact. The exit status is kept so a failed run can be reported.
  status=0
  output=$("$BIN" -m="$m" -n="$n" -k="$k" -prec="$PREC" -v="$VERBOSITY" 2>/dev/null) \
    || status=$?

  # Extract pipeline information
  # Format: "Launching kernel with args: gemm_fp8_pipeline_AGmemBGmemCRegV2_128x256x256x256_16x16x128_16x16_0x0x0"
  # Only the first match is kept so a repeated line cannot inject a newline
  # into the CSV row.
  pipeline=$(sed -n 's/.*Launching kernel with args: \(.*\)/\1/p' <<< "$output" | head -n 1)

  # Extract TFlops and GB/s from the output
  # Format: "Run Gemm kernel with M=3840 N=4096 K=2048 ... : 0.042338 ms, 1521.67 TFlops, 1126.89 GB/s,"
  perf_line=$(grep -m 1 "TFlops" <<< "$output")

  # Extract verification result
  # Format: "The GPU verification result is:correct" (note: no space after colon)
  verification=$(sed -n 's/.*The GPU verification result is:\(.*\)/\1/p' <<< "$output" | head -n 1)

  if [[ -n "$perf_line" ]]; then
    time_ms=$(extract_metric "$perf_line" "ms")
    tflops=$(extract_metric "$perf_line" "TFlops")
    gbps=$(extract_metric "$perf_line" "GB/s")

    # Use extracted pipeline or default if not found
    pipeline=${pipeline:-gemm_basic}

    # Print to terminal
    echo " Pipeline: $pipeline"
    echo " Time: ${time_ms} ms"
    echo " TFlops: ${tflops}"
    echo " GB/s: ${gbps}"
    echo " Verification: ${verification:-N/A}"

    # Save to CSV file
    echo "$m,$n,$k,$pipeline,$time_ms,$tflops,$gbps,$verification" >> "$OUTPUT_FILE"
  else
    failed=$((failed + 1))
    echo " ERROR: Could not parse performance data (exit status $status)"
    echo ""
    # Keep a row for the failed size so the CSV still lists every problem.
    echo "$m,$n,$k,$pipeline,,,,$verification" >> "$OUTPUT_FILE"
  fi
done

echo "=========================================="
echo "Profile completed!"
echo "Results saved to: $OUTPUT_FILE"
echo "Total tests run: ${#ARGS_LIST[@]}"
echo "Tests without performance data: $failed"
echo "=========================================="