composable_kernel/dispatcher/heuristics/collect_additional.sh

#!/bin/bash
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

# Generate additional benchmark data for shapes NOT in the original log.
# Intended to be left running in the background: each kernel's JSON block is
# appended to the log as it is produced, in a form data_pipeline.py can parse.

# Tunables. Each one may be overridden from the environment; when unset or
# empty, the defaults below are used.
#   BIN_DIR - directory searched for the benchmark executables
#   OUT_LOG - log file the benchmark results are written to
#   WARMUP  - value passed to every benchmark as -warmup=
#   REPEAT  - value passed to every benchmark as -repeat=
BIN_DIR="${BIN_DIR:-/workspace/ck_tile/bin}"
OUT_LOG="${OUT_LOG:-data/additional_shapes.log}"
WARMUP="${WARMUP:-3}"
REPEAT="${REPEAT:-10}"

# Create the directory that will hold OUT_LOG. It is derived from OUT_LOG so
# that changing the log path can never leave the script writing into a
# directory that does not exist; without it no result could be recorded, so
# a failure here is fatal.
mkdir -p -- "$(dirname -- "$OUT_LOG")" || {
    echo "error: cannot create output directory for $OUT_LOG" >&2
    exit 1
}

# Extra problem sizes to benchmark, one "M,N,K" triple per entry: square
# powers of two plus common ML sizes that are not in the original DeepSeek set.
# Entries are processed in the order listed.
SHAPES=(
    # Square (M = N = K), powers of two from 64 up to 4096
    "64,64,64" "128,128,128" "256,256,256" "512,512,512"
    "1024,1024,1024" "2048,2048,2048" "4096,4096,4096"

    # N = K = 4096 with M in {1, 8, 32, 128}
    "1,4096,4096" "8,4096,4096" "32,4096,4096" "128,4096,4096"

    # M in {1, 32} against larger N x K
    "1,4096,11008" "32,4096,11008"
    "1,8192,8192" "32,8192,8192"
    "1,8192,28672" "32,8192,28672"

    # Mixed aspect ratios
    "256,256,8192" "8192,8192,256"
    "1024,4096,1024" "4096,1024,4096" "2048,8192,2048"
)

# Start a fresh log (truncating any previous contents) with the fixed
# preamble: title, GPU ID, implementation name, then one blank line.
printf '%s\n' \
    "CK Tile Additional Shapes Benchmark" \
    "GPU ID: 0" \
    "Implementation: gemm_universal" \
    "" > "$OUT_LOG"

#######################################
# Run every fp8/rcr gemm_universal benchmark binary found in BIN_DIR for one
# problem shape and append the results to OUT_LOG.
# Globals:
#   BIN_DIR, OUT_LOG, WARMUP, REPEAT (read)
#   KERNEL_COUNT (written): number of kernel binaries actually run
# Arguments:
#   $1 - shape number printed in the banner
#   $2, $3, $4 - M, N, K, passed to each binary as -m=, -n=, -k=
# Outputs:
#   Appends to OUT_LOG: the shape banner, the JSON block printed by each
#   kernel, and a "Found N kernels" trailer. Warnings go to stderr.
#######################################
benchmark_shape() {
    local idx=$1 m=$2 n=$3 k=$4
    local exe output status

    KERNEL_COUNT=0
    {
        echo "========================================"
        echo "Shape $idx: M=$m N=$n K=$k dtype=fp8 layout=rcr"
        echo "========================================"

        for exe in "$BIN_DIR"/benchmark_gemm_universal_fp8_rcr_*; do
            # A glob that matches nothing is left as the literal pattern.
            # Skip it (and anything else that is not a runnable file) so it
            # is neither executed nor counted as a kernel.
            if [[ ! -f "$exe" || ! -x "$exe" ]]; then
                if [[ -e "$exe" ]]; then
                    echo "warning: skipping non-executable $exe" >&2
                fi
                continue
            fi
            KERNEL_COUNT=$((KERNEL_COUNT + 1))

            # The binary's own stderr is discarded; its exit status is
            # checked so a failing kernel is still reported.
            status=0
            output=$("$exe" -m="$m" -n="$n" -k="$k" \
                -warmup="$WARMUP" -repeat="$REPEAT" -verify=0 2>/dev/null) || status=$?
            if (( status != 0 )); then
                echo "warning: $exe exited with status $status for M=$m N=$n K=$k" >&2
            fi

            # Keep only the JSON block(s): lines from one containing "{"
            # through the next line that begins with "}".
            sed -n '/{/,/^}/p' <<< "$output"
        done

        echo "Found $KERNEL_COUNT kernels"
    } >> "$OUT_LOG"

    if (( KERNEL_COUNT == 0 )); then
        echo "warning: no executable kernels matched $BIN_DIR/benchmark_gemm_universal_fp8_rcr_*" >&2
    fi
}

# Benchmark each configured shape in order, reporting progress on stderr.
SHAPE_IDX=0
for SHAPE in "${SHAPES[@]}"; do
    # Each entry is an "M,N,K" triple.
    IFS=',' read -r M N K <<< "$SHAPE"
    SHAPE_IDX=$((SHAPE_IDX + 1))

    benchmark_shape "$SHAPE_IDX" "$M" "$N" "$K"

    echo "Completed shape $SHAPE_IDX: M=$M N=$N K=$K ($KERNEL_COUNT kernels)" >&2
done

echo "Done generating additional data" >&2