[CK TILE ENGINE] GEMM Multi D Restructure (#3121)

* Renaming old code

* Adding GEMM code with new Architecture

* Partial Progress : Errors

* Partial Progress : Working code

* Changes to element wise function

* Removing Debugging statements

* Working GEMM Multi D code

* Removing Stale Code

* Address Copilot review comments

* Address Copilot review comments

* Changes to validation file

* Changes to common code snippets

* Creating common folder

* Removing duplicate files

* Pointing to right common file

* Pointing to right common file

* Pointing to right common file

* Changing to VERBOSE

* Changing CMAKE messages to verbose

* Updating CMake with right layout datatype configs

* Working code for GEMM Multi D
This commit is contained in:
Thrupti Raj Lakshmana Gowda
2025-10-31 14:02:46 -05:00
committed by GitHub
parent 04efd282cf
commit a33d98f8e2
22 changed files with 2443 additions and 2002 deletions

View File

@@ -122,15 +122,15 @@ function(build_individual_gemm_targets datatype layout)
if(DEFINED ENV{GEMM_CONFIG_FILE} AND NOT "$ENV{GEMM_CONFIG_FILE}" STREQUAL "")
set(config_filename "$ENV{GEMM_CONFIG_FILE}")
set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${config_filename}")
message(STATUS " Using config from environment variable: ${config_filename}")
message(VERBOSE " Using config from environment variable: ${config_filename}")
elseif(NOT "${GEMM_CONFIG_FILE}" STREQUAL "")
# Use CMake variable if set
set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/${GEMM_CONFIG_FILE}")
message(STATUS " Using custom config: ${GEMM_CONFIG_FILE}")
message(VERBOSE " Using custom config: ${GEMM_CONFIG_FILE}")
else()
# Use default config for all layouts
set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json")
message(STATUS " Using default config for layout ${layout}")
message(VERBOSE " Using default config for layout ${layout}")
endif()
# Check if config file exists
@@ -151,16 +151,16 @@ function(build_individual_gemm_targets datatype layout)
endif()
# Generate individual kernel files using parallel version
message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
message(STATUS " Working path: ${working_path}")
message(STATUS " Config file: ${json_blob}")
message(STATUS " Python executable: ${Python3_EXECUTABLE}")
message(STATUS " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")
message(VERBOSE "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...")
message(VERBOSE " Working path: ${working_path}")
message(VERBOSE " Config file: ${json_blob}")
message(VERBOSE " Python executable: ${Python3_EXECUTABLE}")
message(VERBOSE " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py")
# Create working directory first
file(MAKE_DIRECTORY ${working_path})
message(STATUS "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
message(VERBOSE "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
--working_path ${working_path}
--datatype ${datatype}
--layout ${layout}
@@ -169,7 +169,7 @@ function(build_individual_gemm_targets datatype layout)
--list_kernels ")
# First, just list the kernels (fast operation)
message(STATUS " Listing kernel configurations...")
message(VERBOSE " Listing kernel configurations...")
execute_process(
COMMAND ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py
--working_path ${working_path}
@@ -192,7 +192,7 @@ function(build_individual_gemm_targets datatype layout)
if(EXISTS ${working_path}/gemm_kernel_count.txt)
file(READ ${working_path}/gemm_kernel_count.txt kernel_count)
string(STRIP "${kernel_count}" kernel_count)
message(STATUS " Found ${kernel_count} kernel configurations")
message(VERBOSE " Found ${kernel_count} kernel configurations")
else()
message(FATAL_ERROR "Kernel count file not found")
endif()
@@ -216,10 +216,10 @@ function(build_individual_gemm_targets datatype layout)
endfunction()
# Main build logic - Only individual builds supported
message(STATUS "=== Starting Tile Engine GEMM Configuration ===")
message(STATUS "GEMM_DATATYPE: ${GEMM_DATATYPE}")
message(STATUS "GEMM_LAYOUT: ${GEMM_LAYOUT}")
message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
message(VERBOSE "=== Starting Tile Engine GEMM Configuration ===")
message(VERBOSE "GEMM_DATATYPE: ${GEMM_DATATYPE}")
message(VERBOSE "GEMM_LAYOUT: ${GEMM_LAYOUT}")
message(VERBOSE "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
# Filter GPU targets to only gfx90a, gfx942, gfx950, gfx1201
set(GEMM_GPU_TARGETS_INDIVIDUAL "")
@@ -228,7 +228,7 @@ set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201")
foreach(target IN LISTS SUPPORTED_GPU_TARGETS)
if(target IN_LIST DESIRED_TARGETS)
list(APPEND GEMM_GPU_TARGETS_INDIVIDUAL ${target})
message(STATUS " Adding GPU target: ${target}")
message(VERBOSE " Adding GPU target: ${target}")
endif()
endforeach()
@@ -236,7 +236,7 @@ endforeach()
if(NOT GEMM_GPU_TARGETS_INDIVIDUAL)
message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
else()
message(STATUS "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")
message(VERBOSE "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}")
# Enable parallel compilation optimizations
# Set up job pools for better parallel compilation control
@@ -251,12 +251,12 @@ else()
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM})
message(STATUS "Using ccache for faster compilation")
message(VERBOSE "Using ccache for faster compilation")
else()
message(WARNING "ccache requested but not found")
endif()
else()
message(STATUS "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)")
message(VERBOSE "ccache disabled for GEMM ops (use -DENABLE_CCACHE_GEMM=ON to enable)")
endif()
# Create master collection targets

View File

@@ -1,102 +0,0 @@
#!/bin/bash
# Test script for tile engine GEMM benchmarks
# This script demonstrates how to run the new individual benchmark executables
#
# Usage: ./<script> [build_directory]
# If no build directory is given, one is searched for automatically under the
# hardcoded path /root/workspace/composable_kernel — TODO confirm/parameterize.

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Find the build directory
if [ -z "$1" ]; then
    # Try to find build directory automatically
    BUILD_DIR=$(find /root/workspace/composable_kernel -name "test_gemm_fix" -type d 2>/dev/null | head -1)
    if [ -z "$BUILD_DIR" ]; then
        echo -e "${RED}Error: Could not find build directory. Please provide it as first argument.${NC}"
        echo "Usage: $0 <build_directory>"
        exit 1
    fi
else
    BUILD_DIR="$1"
fi

echo -e "${GREEN}Using build directory: $BUILD_DIR${NC}"

# Check if bin directory exists
if [ ! -d "$BUILD_DIR/bin" ]; then
    echo -e "${RED}Error: bin directory not found in $BUILD_DIR${NC}"
    exit 1
fi

# Find all benchmark executables
echo -e "${YELLOW}Finding benchmark executables...${NC}"
BENCHMARKS=$(find "$BUILD_DIR/bin" -name "benchmark_gemm_*" -type f 2>/dev/null)
if [ -z "$BENCHMARKS" ]; then
    echo -e "${RED}No benchmark executables found in $BUILD_DIR/bin${NC}"
    echo "Please build some benchmarks first with:"
    echo " cd $BUILD_DIR"
    echo " make benchmark_gemm_<kernel_name>"
    exit 1
fi

# Count benchmarks (one path per line in $BENCHMARKS)
NUM_BENCHMARKS=$(echo "$BENCHMARKS" | wc -l)
echo -e "${GREEN}Found $NUM_BENCHMARKS benchmark executable(s)${NC}"

# Test sizes (square problems: M = N = K)
SIZES=(512 1024 2048)

# Results file (timestamped so reruns never clobber earlier results)
RESULTS_FILE="benchmark_results_$(date +%Y%m%d_%H%M%S).csv"

echo -e "${YELLOW}Running benchmarks...${NC}"
echo "Results will be saved to: $RESULTS_FILE"

# Run each benchmark
COUNTER=0
for BENCH in $BENCHMARKS; do
    COUNTER=$((COUNTER + 1))
    BENCH_NAME=$(basename "$BENCH")
    echo -e "\n${GREEN}[$COUNTER/$NUM_BENCHMARKS] Running: $BENCH_NAME${NC}"
    for SIZE in "${SIZES[@]}"; do
        echo -e " Testing size: ${SIZE}x${SIZE}x${SIZE}"
        # Run with verification; filter output down to timing/result lines
        "$BENCH" -m=$SIZE -n=$SIZE -k=$SIZE -verify=2 -warmup=10 -repeat=20 \
            -csv_filename="$RESULTS_FILE" -csv_format=simple \
            2>&1 | grep -E "(Time:|Performance:|Verification:|Error)"
        # PIPESTATUS[0] is the benchmark's own exit code; $? would be grep's
        if [ ${PIPESTATUS[0]} -ne 0 ]; then
            echo -e " ${RED}Benchmark failed!${NC}"
        fi
    done
done

echo -e "\n${GREEN}Benchmark testing complete!${NC}"
echo "Results saved to: $RESULTS_FILE"

# Show summary if CSV file exists
if [ -f "$RESULTS_FILE" ]; then
    echo -e "\n${YELLOW}Summary of results:${NC}"
    echo "Number of tests: $(tail -n +2 "$RESULTS_FILE" | wc -l)"
    # NOTE(review): substring match — counts any line containing true/false,
    # including the header; confirm against the CSV schema
    echo "Successful tests: $(grep -c "true" "$RESULTS_FILE")"
    echo "Failed tests: $(grep -c "false" "$RESULTS_FILE")"
fi

# Example of running a specific benchmark with different options
echo -e "\n${YELLOW}Example commands for manual testing:${NC}"
echo "# Basic run:"
echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=1024 -n=1024 -k=1024"
echo ""
echo "# With CPU verification:"
echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=1024 -n=1024 -k=1024 -verify=1"
echo ""
echo "# JSON output for parsing:"
echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=1024 -n=1024 -k=1024 -json_output=true"
echo ""
echo "# Performance testing with TFLOPS metric:"
echo "$BUILD_DIR/bin/benchmark_gemm_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 -m=4096 -n=4096 -k=4096 -warmup=100 -repeat=200 -metric=1"

View File

@@ -1,141 +0,0 @@
#!/usr/bin/env python
"""
Test script to verify that the validation logic is working correctly.
"""
from validation_utils import (
is_tile_config_valid,
is_trait_combination_valid,
validate_warp_tile_combination,
)
def test_warp_tile_validation():
    """Exercise validate_warp_tile_combination against known fp16 cases."""
    print("Testing warp tile combination validation...")
    gpu_name = "gfx90a"
    # Each entry: ([warp_tile_m, warp_tile_n, warp_tile_k], expected_valid)
    cases = [
        ([4, 64, 8], False),  # not in the supported list
        ([4, 64, 16], True),
        ([32, 32, 8], True),
        ([16, 16, 16], True),
        ([32, 32, 16], True),
        ([16, 16, 32], True),
        ([64, 4, 16], True),
        ([128, 128, 128], False),  # too large
    ]
    print("\nTesting fp16 warp tile combinations:")
    for (warp_tile_m, warp_tile_n, warp_tile_k), expected in cases:
        valid, msg = validate_warp_tile_combination(
            warp_tile_m, warp_tile_n, warp_tile_k, "fp16", "fp16", "fp16", gpu_name
        )
        if valid == expected:
            status = "PASS"
        else:
            status = "FAIL"
        print(f" [{warp_tile_m}, {warp_tile_n}, {warp_tile_k}]: {valid} - {status}")
        if not valid and msg:
            print(f" Reason: {msg}")
def test_trait_combinations():
    """Exercise is_trait_combination_valid against the known allow/deny cases."""
    print("\n\nTesting trait combination validation...")
    # Each entry: (pipeline, epilogue, scheduler, expected_valid)
    cases = (
        ("mem", "default", "intrawave", True),
        ("mem", "cshuffle", "intrawave", True),
        ("compv3", "default", "interwave", False),  # blacklisted combination
        ("compv3", "cshuffle", "interwave", False),  # blacklisted combination
        ("compv4", "default", "interwave", False),  # blacklisted combination
        ("compv4", "cshuffle", "interwave", False),  # blacklisted combination
        ("compv3", "default", "intrawave", True),
        ("compv4", "cshuffle", "intrawave", True),
    )
    print("\nTesting trait combinations:")
    for pipeline, epilogue, scheduler, expected in cases:
        valid = is_trait_combination_valid(pipeline, epilogue, scheduler)
        status = "PASS" if valid == expected else "FAIL"
        print(f" {pipeline}-{epilogue}-{scheduler}: {valid} - {status}")
def test_full_tile_config_validation():
    """Test full tile configuration validation.

    Bug fix: the calls to is_tile_config_valid previously stopped at the
    ``pipeline`` argument, but its signature also requires ``layout`` and
    ``gpu_target`` — the original calls raised TypeError at runtime.
    """
    # Test case that was failing in the build
    tile_m, tile_n, tile_k = 256, 256, 32
    warp_m, warp_n, warp_k = 1, 4, 1
    warp_tile_m, warp_tile_n, warp_tile_k = 4, 64, 8
    print("\nTesting problematic configuration:")
    print(f" Tile: {tile_m}x{tile_n}x{tile_k}")
    print(f" Warp: {warp_m}x{warp_n}x{warp_k}")
    print(f" WarpTile: {warp_tile_m}x{warp_tile_n}x{warp_tile_k}")
    valid = is_tile_config_valid(
        tile_m,
        tile_n,
        tile_k,
        warp_m,
        warp_n,
        warp_k,
        warp_tile_m,
        warp_tile_n,
        warp_tile_k,
        "fp16",
        "fp16",
        "fp16",
        "mem",
        "rcr",  # layout — was missing; required positional parameter
        "gfx90a",  # gpu_target — was missing; matches the GPU used above
    )
    print(f" Valid: {valid}")
    print(" Expected: False (warp tile [4, 64, 8] is not supported for fp16)")
    # Test a valid configuration
    warp_tile_k = 16  # Change to valid value
    print("\nTesting corrected configuration:")
    print(f" WarpTile: {warp_tile_m}x{warp_tile_n}x{warp_tile_k}")
    valid = is_tile_config_valid(
        tile_m,
        tile_n,
        tile_k,
        warp_m,
        warp_n,
        warp_k,
        warp_tile_m,
        warp_tile_n,
        warp_tile_k,
        "fp16",
        "fp16",
        "fp16",
        "mem",
        "rcr",  # layout — was missing
        "gfx90a",  # gpu_target — was missing
    )
    print(f" Valid: {valid}")
    print(" Expected: True")
def main():
    """Run all tests"""
    banner = "=" * 60
    print(banner)
    print("GEMM Validation Test Suite")
    print(banner)
    # Run each test section in order
    for test in (
        test_warp_tile_validation,
        test_trait_combinations,
        test_full_tile_config_validation,
    ):
        test()
    print("\n" + banner)
    print("Test suite completed")
    print(banner)


if __name__ == "__main__":
    main()

View File

@@ -1,615 +0,0 @@
#!/usr/bin/env python
# SPDX-License-Identifier: MIT
# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
"""
Validation utilities for GEMM kernel generation.
Extracted from tile_engine_develop for consistency.
"""
import logging
from typing import Tuple, List
# Element size mapping for different data types
ELEMENT_SIZE_MAP = {
"fp16": 2,
"bf16": 2,
"int8": 1,
"fp8": 1,
"bf8": 1,
"int4": 0.5,
"int32": 4,
"fp32": 4,
"fp64": 8,
}
WARP_SUPPORTED_COMBINATIONS = {
"gfx90a": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1],
],
"gfx942": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1],
],
"gfx950": [
[1, 4, 1],
[2, 2, 1],
[4, 1, 1],
],
"gfx1201": [
[2, 4, 1],
[1, 8, 1],
[8, 1, 1],
[4, 2, 1],
],
}
# [TODO] Handle this while moving code to commons
# Supported warp tile combinations for different GPU architectures and data types
WARP_TILE_SUPPORTED_COMBINATIONS = {
"gfx90a": {
"fp16_fp16_fp16": [
[32, 32, 8],
[16, 16, 16],
[32, 32, 16],
[16, 16, 32],
[4, 64, 16],
[64, 4, 16],
],
"bf16_bf16_bf16": [
[32, 32, 8],
[16, 16, 16],
[32, 32, 16],
[16, 16, 32],
[4, 64, 16],
[64, 4, 16],
],
"fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32]],
"bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32]],
},
"gfx942": {
"fp16_fp16_fp16": [
[32, 32, 8],
[16, 16, 16],
[32, 32, 16],
[16, 16, 32],
[4, 64, 16],
[64, 4, 16],
],
"bf16_bf16_bf16": [
[32, 32, 8],
[16, 16, 16],
[32, 32, 16],
[16, 16, 32],
[4, 64, 16],
[64, 4, 16],
],
"fp8_fp8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 32], [16, 16, 64]],
"bf8_bf8_fp16": [[32, 32, 16], [32, 32, 32], [16, 16, 64], [16, 16, 32]],
"int8_int8_int32": [[16, 16, 32], [32, 32, 16]],
},
"gfx950": {
"fp16_fp16_fp16": [
[32, 32, 8],
[16, 16, 16],
[32, 32, 16],
[16, 16, 32],
[4, 64, 16],
[64, 4, 16],
],
"bf16_bf16_bf16": [
[32, 32, 8],
[16, 16, 16],
[32, 32, 16],
[16, 16, 32],
[4, 64, 16],
[64, 4, 16],
],
"fp8_fp8_fp16": [
[32, 32, 16],
[32, 32, 32],
[16, 16, 32],
[16, 16, 64],
[16, 16, 128],
[32, 32, 64],
],
"bf8_bf8_fp16": [
[32, 32, 16],
[32, 32, 32],
[16, 16, 64],
[16, 16, 32],
[16, 16, 128],
[32, 32, 64],
],
},
"gfx1201": {
"fp16_fp16_fp16": [
[16, 16, 16],
],
},
}
# NOTE: a byte-identical duplicate definition of WARP_SUPPORTED_COMBINATIONS
# previously lived here; it has been removed — the canonical table is defined
# once, earlier in this module.

# (pipeline, epilogue, scheduler) triples that are known NOT to work together.
TRAIT_UNSUPPORTED_COMBINATIONS = {
    ("compv3", "cshuffle", "interwave"),
    ("compv3", "default", "interwave"),
    ("compv4", "cshuffle", "interwave"),
    ("compv4", "default", "interwave"),
}
def element_size(data_type: str) -> float:
    """Return the size in bytes of one element of ``data_type`` (case-insensitive).

    Raises ValueError for types absent from ELEMENT_SIZE_MAP.
    """
    key = data_type.lower()
    if key in ELEMENT_SIZE_MAP:
        return ELEMENT_SIZE_MAP[key]
    raise ValueError(f"Unsupported data type: {key}")
def is_trait_combination_valid(pipeline: str, epilogue: str, scheduler: str) -> bool:
    """A trait triple is valid unless it is explicitly blacklisted."""
    combo = (pipeline, epilogue, scheduler)
    return combo not in TRAIT_UNSUPPORTED_COMBINATIONS
def validate_warp_configuration(
    warp_m: int,
    warp_n: int,
    warp_k: int,
    gpu_name: str,
) -> bool:
    """Check the [warp_m, warp_n, warp_k] split against the per-GPU allow-list."""
    allowed = WARP_SUPPORTED_COMBINATIONS.get(gpu_name, {})
    if not allowed:
        # Unknown GPU: stay permissive but leave a trace in the logs.
        logging.warning(f"No warp_[m/n/k] combinations found for GPU: {gpu_name}")
        return True
    return [warp_m, warp_n, warp_k] in allowed
def validate_dimension_alignment(
    tile_m: int,
    tile_n: int,
    tile_k: int,
    warp_m: int,
    warp_n: int,
    warp_k: int,
    warp_tile_m: int,
    warp_tile_n: int,
    warp_tile_k: int,
) -> Tuple[bool, List[str]]:
    """Verify each block-tile dimension divides evenly into warps x warp-tiles.

    Returns (ok, issues): ``issues`` holds one message per misaligned axis.
    """
    issues: List[str] = []
    checks = (
        ("tile_m", tile_m, warp_m, warp_tile_m),
        ("tile_n", tile_n, warp_n, warp_tile_n),
        ("tile_k", tile_k, warp_k, warp_tile_k),
    )
    for name, tile, warp, warp_tile in checks:
        rem = tile % (warp * warp_tile)
        if rem != 0:
            issues.append(f"{name}({tile}) % [{warp}x{warp_tile}] = {rem}")
    return not issues, issues
def validate_lds_capacity(
    tile_m: int,
    tile_n: int,
    tile_k: int,
    a_datatype: str,
    b_datatype: str,
    pipeline: str,
) -> Tuple[bool, str]:
    """Check that the A and B tiles fit together in LDS.

    Returns (ok, message); the message is empty when the tiles fit.
    """
    a_bytes = (tile_m * tile_k) * element_size(a_datatype)
    b_bytes = (tile_n * tile_k) * element_size(b_datatype)
    total = a_bytes + b_bytes
    # compv4 gets half the budget (32KB vs 64KB) — presumably because it
    # double-buffers the tiles; TODO confirm.
    limit = 2**15 if pipeline == "compv4" else 2**16
    if total <= limit:
        return True, ""
    return False, (
        f"LDS capacity exceeded: Total required {total:,}B ({total / 1024:.1f}KB) > "
        f"maximum allowed {limit:,}B ({limit / 1024}KB). Breakdown:\n"
        f"- Matrix A ({a_datatype}): {tile_m}x{tile_k} = {a_bytes:,}B\n"
        f"- Matrix B ({b_datatype}): {tile_n}x{tile_k} = {b_bytes:,}B"
    )
def validate_warp_tile_combination(
    warp_tile_m: int,
    warp_tile_n: int,
    warp_tile_k: int,
    a_datatype: str,
    b_datatype: str,
    c_datatype: str,
    gpu_name: str,
) -> Tuple[bool, str]:
    """Validate warp tile combination against GPU-specific supported combinations.

    Returns (ok, message); unknown GPUs and unknown datatype triples are
    treated permissively (True) with a logged warning/debug message.
    """
    combo = [warp_tile_m, warp_tile_n, warp_tile_k]
    dtype_key = f"{a_datatype}_{b_datatype}_{c_datatype}"
    per_gpu = WARP_TILE_SUPPORTED_COMBINATIONS.get(gpu_name, {})
    if not per_gpu:
        # GPU not in the table: be permissive but warn.
        logging.warning(f"No warp tile combinations found for GPU: {gpu_name}")
        return True, ""
    allowed = per_gpu.get(dtype_key, [])
    if not allowed:
        # Datatype triple not in the table: be permissive.
        logging.debug(
            f"No warp tile combinations found for data types: {dtype_key}"
        )
        return True, ""
    if combo in allowed:
        return True, ""
    return False, (
        f"Invalid warp tile combination: {combo} not in allowed list. "
        f"Valid combinations for '{dtype_key}' on {gpu_name}: {allowed}"
    )
def is_tile_config_valid(
    tile_m: int,
    tile_n: int,
    tile_k: int,
    warp_m: int,
    warp_n: int,
    warp_k: int,
    warp_tile_m: int,
    warp_tile_n: int,
    warp_tile_k: int,
    a_datatype: str,
    b_datatype: str,
    c_datatype: str,
    pipeline: str,
    layout: str,
    gpu_target: str,
    trait_name: str = None,
) -> bool:
    """
    Comprehensive tile configuration validation.
    Returns True if configuration is valid, False otherwise.

    Runs the checks in order and short-circuits on the first failure:
    positivity, warp tiles fitting inside the block tile, per-GPU warp split,
    dimension alignment, LDS capacity, whole-workgroup cover, and the per-GPU
    warp-tile allow-list. Failure reasons are emitted via logging.debug only.

    NOTE(review): ``trait_name`` is accepted but never used in this body.
    """
    # Basic sanity checks
    if tile_m <= 0 or tile_n <= 0 or tile_k <= 0:
        return False
    if warp_m <= 0 or warp_n <= 0 or warp_k <= 0:
        return False
    if warp_tile_m <= 0 or warp_tile_n <= 0 or warp_tile_k <= 0:
        return False
    # Check that warp tiles fit within block tiles
    if warp_m * warp_tile_m > tile_m:
        return False
    if warp_n * warp_tile_n > tile_n:
        return False
    if warp_k * warp_tile_k > tile_k:
        return False
    # Validate warp configuration against the per-GPU allow-list
    if not validate_warp_configuration(warp_m, warp_n, warp_k, gpu_target):
        logging.debug(
            f"Invalid warp configuration: warp_m({warp_m}), warp_n({warp_n}), warp_k({warp_k})"
        )
        return False
    # Validate dimension alignment (each tile dim divisible by warp x warp_tile)
    is_aligned, alignment_issues = validate_dimension_alignment(
        tile_m,
        tile_n,
        tile_k,
        warp_m,
        warp_n,
        warp_k,
        warp_tile_m,
        warp_tile_n,
        warp_tile_k,
    )
    if not is_aligned:
        logging.debug(
            f"Dimension alignment failed: {', '.join(alignment_issues)}. "
            f"Tile dimensions {tile_m}x{tile_n}x{tile_k} must be divisible by "
            f"[warp]: {warp_m}x{warp_n}x{warp_k} x [warp_tile]: {warp_tile_m}x{warp_tile_n}x{warp_tile_k}"
        )
        return False
    # Validate LDS capacity (A + B tiles must fit)
    lds_valid, lds_error = validate_lds_capacity(
        tile_m, tile_n, tile_k, a_datatype, b_datatype, pipeline
    )
    if not lds_valid:
        logging.debug(f"LDS validation failed: {lds_error}")
        return False
    # Validate whole workgroup cover configuration
    wr_cover_valid, wg_cover_error = validate_whole_wg_cover_configuration(
        tile_m,
        tile_n,
        tile_k,
        warp_m,
        warp_n,
        warp_k,
        layout,
        a_datatype,
        b_datatype,
    )
    if not wr_cover_valid:
        logging.debug(
            f"Whole workgroup cover configuration validation failed: {wg_cover_error}"
        )
        return False
    # Validate warp tile combination against the per-GPU/per-dtype allow-list
    warp_tile_valid, warp_tile_error = validate_warp_tile_combination(
        warp_tile_m,
        warp_tile_n,
        warp_tile_k,
        a_datatype,
        b_datatype,
        c_datatype,
        gpu_target,
    )
    if not warp_tile_valid:
        logging.debug(f"Warp tile validation failed: {warp_tile_error}")
        return False
    return True
# [TODO] Handle this while moving code to commons Add more datatype to this function if needed
def get_dtype_string(datatype: str) -> str:
    """Map a short datatype name to its C++ ck_tile type string ("float" fallback)."""
    return {
        "fp16": "ck_tile::fp16_t",
        "fp8": "ck_tile::fp8_t",
        "bf8": "ck_tile::bf8_t",
        "bf16": "ck_tile::bf16_t",
        "fp32": "float",
        "fp64": "double",
    }.get(datatype, "float")
# Single-character layout codes mapped to their ck_tile layout type strings.
LAYOUT_MAP = {
    "r": "ck_tile::tensor_layout::gemm::RowMajor",
    "c": "ck_tile::tensor_layout::gemm::ColumnMajor",
}


def get_abc_layouts(layout_code: str) -> Tuple[str, str, str]:
    """
    Return (ALayout, BLayout, CLayout) from a 3-letter code like 'rcr', 'ccr', 'crr', 'rrr'.
    """
    code = str(layout_code).strip().lower()
    # One character per operand, in A, B, C order.
    return LAYOUT_MAP[code[0]], LAYOUT_MAP[code[1]], LAYOUT_MAP[code[2]]
def validate_whole_wg_cover_configuration(
    tile_m,
    tile_n,
    tile_k,
    warp_m,
    warp_n,
    warp_k,
    layout,
    a_datatype,
    b_datatype,
) -> Tuple[bool, str]:
    """Check that the whole workgroup can cover the A and B tile loads.

    Fixes: removed leftover debug ``print("I am here N")`` statements and
    corrected the B-matrix failure log that was labelled "Matrix A".
    NOTE(review): branch structure reconstructed from a diff view with
    stripped indentation — confirm against the original file.
    """
    warp_size = 64
    NumWarps = warp_m * warp_n * warp_k
    BlockSize = NumWarps * warp_size
    XPerTile = 0
    YPerTile = 0
    vector_load_size = 0
    # A matrix validation
    if layout[0] == "r":
        XPerTile = tile_k
        YPerTile = tile_m
        vector_load_size = get_global_vector_load_size(
            BlockSize, tile_k, a_datatype, tile_m, tile_k
        )
    elif layout[0] == "c":
        vector_load_size = get_global_vector_load_size(
            BlockSize, tile_k, a_datatype, tile_m, tile_m
        )
        # Validate distribution
        XPerTile = tile_k
        YPerTile = tile_m
        wg_cover_core_valid, wg_cover_core_error = wg_cover_core_validation(
            XPerTile, YPerTile, BlockSize, vector_load_size, warp_size
        )
        if not wg_cover_core_valid:
            logging.debug(
                f"whole workgroup cover failed for Matrix A distribution: {wg_cover_core_error}"
            )
            return False, wg_cover_core_error
        XPerTile = tile_m
        YPerTile = tile_k
    # Common A-matrix cover check
    wg_cover_core_valid, wg_cover_core_error = wg_cover_core_validation(
        XPerTile, YPerTile, BlockSize, vector_load_size, warp_size
    )
    if not wg_cover_core_valid:
        logging.debug(
            f"whole workgroup cover failed for Matrix A: {wg_cover_core_error}"
        )
        return False, wg_cover_core_error
    # B matrix validation
    if layout[1] == "r":
        vector_load_size = get_global_vector_load_size(
            BlockSize, tile_k, b_datatype, tile_n, tile_n
        )
        # Validate distribution
        XPerTile = tile_k
        YPerTile = tile_n
        wg_cover_core_valid, wg_cover_core_error = wg_cover_core_validation(
            XPerTile, YPerTile, BlockSize, vector_load_size, warp_size
        )
        if not wg_cover_core_valid:
            # Fixed: this log previously said "Matrix A distribution"
            logging.debug(
                f"whole workgroup cover failed for Matrix B distribution: {wg_cover_core_error}"
            )
            return False, wg_cover_core_error
        XPerTile = tile_n
        YPerTile = tile_k
    elif layout[1] == "c":
        XPerTile = tile_k
        YPerTile = tile_n
        vector_load_size = get_global_vector_load_size(
            BlockSize, tile_k, b_datatype, tile_n, tile_k
        )
    # Common B-matrix cover check
    wg_cover_core_valid, wg_cover_core_error = wg_cover_core_validation(
        XPerTile, YPerTile, BlockSize, vector_load_size, warp_size
    )
    if not wg_cover_core_valid:
        logging.debug(
            f"whole workgroup cover failed for Matrix B: {wg_cover_core_error}"
        )
        return False, wg_cover_core_error
    return True, ""
def wg_cover_core_validation(
    XPerTile: int,
    YPerTile: int,
    BlockSize: int,
    vector_load_size: int,
    warp_size: int,
) -> Tuple[bool, str]:
    """Core check that one tile face can be distributed over the workgroup.

    Returns (ok, message). Bug fix: the vector-size check previously did a
    bare ``return False`` — a single bool from a function whose callers
    unpack a (bool, str) tuple, raising TypeError on that path.
    """
    if XPerTile % vector_load_size != 0:
        return False, (
            f"XPerTile({XPerTile}) is not divisible by vector_load_size({vector_load_size})"
        )
    num_warps = BlockSize / warp_size
    # Largest vector a thread could own if the tile is spread evenly.
    LargestVec = (XPerTile * YPerTile) / (num_warps * warp_size)
    X1 = LargestVec if vector_load_size > LargestVec else vector_load_size
    X0 = XPerTile / X1
    Y1 = warp_size // X0
    # The X0 x Y1 lane grid must use the whole warp exactly.
    if X0 * Y1 != warp_size:
        return False, ""
    return True, ""
def get_global_vector_load_size(
    BlockSize: int,
    KPerBlock: int,
    DataType: str,
    MNPerBlock: int,
    XPerTile: int,
) -> int:
    """Pick the widest global vector load (in elements) that divides both the
    per-thread element count and XPerTile; falls back to PackedSize (1).

    Fix: the 32-byte branch previously returned a raw float (missing int()),
    inconsistent with every other branch. With PackedSize hardcoded to 1 the
    ``PackedSize == 2`` condition also makes that branch unreachable —
    NOTE(review): confirm whether PackedSize was meant to vary by datatype.
    """
    elements_per_thread = MNPerBlock * KPerBlock / BlockSize
    PackedSize = 1
    if (
        XPerTile % (PackedSize * 32 / element_size(DataType)) == 0
        and elements_per_thread % (PackedSize * 32 / element_size(DataType)) == 0
        and PackedSize == 2
    ):
        # int() added for consistency with the branches below
        return int(PackedSize * 32 / element_size(DataType))
    elif (
        XPerTile % (PackedSize * 16 / element_size(DataType)) == 0
        and elements_per_thread % (PackedSize * 16 / element_size(DataType)) == 0
    ):
        return int(PackedSize * 16 / element_size(DataType))
    elif (
        XPerTile % (PackedSize * 8 / element_size(DataType)) == 0
        and elements_per_thread % (PackedSize * 8 / element_size(DataType)) == 0
    ):
        return int(PackedSize * 8 / element_size(DataType))
    elif (
        element_size(DataType) >= PackedSize * 4
        and XPerTile % (PackedSize * 4 / element_size(DataType)) == 0
        and elements_per_thread % (PackedSize * 4 / element_size(DataType)) == 0
    ):
        return int(PackedSize * 4 / element_size(DataType))
    elif (
        element_size(DataType) >= PackedSize * 2
        and XPerTile % (PackedSize * 2 / element_size(DataType)) == 0
        and elements_per_thread % (PackedSize * 2 / element_size(DataType)) == 0
    ):
        return int(PackedSize * 2 / element_size(DataType))
    else:
        return PackedSize

View File

@@ -8,12 +8,30 @@ import multiprocessing
import concurrent.futures
from pathlib import Path
import logging
from commons.validation_utils import (
is_tile_config_valid,
is_trait_combination_valid,
get_dtype_string,
get_abc_layouts,
)
import importlib.util
def _import_validation_utils():
    """Import validation utilities from commons directory.

    Loads ``commons/validation_utils.py`` (one directory above this script)
    directly from its file path via importlib, so it resolves regardless of
    how sys.path is set up when this script runs.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parent_dir = os.path.dirname(current_dir)
    # Load the module dynamically
    spec = importlib.util.spec_from_file_location(
        "validation_utils", os.path.join(parent_dir, "commons", "validation_utils.py")
    )
    validation_utils = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(validation_utils)
    return validation_utils
# Import validation functions
_validation_utils = _import_validation_utils()
is_tile_config_valid = _validation_utils.is_tile_config_valid
is_trait_combination_valid = _validation_utils.is_trait_combination_valid
get_dtype_string = _validation_utils.get_dtype_string
get_abc_layouts = _validation_utils.get_abc_layouts
logging.basicConfig(level=logging.INFO)
@@ -563,6 +581,8 @@ struct SelectedKernel {{
tile_configs = self._get_tile_configs()
trait_combos = self._generate_trait_combinations()
k_block_per_cu = self.config.get("k_block_per_cu")
if k_block_per_cu is None:
k_block_per_cu = 1
# Prepare work items for parallel processing
work_items = []
@@ -574,11 +594,12 @@ struct SelectedKernel {{
trait_combo,
k_block_per_cu,
self.working_path,
self.gpu_target,
self.datatype,
self.layout,
self.config_json,
)
)
print(
f"Generating {len(work_items)} individual kernel files using {num_workers} workers..."
)
@@ -615,7 +636,6 @@ struct SelectedKernel {{
print(
f" Progress: {completed}/{len(work_items)} kernels generated"
)
try:
result = future.result()
if result:
@@ -662,10 +682,19 @@ struct SelectedKernel {{
def _generate_single_kernel_individual(work_item):
"""Worker function to generate a single individual kernel file"""
tile_config, trait_combo, k_block_per_cu, working_path, datatype, layout = work_item
(
tile_config,
trait_combo,
k_block_per_cu,
working_path,
gpu_target,
datatype,
layout,
config_json,
) = work_item
# Create a temporary builder instance for this worker
builder = GemmKernelBuilder(working_path, datatype, layout)
builder = GemmKernelBuilder(working_path, gpu_target, datatype, layout, config_json)
try:
kernel_name, instance_code = builder._generate_kernel_instance(
@@ -798,6 +827,8 @@ def main():
)
k_block_per_cu = builder.config.get("k_block_per_cu")
if k_block_per_cu is None:
k_block_per_cu = 1
# Generate the kernel
kernel_name, instance_code = builder._generate_kernel_instance(