#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
"""
Feature engineering for grouped convolution kernel performance prediction.

Extends the FeatureEngine interface to support grouped convolution operations.
Follows the same pattern as GEMM: hardware parameters are read from the data
(hw_* columns) with fallback defaults for gfx950.
"""

import math

import numpy as np
import pandas as pd

from feature_engine import FeatureEngine, DTYPE_BYTES, PIPELINE_MAP


class GroupedConvFeatureEngine(FeatureEngine):
    """Feature engine for grouped_conv kernels.

    Hardware parameters are initialized from defaults but can be overridden
    by reading from data columns (hw_num_cus, hw_max_clock_mhz, etc.)
    """

    def __init__(
        self,
        num_cus: int = 256,  # gfx950 default
        lds_capacity: int = 65536,
        max_clock_mhz: int = 2400,
        simds_per_cu: int = 4,
        shader_engines: int = 32,
        max_waves_per_cu: int = 32,
        wavefront_size: int = 64,
        l1_cache_kb: int = 32,
        l2_cache_kb: int = 4096,
        l3_cache_kb: int = 262144,
        num_xcd: int = 8,
    ):
        self._hw = {
            "num_cus": num_cus,
            "lds_capacity": lds_capacity,
            "max_clock_mhz": max_clock_mhz,
            "simds_per_cu": simds_per_cu,
            "shader_engines": shader_engines,
            "max_waves_per_cu": max_waves_per_cu,
            "wavefront_size": wavefront_size,
            "l1_cache_kb": l1_cache_kb,
            "l2_cache_kb": l2_cache_kb,
            "l3_cache_kb": l3_cache_kb,
            "num_xcd": num_xcd,
            "total_simds": num_cus * simds_per_cu,
        }

    def get_feature_names(self) -> list[str]:
        return [
            # Problem features (30 -> 38 with 3D support -> 46 with Tier-1 additions)
            "N",
            "C",
            "K",
            "G",
            "Hi",
            "Wi",
            "Y",
            "X",
            "stride_h",
            "stride_w",
            "pad_h",
            "pad_w",
            "Ho",
            "Wo",  # Computed output dimensions
            "log2_N",
            "log2_C",
            "log2_K",
            "log2_G",
            "log2_Hi",
            "log2_Wi",
            "log2_spatial",  # log2(Hi * Wi) for 2D, log2(Di * Hi * Wi) for 3D
            "log2_filter",  # log2(Y * X) for 2D, log2(Z * Y * X) for 3D
            "log2_output",  # log2(Ho * Wo) for 2D, log2(Do * Ho * Wo) for 3D
            "arithmetic_intensity",
            "filter_area",  # Y * X for 2D, Z * Y * X for 3D
            "is_1x1_conv",
            "is_3x3_conv",
            "channels_per_group",  # C / G
            "aspect_ratio_hw",  # Hi / Wi
            "aspect_ratio_filter",  # Y / X
            # 3D-specific features (8 new)
            "is_3d",  # 1.0 if 3D conv, 0.0 if 2D
            "Di",  # Depth input (1 for 2D)
            "Z",  # Filter depth (1 for 2D)
            "Do",  # Depth output (1 for 2D)
            "stride_d",  # Depth stride (1 for 2D)
            "pad_d",  # Depth padding (0 for 2D)
            "dilation_h",  # Height dilation
            "dilation_w",  # Width dilation
            # Tier-1 Group-specific features (8)
            "log2_channels_per_group",
            "log2_output_channels_per_group",
            "is_depthwise",
            "group_density",
            "is_small_group",
            "channels_product_per_group",
            "batch_group_product",
            "is_small_batch_grouped",
            # Kernel features (15 -> 21 with suffix-aware additions)
            "block_size",
            "gemm_m_per_block",
            "gemm_n_per_block",
            "pipeline",
            "num_warps",  # Estimated from block_size
            "tile_volume",  # gemm_m_per_block * gemm_n_per_block * block_size
            "tile_mn",  # gemm_m_per_block * gemm_n_per_block
            "lds_usage_estimate",
            "lds_usage_ratio",
            "block_tile_ratio_m",  # gemm_m_per_block / block_size
            "block_tile_ratio_n",  # gemm_n_per_block / block_size
            "block_efficiency",  # Degree to which the block tile is square-like
            "is_compv3",
            "is_compv4",
            "is_compv5",
            # Suffix-aware kernel features (6 new)
            "is_intrawave",  # 1.0 if wave_mode == "intrawave", 0.0 if "interwave"
            "has_dsb",  # 1.0 if double smem buffer suffix present
            "has_si",  # 1.0 if store-immediate suffix present
            "is_basic",  # 1.0 if pipeline starts with "basic_v"
            "is_compv6",  # 1.0 if pipeline == "compv6"
            "is_mem",  # 1.0 if pipeline == "mem"
            # Interaction features (18)
            "gemm_m_output",  # Effective GEMM M: N * Ho * Wo (N * Do * Ho * Wo for 3D)
            "gemm_n_output",  # Effective GEMM N: K
            "gemm_k_output",  # Effective GEMM K: (C/G) * Y * X ((C/G) * Z * Y * X for 3D)
            "num_tiles_m",
            "num_tiles_n",
            "num_tiles_k",
            "total_output_tiles",
            "tile_eff_m",
            "tile_eff_n",
            "tile_eff_k",
            "overall_tile_efficiency",
            "cu_utilization",
            "ratio_gemm_m_to_tile_m",
            "ratio_gemm_n_to_tile_n",
            "ratio_gemm_k_to_tile_k",
            "problem_smaller_than_tile_m",
            "problem_smaller_than_tile_n",
            "problem_smaller_than_tile_k",
            # Hardware features (12)
            "hw_num_cus",
            "hw_simds_per_cu",
            "hw_total_simds",
            "hw_shader_engines",
            "hw_max_clock_mhz",
            "hw_max_waves_per_cu",
            "hw_wavefront_size",
            "hw_lds_capacity",
            "hw_l1_cache_kb",
            "hw_l2_cache_kb",
            "hw_l3_cache_kb",
            "hw_num_xcd",
        ]

    def get_categorical_features(self) -> list[str]:
        return ["pipeline"]
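    # Hypothetical sketch (not part of the original interface): the class docstring
    # says hardware parameters can be overridden from hw_* data columns; one way to
    # wire that up is to copy matching entries from a data row into self._hw. The
    # method name and column handling here are assumptions, not the shipped code.
    def apply_hw_overrides(self, row: dict) -> None:
        """Override hardware defaults from hw_* entries in a data row (sketch)."""
        for key in self._hw:
            if key == "total_simds":
                continue  # derived below, not read directly
            col = f"hw_{key}"
            if col in row and row[col] is not None:
                self._hw[key] = row[col]
        # Keep the derived value consistent with any overrides.
        self._hw["total_simds"] = self._hw["num_cus"] * self._hw["simds_per_cu"]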
    def extract(self, problem: dict, kernel: dict) -> np.ndarray:
        # Problem features - 2D and 3D
        N = int(problem.get("N", 1))
        C = int(problem.get("C", 64))
        K = int(problem.get("K", 64))
        G = int(problem.get("G", 1))
        Hi = int(problem.get("Hi", 32))
        Wi = int(problem.get("Wi", 32))
        Di = int(problem.get("Di", 1))  # 3D support
        Y = int(problem.get("Y", 1))
        X = int(problem.get("X", 1))
        Z = int(problem.get("Z", 1))  # 3D support
        stride_h = int(problem.get("stride_h", 1))
        stride_w = int(problem.get("stride_w", 1))
        stride_d = int(problem.get("stride_d", 1))  # 3D support
        pad_h = int(problem.get("pad_h", 0))
        pad_w = int(problem.get("pad_w", 0))
        pad_d = int(problem.get("pad_d", 0))  # 3D support
        dilation_h = int(problem.get("dilation_h", 1))
        dilation_w = int(problem.get("dilation_w", 1))
        dilation_d = int(problem.get("dilation_d", 1))  # 3D support

        # Determine if 3D convolution
        is_3d = float(Di > 1 or Z > 1 or pad_d > 0)

        # Compute output dimensions (match GroupedConvProblem.Ho/Wo/Do formula)
        eff_y = (Y - 1) * dilation_h + 1
        eff_x = (X - 1) * dilation_w + 1
        eff_z = (Z - 1) * dilation_d + 1
        Ho = (Hi + 2 * pad_h - eff_y) // stride_h + 1
        Wo = (Wi + 2 * pad_w - eff_x) // stride_w + 1
        Do = (Di + 2 * pad_d - eff_z) // stride_d + 1 if is_3d else 1

        # Log features (adjusted for 3D)
        log2_N = math.log2(max(N, 1))
        log2_C = math.log2(max(C, 1))
        log2_K = math.log2(max(K, 1))
        log2_G = math.log2(max(G, 1))
        log2_Hi = math.log2(max(Hi, 1))
        log2_Wi = math.log2(max(Wi, 1))

        # For 3D: spatial includes depth dimension
        spatial_volume = Di * Hi * Wi if is_3d else Hi * Wi
        filter_volume = Z * Y * X if is_3d else Y * X
        output_volume = Do * Ho * Wo if is_3d else Ho * Wo
        log2_spatial = math.log2(max(spatial_volume, 1))
        log2_filter = math.log2(max(filter_volume, 1))
        log2_output = math.log2(max(output_volume, 1))

        # Arithmetic intensity (FLOPs / bytes) - adjusted for 3D
        dtype = str(problem.get("dtype", "bf16"))
        bpe = DTYPE_BYTES.get(dtype, 2.0)
        # FLOPs: N * K * output_volume * (C/G) * filter_volume * 2 (MAC)
        flops = N * K * output_volume * (C / max(G, 1)) * filter_volume * 2
        # Bytes: input + filter + output (adjusted for 3D)
        input_bytes = N * C * spatial_volume * bpe
        filter_bytes = K * (C / max(G, 1)) * filter_volume * bpe
        output_bytes = N * K * output_volume * bpe
        bytes_transferred = input_bytes + filter_bytes + output_bytes
        ai = flops / max(bytes_transferred, 1)

        # Derived problem features (adjusted for 3D)
        filter_area = filter_volume  # Y * X for 2D, Z * Y * X for 3D
        is_1x1_conv = float(Y == 1 and X == 1 and Z == 1)
        is_3x3_conv = (
            float(Y == 3 and X == 3 and Z == 3) if is_3d else float(Y == 3 and X == 3)
        )
        channels_per_group = C / max(G, 1)
        aspect_ratio_hw = Hi / max(Wi, 1)
        aspect_ratio_filter = Y / max(X, 1)

        # Tier-1 Group-specific features (8)
        output_channels_per_group = K / max(G, 1)
        log2_channels_per_group = math.log2(max(channels_per_group, 1))
        log2_output_channels_per_group = math.log2(max(output_channels_per_group, 1))
        is_depthwise = float(G == C and G == K)
        group_density = G / max(C, 1)
        is_small_group = float(
            channels_per_group < 16 or output_channels_per_group < 16
        )
        channels_product_per_group = channels_per_group * output_channels_per_group
        batch_group_product = N * G
        is_small_batch_grouped = float(N < 8 and G > 1)
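        # Illustrative example (made-up shape, not from the data): a depthwise 3x3
        # conv with N=4 and C=K=G=64 gives channels_per_group = 1,
        # output_channels_per_group = 1, is_depthwise = 1.0, group_density = 1.0,
        # is_small_group = 1.0, channels_product_per_group = 1,
        # batch_group_product = 256, and is_small_batch_grouped = 1.0 (N < 8, G > 1).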
        # Kernel features
        block_size = int(kernel.get("block_size", 16))
        gemm_m_per_block = int(kernel.get("gemm_m_per_block", 64))
        gemm_n_per_block = int(kernel.get("gemm_n_per_block", 64))
        pipeline_str = str(kernel.get("pipeline", "compv3"))
        pipeline_code = PIPELINE_MAP.get(pipeline_str, 0)

        # Estimate warps (assuming 256 thread block)
        num_warps = block_size / 4.0
        tile_volume = gemm_m_per_block * gemm_n_per_block * block_size
        tile_mn = gemm_m_per_block * gemm_n_per_block

        # LDS usage estimate
        lds_est = (gemm_m_per_block * block_size + gemm_n_per_block * block_size) * bpe
        lds_cap = self._hw["lds_capacity"]
        if pipeline_str.startswith("compv4"):
            lds_cap = 32768
        lds_ratio = lds_est / max(lds_cap, 1)

        # Kernel derived features
        block_tile_ratio_m = gemm_m_per_block / max(block_size, 1)
        block_tile_ratio_n = gemm_n_per_block / max(block_size, 1)
        block_efficiency = min(gemm_m_per_block, gemm_n_per_block) / max(
            gemm_m_per_block, gemm_n_per_block, 1
        )
        is_compv3 = float(pipeline_str == "compv3")
        is_compv4 = float(pipeline_str == "compv4")
        is_compv5 = float(pipeline_str == "compv5")

        # Suffix-aware kernel features (6 new)
        wave_mode_str = str(kernel.get("wave_mode", "intrawave"))
        is_intrawave = float(wave_mode_str == "intrawave")
        has_dsb = float(int(kernel.get("has_dsb", 0)))
        has_si = float(int(kernel.get("has_si", 0)))
        is_basic = float(pipeline_str.startswith("basic_v"))
        is_compv6 = float(pipeline_str == "compv6")
        is_mem = float(pipeline_str == "mem")

        # Interaction features - Map conv to GEMM dimensions (adjusted for 3D)
        # GEMM M: N * output_volume (N * Do * Ho * Wo for 3D, N * Ho * Wo for 2D)
        # GEMM N: K (output channels)
        # GEMM K: (C/G) * filter_volume ((C/G) * Z * Y * X for 3D, (C/G) * Y * X for 2D)
        gemm_m = N * output_volume
        gemm_n = K
        gemm_k = int(channels_per_group * filter_volume)

        num_tiles_m = math.ceil(gemm_m / max(gemm_m_per_block, 1))
        num_tiles_n = math.ceil(gemm_n / max(gemm_n_per_block, 1))
        num_tiles_k = math.ceil(gemm_k / max(block_size, 1))
        total_output_tiles = num_tiles_m * num_tiles_n

        rem_m = gemm_m % gemm_m_per_block if gemm_m_per_block > 0 else 0
        tile_eff_m = rem_m / gemm_m_per_block if rem_m > 0 else 1.0
        rem_n = gemm_n % gemm_n_per_block if gemm_n_per_block > 0 else 0
        tile_eff_n = rem_n / gemm_n_per_block if rem_n > 0 else 1.0
        rem_k = gemm_k % block_size if block_size > 0 else 0
        tile_eff_k = rem_k / block_size if rem_k > 0 else 1.0
        overall_eff = tile_eff_m * tile_eff_n * tile_eff_k
        cu_util = total_output_tiles / max(self._hw["num_cus"], 1)

        # Problem-to-tile ratios
        ratio_gemm_m_to_tile_m = gemm_m / max(gemm_m_per_block, 1)
        ratio_gemm_n_to_tile_n = gemm_n / max(gemm_n_per_block, 1)
        ratio_gemm_k_to_tile_k = gemm_k / max(block_size, 1)
        problem_smaller_than_tile_m = float(gemm_m < gemm_m_per_block)
        problem_smaller_than_tile_n = float(gemm_n < gemm_n_per_block)
        problem_smaller_than_tile_k = float(gemm_k < block_size)

        hw = self._hw

        return np.array(
            [
                # Problem features (30)
                N,
                C,
                K,
                G,
                Hi,
                Wi,
                Y,
                X,
                stride_h,
                stride_w,
                pad_h,
                pad_w,
                Ho,
                Wo,
                log2_N,
                log2_C,
                log2_K,
                log2_G,
                log2_Hi,
                log2_Wi,
                log2_spatial,
                log2_filter,
                log2_output,
                ai,
                filter_area,
                is_1x1_conv,
                is_3x3_conv,
                channels_per_group,
                aspect_ratio_hw,
                aspect_ratio_filter,
                # 3D-specific features (8)
                is_3d,
                Di,
                Z,
                Do,
                stride_d,
                pad_d,
                dilation_h,
                dilation_w,
                # Tier-1 Group-specific features (8)
                log2_channels_per_group,
                log2_output_channels_per_group,
                is_depthwise,
                group_density,
                is_small_group,
                channels_product_per_group,
                batch_group_product,
                is_small_batch_grouped,
                # Kernel features (15)
                block_size,
                gemm_m_per_block,
                gemm_n_per_block,
                pipeline_code,
                num_warps,
                tile_volume,
                tile_mn,
                lds_est,
                lds_ratio,
                block_tile_ratio_m,
                block_tile_ratio_n,
                block_efficiency,
                is_compv3,
                is_compv4,
                is_compv5,
                # Suffix-aware kernel features (6)
                is_intrawave,
                has_dsb,
                has_si,
                is_basic,
                is_compv6,
                is_mem,
                # Interaction features (18)
                gemm_m,
                gemm_n,
                gemm_k,
                num_tiles_m,
                num_tiles_n,
                num_tiles_k,
                total_output_tiles,
                tile_eff_m,
                tile_eff_n,
                tile_eff_k,
                overall_eff,
                cu_util,
                ratio_gemm_m_to_tile_m,
                ratio_gemm_n_to_tile_n,
                ratio_gemm_k_to_tile_k,
                problem_smaller_than_tile_m,
                problem_smaller_than_tile_n,
                problem_smaller_than_tile_k,
                # Hardware features (12)
                hw["num_cus"],
                hw["simds_per_cu"],
                hw["total_simds"],
                hw["shader_engines"],
                hw["max_clock_mhz"],
                hw["max_waves_per_cu"],
                hw["wavefront_size"],
                hw["lds_capacity"],
                hw["l1_cache_kb"],
                hw["l2_cache_kb"],
                hw["l3_cache_kb"],
                hw["num_xcd"],
            ],
            dtype=np.float64,
        )
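    # A minimal convenience sketch (not part of the original interface): pairing
    # get_feature_names() with extract() gives a name -> value mapping, which is
    # handy when inspecting individual samples. The method name is hypothetical.
    def extract_named(self, problem: dict, kernel: dict) -> dict[str, float]:
        """Return extract() output keyed by feature name (debugging aid)."""
        values = self.extract(problem, kernel)
        return dict(zip(self.get_feature_names(), values.tolist()))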
    def extract_batch(self, df: pd.DataFrame) -> np.ndarray:
        """Vectorized batch extraction -- much faster than row-by-row."""
        n = len(df)
        names = self.get_feature_names()
        result = np.zeros((n, len(names)), dtype=np.float64)

        # Extract problem features (2D and 3D)
        N = df["N"].values.astype(np.float64)
        C = df["C"].values.astype(np.float64)
        K = df["K"].values.astype(np.float64)
        G = df["G"].values.astype(np.float64)
        Hi = df["Hi"].values.astype(np.float64)
        Wi = df["Wi"].values.astype(np.float64)
        Y = df["Y"].values.astype(np.float64)
        X = df["X"].values.astype(np.float64)
        stride_h = df["stride_h"].values.astype(np.float64)
        stride_w = df["stride_w"].values.astype(np.float64)
        pad_h = df["pad_h"].values.astype(np.float64)
        pad_w = df["pad_w"].values.astype(np.float64)

        # 3D parameters (default to 1 for 2D convolutions)
        Di = df.get("Di", pd.Series(np.ones(n))).values.astype(np.float64)
        Z = df.get("Z", pd.Series(np.ones(n))).values.astype(np.float64)
        stride_d = df.get("stride_d", pd.Series(np.ones(n))).values.astype(np.float64)
        pad_d = df.get("pad_d", pd.Series(np.zeros(n))).values.astype(np.float64)
        # Dilation defaults to 1 if not present (standard convolution)
        dilation_h = df.get("dilation_h", pd.Series(np.ones(n))).values.astype(
            np.float64
        )
        dilation_w = df.get("dilation_w", pd.Series(np.ones(n))).values.astype(
            np.float64
        )
        dilation_d = df.get("dilation_d", pd.Series(np.ones(n))).values.astype(
            np.float64
        )

        # Determine if 3D convolution
        is_3d = ((Di > 1) | (Z > 1) | (pad_d > 0)).astype(np.float64)

        # Compute output dimensions (match GroupedConvProblem.Ho/Wo/Do formula)
        eff_y = (Y - 1) * dilation_h + 1
        eff_x = (X - 1) * dilation_w + 1
        eff_z = (Z - 1) * dilation_d + 1
        Ho = (Hi + 2 * pad_h - eff_y) // stride_h + 1
        Wo = (Wi + 2 * pad_w - eff_x) // stride_w + 1
        Do = np.where(is_3d, (Di + 2 * pad_d - eff_z) // stride_d + 1, 1.0)

        # Log features (adjusted for 3D)
        log2_N = np.log2(np.maximum(N, 1))
        log2_C = np.log2(np.maximum(C, 1))
        log2_K = np.log2(np.maximum(K, 1))
        log2_G = np.log2(np.maximum(G, 1))
        log2_Hi = np.log2(np.maximum(Hi, 1))
        log2_Wi = np.log2(np.maximum(Wi, 1))

        # For 3D: spatial includes depth dimension
        spatial_volume = np.where(is_3d, Di * Hi * Wi, Hi * Wi)
        filter_volume = np.where(is_3d, Z * Y * X, Y * X)
        output_volume = np.where(is_3d, Do * Ho * Wo, Ho * Wo)
        log2_spatial = np.log2(np.maximum(spatial_volume, 1))
        log2_filter = np.log2(np.maximum(filter_volume, 1))
        log2_output = np.log2(np.maximum(output_volume, 1))

        # Arithmetic intensity (vectorized per-row for mixed-dtype batches)
        if "dtype" in df.columns:
            bpe = df["dtype"].map(DTYPE_BYTES).fillna(2.0).values.astype(np.float64)
        else:
            bpe = np.full(n, 2.0, dtype=np.float64)  # Default to bf16 bpe=2

        # FLOPs and arithmetic intensity (adjusted for 3D)
        flops = N * K * output_volume * (C / np.maximum(G, 1)) * filter_volume * 2
        input_bytes = N * C * spatial_volume * bpe
        filter_bytes = K * (C / np.maximum(G, 1)) * filter_volume * bpe
        output_bytes = N * K * output_volume * bpe
        bytes_transferred = input_bytes + filter_bytes + output_bytes
        ai = flops / np.maximum(bytes_transferred, 1)
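        # Illustrative magnitude check (made-up shape, not from the data): a 2D 1x1
        # conv with N=1, C=K=1024, G=1, Hi=Wi=56 in bf16 (bpe=2) gives
        # flops = 2 * 1024 * 1024 * 3136 ~= 6.6e9 and
        # bytes_transferred = (2 * 1024 * 3136 + 1024 * 1024) * 2 ~= 1.5e7,
        # so ai ~= 440 FLOP/byte; larger group counts lower ai because the FLOPs
        # and filter bytes shrink with C/G while input/output bytes do not.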
        # Derived problem features (adjusted for 3D)
        filter_area = filter_volume  # Y * X for 2D, Z * Y * X for 3D
        is_1x1_conv = np.where(
            is_3d,
            ((Y == 1) & (X == 1) & (Z == 1)).astype(np.float64),
            ((Y == 1) & (X == 1)).astype(np.float64),
        )
        is_3x3_conv = np.where(
            is_3d,
            ((Y == 3) & (X == 3) & (Z == 3)).astype(np.float64),
            ((Y == 3) & (X == 3)).astype(np.float64),
        )
        channels_per_group = C / np.maximum(G, 1)
        aspect_ratio_hw = Hi / np.maximum(Wi, 1)
        aspect_ratio_filter = Y / np.maximum(X, 1)

        # Tier-1 Group-specific features (8)
        output_channels_per_group = K / np.maximum(G, 1)
        log2_channels_per_group = np.log2(np.maximum(channels_per_group, 1))
        log2_output_channels_per_group = np.log2(
            np.maximum(output_channels_per_group, 1)
        )
        is_depthwise = ((G == C) & (G == K)).astype(np.float64)
        group_density = G / np.maximum(C, 1)
        is_small_group = (
            (channels_per_group < 16) | (output_channels_per_group < 16)
        ).astype(np.float64)
        channels_product_per_group = channels_per_group * output_channels_per_group
        batch_group_product = N * G
        is_small_batch_grouped = ((N < 8) & (G > 1)).astype(np.float64)

        # Kernel features
        block_size = df["block_size"].values.astype(np.float64)
        gemm_m_per_block = df["gemm_m_per_block"].values.astype(np.float64)
        gemm_n_per_block = df["gemm_n_per_block"].values.astype(np.float64)
        pipeline_code = (
            df["pipeline"].map(PIPELINE_MAP).fillna(0).values.astype(np.float64)
        )

        num_warps = block_size / 4.0
        tile_volume = gemm_m_per_block * gemm_n_per_block * block_size
        tile_mn = gemm_m_per_block * gemm_n_per_block

        # LDS usage
        lds_est = (gemm_m_per_block * block_size + gemm_n_per_block * block_size) * bpe
        lds_cap = np.full(n, self._hw["lds_capacity"], dtype=np.float64)
        # Match extract(): any compv4-family pipeline uses the reduced 32 KiB budget.
        compv4_mask = df["pipeline"].astype(str).str.startswith("compv4").values
        lds_cap[compv4_mask] = 32768
        lds_ratio = lds_est / np.maximum(lds_cap, 1)
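        # Illustrative magnitude check (made-up tile, not from the data): with
        # gemm_m_per_block = gemm_n_per_block = 128, block_size = 32 and bpe = 2,
        # lds_est = (128 * 32 + 128 * 32) * 2 = 16384 bytes, i.e. lds_ratio = 0.25
        # against the default 65536-byte budget (0.5 for compv4 pipelines).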
        # Kernel derived features
        block_tile_ratio_m = gemm_m_per_block / np.maximum(block_size, 1)
        block_tile_ratio_n = gemm_n_per_block / np.maximum(block_size, 1)
        block_efficiency = np.minimum(gemm_m_per_block, gemm_n_per_block) / np.maximum(
            np.maximum(gemm_m_per_block, gemm_n_per_block), 1
        )
        is_compv3_arr = (df["pipeline"] == "compv3").values.astype(np.float64)
        is_compv4_arr = (df["pipeline"] == "compv4").values.astype(np.float64)
        is_compv5_arr = (df["pipeline"] == "compv5").values.astype(np.float64)

        # Suffix-aware kernel features (6 new). Use df.get() with sensible defaults
        # so old parquets without these columns still load.
        wave_mode_series = df.get(
            "wave_mode", pd.Series(["intrawave"] * n, index=df.index)
        )
        is_intrawave_arr = (wave_mode_series == "intrawave").values.astype(np.float64)
        has_dsb_arr = (
            df.get("has_dsb", pd.Series(np.zeros(n), index=df.index))
            .fillna(0)
            .values.astype(np.float64)
        )
        has_si_arr = (
            df.get("has_si", pd.Series(np.zeros(n), index=df.index))
            .fillna(0)
            .values.astype(np.float64)
        )
        is_basic_arr = (
            df["pipeline"]
            .astype(str)
            .str.startswith("basic_v")
            .values.astype(np.float64)
        )
        is_compv6_arr = (df["pipeline"] == "compv6").values.astype(np.float64)
        is_mem_arr = (df["pipeline"] == "mem").values.astype(np.float64)

        # Interaction features (adjusted for 3D)
        # GEMM M: N * output_volume (N * Do * Ho * Wo for 3D, N * Ho * Wo for 2D)
        # GEMM N: K (output channels)
        # GEMM K: channels_per_group * filter_volume
        gemm_m = N * output_volume
        gemm_n = K
        gemm_k = (channels_per_group * filter_volume).astype(np.int64)

        num_tiles_m = np.ceil(gemm_m / np.maximum(gemm_m_per_block, 1))
        num_tiles_n = np.ceil(gemm_n / np.maximum(gemm_n_per_block, 1))
        num_tiles_k = np.ceil(gemm_k / np.maximum(block_size, 1))
        total_output_tiles = num_tiles_m * num_tiles_n

        rem_m = np.where(gemm_m_per_block > 0, gemm_m % gemm_m_per_block, 0)
        tile_eff_m = np.where(rem_m > 0, rem_m / gemm_m_per_block, 1.0)
        rem_n = np.where(gemm_n_per_block > 0, gemm_n % gemm_n_per_block, 0)
        tile_eff_n = np.where(rem_n > 0, rem_n / gemm_n_per_block, 1.0)
        rem_k = np.where(block_size > 0, gemm_k % block_size, 0)
        tile_eff_k = np.where(rem_k > 0, rem_k / block_size, 1.0)
        overall_eff = tile_eff_m * tile_eff_n * tile_eff_k
        cu_util = total_output_tiles / max(self._hw["num_cus"], 1)

        # Problem-to-tile ratios
        ratio_gemm_m_to_tile_m = gemm_m / np.maximum(gemm_m_per_block, 1)
        ratio_gemm_n_to_tile_n = gemm_n / np.maximum(gemm_n_per_block, 1)
        ratio_gemm_k_to_tile_k = gemm_k / np.maximum(block_size, 1)
        problem_smaller_than_tile_m = (gemm_m < gemm_m_per_block).astype(np.float64)
        problem_smaller_than_tile_n = (gemm_n < gemm_n_per_block).astype(np.float64)
        problem_smaller_than_tile_k = (gemm_k < block_size).astype(np.float64)

        hw = self._hw
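        # Note: the assignment order below must mirror get_feature_names() exactly;
        # any drift between the two silently mis-labels columns for the model.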
        # Assemble feature matrix column by column
        idx = 0
        result[:, idx] = N
        idx += 1
        result[:, idx] = C
        idx += 1
        result[:, idx] = K
        idx += 1
        result[:, idx] = G
        idx += 1
        result[:, idx] = Hi
        idx += 1
        result[:, idx] = Wi
        idx += 1
        result[:, idx] = Y
        idx += 1
        result[:, idx] = X
        idx += 1
        result[:, idx] = stride_h
        idx += 1
        result[:, idx] = stride_w
        idx += 1
        result[:, idx] = pad_h
        idx += 1
        result[:, idx] = pad_w
        idx += 1
        result[:, idx] = Ho
        idx += 1
        result[:, idx] = Wo
        idx += 1
        result[:, idx] = log2_N
        idx += 1
        result[:, idx] = log2_C
        idx += 1
        result[:, idx] = log2_K
        idx += 1
        result[:, idx] = log2_G
        idx += 1
        result[:, idx] = log2_Hi
        idx += 1
        result[:, idx] = log2_Wi
        idx += 1
        result[:, idx] = log2_spatial
        idx += 1
        result[:, idx] = log2_filter
        idx += 1
        result[:, idx] = log2_output
        idx += 1
        result[:, idx] = ai
        idx += 1
        result[:, idx] = filter_area
        idx += 1
        result[:, idx] = is_1x1_conv
        idx += 1
        result[:, idx] = is_3x3_conv
        idx += 1
        result[:, idx] = channels_per_group
        idx += 1
        result[:, idx] = aspect_ratio_hw
        idx += 1
        result[:, idx] = aspect_ratio_filter
        idx += 1
        # 3D-specific features (8)
        result[:, idx] = is_3d
        idx += 1
        result[:, idx] = Di
        idx += 1
        result[:, idx] = Z
        idx += 1
        result[:, idx] = Do
        idx += 1
        result[:, idx] = stride_d
        idx += 1
        result[:, idx] = pad_d
        idx += 1
        result[:, idx] = dilation_h
        idx += 1
        result[:, idx] = dilation_w
        idx += 1
        # Tier-1 Group-specific features (8)
        result[:, idx] = log2_channels_per_group
        idx += 1
        result[:, idx] = log2_output_channels_per_group
        idx += 1
        result[:, idx] = is_depthwise
        idx += 1
        result[:, idx] = group_density
        idx += 1
        result[:, idx] = is_small_group
        idx += 1
        result[:, idx] = channels_product_per_group
        idx += 1
        result[:, idx] = batch_group_product
        idx += 1
        result[:, idx] = is_small_batch_grouped
        idx += 1
        # Kernel features
        result[:, idx] = block_size
        idx += 1
        result[:, idx] = gemm_m_per_block
        idx += 1
        result[:, idx] = gemm_n_per_block
        idx += 1
        result[:, idx] = pipeline_code
        idx += 1
        result[:, idx] = num_warps
        idx += 1
        result[:, idx] = tile_volume
        idx += 1
        result[:, idx] = tile_mn
        idx += 1
        result[:, idx] = lds_est
        idx += 1
        result[:, idx] = lds_ratio
        idx += 1
        result[:, idx] = block_tile_ratio_m
        idx += 1
        result[:, idx] = block_tile_ratio_n
        idx += 1
        result[:, idx] = block_efficiency
        idx += 1
        result[:, idx] = is_compv3_arr
        idx += 1
        result[:, idx] = is_compv4_arr
        idx += 1
        result[:, idx] = is_compv5_arr
        idx += 1
        # Suffix-aware kernel features (6)
        result[:, idx] = is_intrawave_arr
        idx += 1
        result[:, idx] = has_dsb_arr
        idx += 1
        result[:, idx] = has_si_arr
        idx += 1
        result[:, idx] = is_basic_arr
        idx += 1
        result[:, idx] = is_compv6_arr
        idx += 1
        result[:, idx] = is_mem_arr
        idx += 1
        # Interaction features (18)
        result[:, idx] = gemm_m
        idx += 1
        result[:, idx] = gemm_n
        idx += 1
        result[:, idx] = gemm_k
        idx += 1
        result[:, idx] = num_tiles_m
        idx += 1
        result[:, idx] = num_tiles_n
        idx += 1
        result[:, idx] = num_tiles_k
        idx += 1
        result[:, idx] = total_output_tiles
        idx += 1
        result[:, idx] = tile_eff_m
        idx += 1
        result[:, idx] = tile_eff_n
        idx += 1
        result[:, idx] = tile_eff_k
        idx += 1
        result[:, idx] = overall_eff
        idx += 1
        result[:, idx] = cu_util
        idx += 1
        result[:, idx] = ratio_gemm_m_to_tile_m
        idx += 1
        result[:, idx] = ratio_gemm_n_to_tile_n
        idx += 1
        result[:, idx] = ratio_gemm_k_to_tile_k
        idx += 1
        result[:, idx] = problem_smaller_than_tile_m
        idx += 1
        result[:, idx] = problem_smaller_than_tile_n
        idx += 1
        result[:, idx] = problem_smaller_than_tile_k
        idx += 1
        # Hardware features (12)
        result[:, idx] = hw["num_cus"]
        idx += 1
        result[:, idx] = hw["simds_per_cu"]
        idx += 1
        result[:, idx] = hw["total_simds"]
        idx += 1
        result[:, idx] = hw["shader_engines"]
        idx += 1
        result[:, idx] = hw["max_clock_mhz"]
        idx += 1
        result[:, idx] = hw["max_waves_per_cu"]
        idx += 1
        result[:, idx] = hw["wavefront_size"]
        idx += 1
        result[:, idx] = hw["lds_capacity"]
        idx += 1
        result[:, idx] = hw["l1_cache_kb"]
        idx += 1
        result[:, idx] = hw["l2_cache_kb"]
        idx += 1
        result[:, idx] = hw["l3_cache_kb"]
        idx += 1
        result[:, idx] = hw["num_xcd"]
        idx += 1

        return result
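
# Minimal smoke-test sketch (illustrative only): the shapes and kernel values
# below are made up, not taken from any dataset, and this assumes the
# feature_engine module imported above is importable.
if __name__ == "__main__":
    _demo = pd.DataFrame(
        {
            "N": [2], "C": [64], "K": [64], "G": [4],
            "Hi": [56], "Wi": [56], "Y": [3], "X": [3],
            "stride_h": [1], "stride_w": [1], "pad_h": [1], "pad_w": [1],
            "dtype": ["bf16"],
            "block_size": [32], "gemm_m_per_block": [128],
            "gemm_n_per_block": [128], "pipeline": ["compv3"],
        }
    )
    _engine = GroupedConvFeatureEngine()
    _feats = _engine.extract_batch(_demo)
    print(f"extracted {_feats.shape[1]} features for {_feats.shape[0]} sample(s)")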