From e7c79223855b740ca3d09a67a287c1c725728b01 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Thu, 27 Nov 2025 00:36:02 +0000 Subject: [PATCH] Merge commit '79aae7c7f71404bdb80d6db52bc6401e0e221d42' into develop --- Jenkinsfile | 132 ++++++++++++++++++ example/ck_tile/01_fmha/CMakeLists.txt | 1 + example/ck_tile/02_layernorm2d/CMakeLists.txt | 2 +- example/ck_tile/03_gemm/CMakeLists.txt | 40 +++--- .../03_gemm/gemm_splitk_two_stage_reduce.cpp | 2 +- example/ck_tile/03_gemm/run_gemm_example.inc | 4 +- example/ck_tile/04_img2col/CMakeLists.txt | 2 +- example/ck_tile/05_reduce/CMakeLists.txt | 4 +- example/ck_tile/06_permute/CMakeLists.txt | 2 +- .../ck_tile/09_topk_softmax/CMakeLists.txt | 2 +- example/ck_tile/10_rmsnorm2d/CMakeLists.txt | 4 +- .../11_add_rmsnorm2d_rdquant/CMakeLists.txt | 4 +- example/ck_tile/12_smoothquant/CMakeLists.txt | 2 +- example/ck_tile/13_moe_sorting/CMakeLists.txt | 2 +- .../ck_tile/14_moe_smoothquant/CMakeLists.txt | 3 +- example/ck_tile/15_fused_moe/CMakeLists.txt | 32 ++--- .../instances/fused_moesorting_api.cpp | 16 --- .../ck_tile/16_batched_gemm/CMakeLists.txt | 2 +- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 24 ++-- .../run_grouped_gemm_example.inc | 3 +- example/ck_tile/18_flatmm/CMakeLists.txt | 41 +++--- .../18_flatmm/run_grouped_flatmm_example.inc | 4 +- .../18_flatmm/run_moe_flatmm_example.inc | 4 - .../ck_tile/19_gemm_multi_d/CMakeLists.txt | 2 +- .../20_grouped_convolution/CMakeLists.txt | 30 ++-- ...nvolution_forward_large_tensor_invoker.hpp | 40 +++--- .../ck_tile/22_gemm_multi_abd/CMakeLists.txt | 2 +- .../35_batched_transpose/CMakeLists.txt | 17 +-- example/ck_tile/36_pooling/CMakeLists.txt | 3 +- .../38_block_scale_gemm/CMakeLists.txt | 2 +- .../ck_tile/40_streamk_gemm/CMakeLists.txt | 2 +- .../41_batched_contraction/CMakeLists.txt | 2 +- .../run_batched_contraction_example.inc | 28 ++-- .../ops/pooling/kernel/pool_kernel.hpp | 6 +- script/infra_helper/capture_build_trace.js | 53 +++++++ tile_engine/ops/gemm/CMakeLists.txt | 1 + tile_engine/ops/gemm_multi_d/CMakeLists.txt | 1 + .../ops/gemm_preshuffle/CMakeLists.txt | 1 + .../ck_tile/00_copy_kernel/CMakeLists.txt | 2 +- tutorial/ck_tile/01_naive_gemm/CMakeLists.txt | 2 +- ...ce_gemm_host_pipeline_agmem_bgmem_creg.hpp | 8 +- 41 files changed, 360 insertions(+), 174 deletions(-) create mode 100644 script/infra_helper/capture_build_trace.js diff --git a/Jenkinsfile b/Jenkinsfile index f3e690edd7..c79b8f18e1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -72,6 +72,129 @@ def sendFailureNotifications() { } } +def generateAndArchiveBuildTraceVisualization() { + try { + def buildTraceFileName = "ck_build_trace.json"; + + // Attempt to download the build trace file to check if it exists + def traceFileExists = false + try { + copyArtifacts( + projectName: env.JOB_NAME, + selector: specific(env.BUILD_NUMBER), + filter: buildTraceFileName + ) + traceFileExists = fileExists(buildTraceFileName) + } catch (Exception e) { + echo "Could not copy artifacts: ${e.getMessage()}" + traceFileExists = false + } + + sh """ + echo "post download:" + ls -la + """ + + if (traceFileExists) { + // Move the build trace file to a temporary location to preserve it during checkout + sh """ + mkdir -p /tmp/jenkins_artifacts + cp ${buildTraceFileName} /tmp/jenkins_artifacts/${buildTraceFileName} + ls -la /tmp/jenkins_artifacts/ + """ + } else { + echo "Build trace archive not found" + return + } + + // Checkout source code to get required files + checkout scm + + // Restore the build trace file after checkout + sh """ + ls -la + cp /tmp/jenkins_artifacts/${buildTraceFileName} ${buildTraceFileName} + ls -la ${buildTraceFileName} + """ + + // Pull image + def image = "ghcr.io/puppeteer/puppeteer:24.30.0" + echo "Pulling image: ${image}" + def retimage = docker.image("${image}") + retimage.pull() + + // Create a temporary workspace + sh """#!/bin/bash + ls -la + mkdir -p workspace + cp ./script/infra_helper/capture_build_trace.js ./workspace + cp ${buildTraceFileName} ./workspace/${buildTraceFileName} + chmod 777 ./workspace + ls -la ./workspace + """ + + // Run container to get snapshot + def dockerOpts = "--cap-add=SYS_ADMIN -v \"\$(pwd)/workspace:/workspace\" -e NODE_PATH=/home/pptruser/node_modules" + // Create unique image name by sanitizing job name + def sanitizedJobName = env.JOB_NAME.replaceAll(/[\/\\:*?"<>| ]/, '_') + def imageName = "perfetto_snapshot_${sanitizedJobName}_build_${env.BUILD_NUMBER}.png" + sh """ + docker run --rm ${dockerOpts} ${image} node /workspace/capture_build_trace.js + mv ./workspace/perfetto_snapshot_build.png ./workspace/${imageName} + """ + + // Archive the snapshot + sh """ + mv ./workspace/${imageName} ${imageName} + """ + archiveArtifacts "${imageName}" + + // Notify the channel + withCredentials([string(credentialsId: 'ck_ci_build_perf_webhook_url', variable: 'WEBHOOK_URL')]) { + sh ''' + # Create build trace filename with build number based on the original filename + BUILD_TRACE_WITH_NUMBER=$(echo "''' + buildTraceFileName + '''" | sed 's/.json/_''' + sanitizedJobName + '''_''' + env.BUILD_NUMBER + '''.json/') + + # Convert image to base64 + echo "Converting image to base64..." + IMAGE_BASE64=$(base64 -w 0 ''' + imageName + ''') + echo "Image base64 length: ${#IMAGE_BASE64}" + + # Convert build trace to base64 + echo "Converting build trace to base64..." + BUILD_TRACE_BASE64=$(base64 -w 0 ''' + buildTraceFileName + ''') + echo "Build trace base64 length: ${#BUILD_TRACE_BASE64}" + + # Create JSON payload with base64 data + echo "Creating JSON payload..." + { + printf '{\n' + printf ' "jobName": "%s",\n' "''' + env.JOB_NAME + '''" + printf ' "buildNumber": "%s",\n' "''' + env.BUILD_NUMBER + '''" + printf ' "jobUrl": "%s",\n' "''' + env.RUN_DISPLAY_URL + '''" + printf ' "imageName": "%s",\n' "''' + imageName + '''" + printf ' "imageData": "%s",\n' "$IMAGE_BASE64" + printf ' "buildTraceName": "%s",\n' "$BUILD_TRACE_WITH_NUMBER" + printf ' "buildTraceData": "%s"\n' "$BUILD_TRACE_BASE64" + printf '}\n' + } > webhook_payload.json + + echo "JSON payload created, size: $(wc -c < webhook_payload.json) bytes" + + curl -X POST "${WEBHOOK_URL}" \ + -H "Content-Type: application/json" \ + -d @webhook_payload.json + + # Clean up temporary file + rm -f webhook_payload.json + ''' + } + } catch (Exception e) { + echo "Throwing error exception while generating build trace visualization" + echo 'Exception occurred: ' + e.toString() + } +} + class Version { int major, minor, patch @Override @@ -1750,6 +1873,15 @@ pipeline { } } post { + always { + node(rocmnode("nogpu")) { + script { + // Simulate capture + generateAndArchiveBuildTraceVisualization() + } + cleanWs() + } + } success { script { // Report the parent stage build ck and run tests status diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index ce914b92af..9edf50e89c 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -109,6 +109,7 @@ set(FMHA_FWD_INSTANCES "tile_fmha_fwd_instances") set(FMHA_BWD_INSTANCES "tile_fmha_bwd_instances") message(DEBUG "adding instances ${FMHA_FWD_INSTANCES}") +# to save build time, exclude the target from "all" target of "01_fmha" directory and its ancestors add_library(${FMHA_FWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL) target_include_directories(${FMHA_FWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${FMHA_FWD_INSTANCES} PRIVATE ${FMHA_FWD_GEN_BLOBS}) diff --git a/example/ck_tile/02_layernorm2d/CMakeLists.txt b/example/ck_tile/02_layernorm2d/CMakeLists.txt index 07714f0fe2..8b14174b63 100644 --- a/example/ck_tile/02_layernorm2d/CMakeLists.txt +++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt @@ -26,7 +26,7 @@ add_custom_command( set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd") message(DEBUG "adding example ${EXAMPLE_LAYERNORM2D_FWD}") -add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp) +add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp) target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS}) diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index d2112a67bf..c01f93ddb7 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,20 +1,22 @@ -add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) -add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp) -add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp) -add_executable(tile_example_gemm_reduce EXCLUDE_FROM_ALL gemm_splitk_two_stage_reduce.cpp) -add_executable(tile_example_gemm_splitk_two_stage EXCLUDE_FROM_ALL gemm_splitk_two_stage.cpp) -set(EXAMPLE_GEMM_COMPILE_OPTIONS) -set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS) -if(CK_USE_OCP_FP8) - list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) +if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a") + add_executable(tile_example_gemm_basic gemm_basic.cpp) + add_executable(tile_example_gemm_universal universal_gemm.cpp) + add_executable(tile_example_gemm_weight_preshuffle gemm_weight_preshuffle.cpp) + add_executable(tile_example_gemm_reduce gemm_splitk_two_stage_reduce.cpp) + add_executable(tile_example_gemm_splitk_two_stage gemm_splitk_two_stage.cpp) + set(EXAMPLE_GEMM_COMPILE_OPTIONS) + set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS) + if(CK_USE_OCP_FP8) + list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) + endif() + list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) + list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef) + list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker) + list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps) + list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0") + target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS}) + target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) endif() -list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) -list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef) -list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker) -list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps) -list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0") -target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS}) -target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp index 492f94bae7..354d236c20 100644 --- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp +++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp @@ -683,7 +683,7 @@ int run_gemm_example_with_layouts_two_stage(ck_tile::ArgParser& arg_parser, if constexpr(preshuffle) { - ck_tile::HostTensor b_shuffle_host = shuffle_b(b_k_n); + ck_tile::HostTensor b_shuffle_host = ck_tile::shuffle_b(b_k_n); // shuffled buffer B for device implementation b_k_n_dev_buf.ToDevice(b_shuffle_host.data()); } diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index c38ce7ce83..204114d6bb 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -284,12 +284,12 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser, if constexpr(GemmConfig::TiledMMAPermuteN) { std::cout << "Run with PermuteN" << std::endl; - return shuffle_b_permuteN(b_k_n); + return ck_tile::shuffle_b_permuteN(b_k_n); } else { std::cout << "Run without PermuteN" << std::endl; - return shuffle_b(b_k_n); + return ck_tile::shuffle_b(b_k_n); } }(); // shuffled buffer B for device implementation diff --git a/example/ck_tile/04_img2col/CMakeLists.txt b/example/ck_tile/04_img2col/CMakeLists.txt index 3864c9ed9d..145adffd10 100644 --- a/example/ck_tile/04_img2col/CMakeLists.txt +++ b/example/ck_tile/04_img2col/CMakeLists.txt @@ -1,3 +1,3 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp) +add_executable(tile_example_img2col image_to_column.cpp) diff --git a/example/ck_tile/05_reduce/CMakeLists.txt b/example/ck_tile/05_reduce/CMakeLists.txt index 2f48bb85a5..5ffd960403 100644 --- a/example/ck_tile/05_reduce/CMakeLists.txt +++ b/example/ck_tile/05_reduce/CMakeLists.txt @@ -3,7 +3,7 @@ set(EXAMPLE_REDUCE "tile_example_reduce") # to be included in "make all/install/check" message(DEBUG "adding example ${EXAMPLE_REDUCE}") -add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp) +add_executable(${EXAMPLE_REDUCE} reduce.cpp) target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) set(EXAMPLE_REDUCE_COMPILE_OPTIONS) @@ -16,4 +16,4 @@ target_compile_options(${EXAMPLE_REDUCE} PRIVATE ${EXAMPLE_REDUCE_COMPILE_OPTION # by cmake will print too many files, execvp: /bin/sh: Argument list too long # however, this property may affect global # TODO: consider codegen a makefile by us -set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) \ No newline at end of file +set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) diff --git a/example/ck_tile/06_permute/CMakeLists.txt b/example/ck_tile/06_permute/CMakeLists.txt index 327fceb685..f31f2ce693 100644 --- a/example/ck_tile/06_permute/CMakeLists.txt +++ b/example/ck_tile/06_permute/CMakeLists.txt @@ -1,6 +1,6 @@ # not using add_example_executable() to add this target, since we don't want this to have # to be included in "make all/install/check" -add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp) +add_executable(tile_example_permute permute.cpp) if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL) # set(PERMUTE_USE_ALTERNATIVE_IMPL false) diff --git a/example/ck_tile/09_topk_softmax/CMakeLists.txt b/example/ck_tile/09_topk_softmax/CMakeLists.txt index b43b989792..5fdbb43838 100644 --- a/example/ck_tile/09_topk_softmax/CMakeLists.txt +++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp) +add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp) target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS) diff --git a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt index 878f668f91..c6fb677fab 100644 --- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt +++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt @@ -26,7 +26,7 @@ add_custom_command( set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd") message(DEBUG "adding ${TILE_RMSNORM2D_FWD}") -add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp) +add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp) target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS}) @@ -38,7 +38,7 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd") -add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp) +add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp) target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt index 7d56dd1fe3..61c19d4122 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt @@ -3,7 +3,7 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd") # to be included in "make all/install/check" message(DEBUG "adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}") file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp) +add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp) target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS}) @@ -15,7 +15,7 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd") -add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp) +add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp) target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS}) # TODO: we have to turn off this global prop, otherwise the progress bar generated diff --git a/example/ck_tile/12_smoothquant/CMakeLists.txt b/example/ck_tile/12_smoothquant/CMakeLists.txt index 52f10b8d51..c220edee08 100644 --- a/example/ck_tile/12_smoothquant/CMakeLists.txt +++ b/example/ck_tile/12_smoothquant/CMakeLists.txt @@ -2,7 +2,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC) message(DEBUG "adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + add_executable(${TARGET_NAME} ${MAIN_SRC}) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) diff --git a/example/ck_tile/13_moe_sorting/CMakeLists.txt b/example/ck_tile/13_moe_sorting/CMakeLists.txt index 09f3e4ac4e..ec9ceaa298 100644 --- a/example/ck_tile/13_moe_sorting/CMakeLists.txt +++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp) +add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp) target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS) diff --git a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt index 6b848bda2a..039c43375d 100644 --- a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt +++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt @@ -2,7 +2,7 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC) message(DEBUG "adding ${TARGET_NAME}") # not using add_example_executable() to add target, since we don't want this to have # to be included in "make all/install/check" - add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC}) + add_executable(${TARGET_NAME} ${MAIN_SRC}) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) foreach(source IN LISTS ARGN) @@ -22,4 +22,3 @@ endfunction(add_moe_smoothquant_example TARGET_NAME MAIN_SRC) file(GLOB INSTANCE_SRCS instances/*.cpp) add_moe_smoothquant_example(tile_example_moe_smoothquant moe_smoothquant.cpp ${INSTANCE_SRCS}) - diff --git a/example/ck_tile/15_fused_moe/CMakeLists.txt b/example/ck_tile/15_fused_moe/CMakeLists.txt index 78ec754528..42b96265ad 100644 --- a/example/ck_tile/15_fused_moe/CMakeLists.txt +++ b/example/ck_tile/15_fused_moe/CMakeLists.txt @@ -1,19 +1,19 @@ -set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe") -# not using add_example_executable() to add this target, since we don't want this to have -# to be included in "make all/install/check" -message(DEBUG "adding ${TILE_EXAPMLE_FUSED_MOE}") -file(GLOB INSTANCE_SRCS instances/*.cpp) -add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp) -target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) +if(GPU_TARGETS MATCHES "gfx94|gfx95") + set(TILE_EXAMPLE_FUSED_MOE "tile_example_fused_moe") + message(DEBUG "adding ${TILE_EXAMPLE_FUSED_MOE}") + file(GLOB INSTANCE_SRCS instances/*.cpp) + add_executable(${TILE_EXAMPLE_FUSED_MOE} main.cpp) + target_include_directories(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + target_sources(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS}) -set(TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS) + set(TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS) -# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations -list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) -list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a -list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta -# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -mllvm -greedy-reverse-local-assignment=1) -# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations + list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a + list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta + # list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -mllvm -greedy-reverse-local-assignment=1) + # list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) -target_compile_options(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS}) + target_compile_options(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS}) +endif() diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp index 04ad882200..d80fed7e8c 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp @@ -402,22 +402,6 @@ float fused_moesorting_mp(fused_moesorting_trait t, using ms_index_t = ck_tile::index_t; using ms_weight_type = float; - auto maybe_clear_workspace = [=](const ck_tile::stream_config& s_) { - if(t.clear_workspace_inside_api) - { - if(is_local_token) - { - auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(true, 1024, 1); - k(s_); - } - else - { - auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(false, 1024, 1); - k(s_); - } - } - }; - if(a.tokens < 2048) { if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) > diff --git a/example/ck_tile/16_batched_gemm/CMakeLists.txt b/example/ck_tile/16_batched_gemm/CMakeLists.txt index 78e78c6b04..0d94a4a9a1 100644 --- a/example/ck_tile/16_batched_gemm/CMakeLists.txt +++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt @@ -1 +1 @@ -add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp) +add_executable(tile_example_batched_gemm batched_gemm.cpp) diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index bbfb2df006..8e4550448b 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,12 +1,14 @@ -add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) -add_executable(tile_example_quant_grouped_gemm EXCLUDE_FROM_ALL quant_grouped_gemm.cpp) -add_executable(tile_example_grouped_gemm_preshuffle EXCLUDE_FROM_ALL grouped_gemm_preshuffle.cpp) -add_executable(tile_example_grouped_gemm_multi_d EXCLUDE_FROM_ALL grouped_gemm_multi_d.cpp) -set(EXAMPLE_GEMM_COMPILE_OPTIONS) -if(CK_USE_OCP_FP8) - list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) +if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a") + add_executable(tile_example_grouped_gemm grouped_gemm.cpp) + add_executable(tile_example_quant_grouped_gemm quant_grouped_gemm.cpp) + add_executable(tile_example_grouped_gemm_preshuffle grouped_gemm_preshuffle.cpp) + add_executable(tile_example_grouped_gemm_multi_d grouped_gemm_multi_d.cpp) + set(EXAMPLE_GEMM_COMPILE_OPTIONS) + if(CK_USE_OCP_FP8) + list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) + endif() + target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) endif() -target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) \ No newline at end of file diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc index a620964eaf..390a54644b 100644 --- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -233,7 +233,8 @@ int run_grouped_gemm_example_with_layouts(int argc, // Perform preshuffle for B tensor if constexpr(GemmConfig::Preshuffle) { - ck_tile::HostTensor b_shuffle_host = shuffle_b(b_k_n_tensors[i]); + ck_tile::HostTensor b_shuffle_host = + ck_tile::shuffle_b(b_k_n_tensors[i]); b_k_n_dev_buf.push_back(std::make_unique(b_shuffle_host)); } else diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt index 43789750d0..b9f5e89e36 100644 --- a/example/ck_tile/18_flatmm/CMakeLists.txt +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -9,18 +9,6 @@ foreach(gpu IN LISTS GPU_TARGETS) endforeach() if(has_supported_gpu) - add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp) - add_executable(tile_example_mixed_prec_flatmm EXCLUDE_FROM_ALL mixed_prec/mixed_prec_flatmm.cpp) - add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp) - add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp) - add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp) - - include(mxgemm/mx_flatmm_instance.cmake) - mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES) - message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}") - add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES}) - target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm) - # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations # ... because they are auto-generated set(EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template) @@ -30,11 +18,28 @@ if(has_supported_gpu) list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) endif() + add_executable(tile_example_flatmm_basic flatmm_basic.cpp) target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) - target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) - target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) - target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) - target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) - target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) # TODO: 950 only -endif() + add_executable(tile_example_moe_flatmm moe_flatmm.cpp) + target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) + + add_executable(tile_example_grouped_flatmm grouped_flatmm.cpp) + target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) + + if (GPU_TARGETS MATCHES "gfx95") + add_executable(tile_example_mixed_prec_flatmm mixed_prec/mixed_prec_flatmm.cpp) + target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) + + add_executable(tile_example_a16w4_moe_flatmm mixed_prec/a16w4_moe_flatmm.cpp) + target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) + + include(mxgemm/mx_flatmm_instance.cmake) + mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES) + message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}") + + add_executable(tile_example_mx_flatmm mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES}) + target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm) + target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) + endif() +endif() diff --git a/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc b/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc index c891f1139e..2027544709 100644 --- a/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc @@ -215,7 +215,7 @@ int run_contiguous_grouped_flatmm_example_with_layouts( assert(N % N_Warp_Tile == 0 && "N must be divisible by N_Warp_Tile for contiguous grouped gemm"); ck_tile::HostTensor b_shuffle_host = - shuffle_b(b_k_n_tensor); + ck_tile::shuffle_b(b_k_n_tensor); std::unique_ptr a_m_k_dev_buf( std::make_unique(a_m_k_tensor.get_element_space_size_in_bytes())); @@ -431,7 +431,7 @@ int run_masked_grouped_flatmm_example_with_layouts( assert(N % N_Warp_Tile == 0 && "N must be divisible by N_Warp_Tile for contiguous grouped gemm"); ck_tile::HostTensor b_shuffle_host = - shuffle_b(b_k_n_tensor); + ck_tile::shuffle_b(b_k_n_tensor); std::unique_ptr a_m_k_dev_buf( std::make_unique(a_m_k_tensor.get_element_space_size_in_bytes())); diff --git a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc index f5259ea87b..c58ddc2584 100644 --- a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc @@ -302,10 +302,6 @@ int run_moe_gemm_example_with_layouts(int argc, static_cast(per_token_scale_dev_buf.GetDeviceBuffer()), static_cast(per_channel_scale_dev_buf.GetDeviceBuffer())); - const float max_accumulated_value = - *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, 1 /*kbatch*/, max_accumulated_value); c_m_n_ref_buf->FromDevice(c_m_n_host_ref.data()); const float rtol = std::is_same_v && IsInputGemm ? 1e-3 : 1e-2; diff --git a/example/ck_tile/19_gemm_multi_d/CMakeLists.txt b/example/ck_tile/19_gemm_multi_d/CMakeLists.txt index 4ecfec7ccf..a237b2b676 100644 --- a/example/ck_tile/19_gemm_multi_d/CMakeLists.txt +++ b/example/ck_tile/19_gemm_multi_d/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(tile_example_gemm_multi_d_fp16 EXCLUDE_FROM_ALL gemm_multi_d_fp16.cpp) +add_executable(tile_example_gemm_multi_d_fp16 gemm_multi_d_fp16.cpp) set(EXAMPLE_GEMM_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt index ed2a2a0dd6..ef8bafc1df 100644 --- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt +++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt @@ -1,20 +1,22 @@ -set(EXAMPLE_CONV_COMPILE_OPTIONS) -list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) +if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a") + set(EXAMPLE_CONV_COMPILE_OPTIONS) + list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) -add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp) -target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) + add_executable(tile_example_grouped_conv_fwd grouped_convolution_forward.cpp) + target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) -add_executable(tile_example_grouped_conv_fwd_large_tensor EXCLUDE_FROM_ALL grouped_convolution_forward_large_tensor.cpp) -target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) + add_executable(tile_example_grouped_conv_fwd_large_tensor grouped_convolution_forward_large_tensor.cpp) + target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) -add_executable(tile_example_grouped_conv_fwd_bias_clamp EXCLUDE_FROM_ALL grouped_convolution_forward_bias_clamp.cpp) -target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + add_executable(tile_example_grouped_conv_fwd_bias_clamp grouped_convolution_forward_bias_clamp.cpp) + target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp) -target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) + add_executable(tile_example_grouped_conv_bwd_weight grouped_convolution_backward_weight.cpp) + target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) -add_executable(tile_example_grouped_conv_bwd_weight_two_stage EXCLUDE_FROM_ALL grouped_convolution_backward_weight_two_stage.cpp) -target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) + add_executable(tile_example_grouped_conv_bwd_weight_two_stage grouped_convolution_backward_weight_two_stage.cpp) + target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) -add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp) -target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) + add_executable(tile_example_grouped_conv_bwd_data grouped_convolution_backward_data.cpp) + target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) +endif() diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp index f168d36cac..d154d8710b 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp @@ -101,7 +101,6 @@ struct GroupedConvolutionForwardInvoker const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - float ave_time{0}; using TransformType = ck_tile::TransformConvFwdToGemm(const auto has_hot_loop_, - const auto tail_number_, - const auto memory_operation_) { + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_, + const auto enable_split_image_) { constexpr bool has_hot_loop_v = has_hot_loop_.value; constexpr auto tail_number_v = tail_number_.value; constexpr auto scheduler = ConvConfig::Scheduler; constexpr auto memory_operation = memory_operation_.value; + constexpr bool EnableSplitImage = enable_split_image_.value; using GroupedConvTraitsType = std::conditional_t(Kernel{}, grids, blocks, 0, kargs)); - - return ave_time; }; // ===================================================================== @@ -369,28 +368,33 @@ struct GroupedConvolutionForwardInvoker // ===================================================================== if(use_split_image) { - // Use split-image kernel (Kernel) const auto RunSplitImage = [&](const auto has_hot_loop_, const auto tail_number_) { if(args.k_batch == 1) - Run.template operator()(has_hot_loop_, tail_number_, MemoryOpSet{}); + return Run( + has_hot_loop_, tail_number_, MemoryOpSet{}, ck_tile::bool_constant{}); else - Run.template operator()(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + return Run(has_hot_loop_, + tail_number_, + MemoryOpAtomicAdd{}, + ck_tile::bool_constant{}); }; - BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num); + return BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num); } else { - // Use regular kernel (Kernel) const auto RunRegular = [&](const auto has_hot_loop_, const auto tail_number_) { if(args.k_batch == 1) - Run.template operator()(has_hot_loop_, tail_number_, MemoryOpSet{}); + return Run(has_hot_loop_, + tail_number_, + MemoryOpSet{}, + ck_tile::bool_constant{}); else - Run.template operator()( - has_hot_loop_, tail_number_, MemoryOpAtomicAdd{}); + return Run(has_hot_loop_, + tail_number_, + MemoryOpAtomicAdd{}, + ck_tile::bool_constant{}); }; - BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num); + return BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num); } - - return ave_time; } }; diff --git a/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt b/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt index f382e0cf45..927c77bd11 100644 --- a/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt +++ b/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt @@ -1 +1 @@ -add_executable(tile_example_gemm_multi_abd_fp16 EXCLUDE_FROM_ALL gemm_multi_abd_fp16.cpp) +add_executable(tile_example_gemm_multi_abd_fp16 gemm_multi_abd_fp16.cpp) diff --git a/example/ck_tile/35_batched_transpose/CMakeLists.txt b/example/ck_tile/35_batched_transpose/CMakeLists.txt index a08fcebb74..326ab1b8e1 100644 --- a/example/ck_tile/35_batched_transpose/CMakeLists.txt +++ b/example/ck_tile/35_batched_transpose/CMakeLists.txt @@ -1,9 +1,10 @@ -set(TARGET_NAME tile_example_batched_transpose) -add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp) -target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) - -# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations -list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) -# list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) -target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS}) +if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a") + set(TARGET_NAME tile_example_batched_transpose) + add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp) + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/) + # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations + list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal) + # list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker) + target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS}) +endif() diff --git a/example/ck_tile/36_pooling/CMakeLists.txt b/example/ck_tile/36_pooling/CMakeLists.txt index 425a8c83ba..17f8865d53 100644 --- a/example/ck_tile/36_pooling/CMakeLists.txt +++ b/example/ck_tile/36_pooling/CMakeLists.txt @@ -1,8 +1,7 @@ set(EXAMPLE_POOL_3D "tile_example_pool3d") message(DEBUG "adding example ${EXAMPLE_POOL_3D}") -add_executable(${EXAMPLE_POOL_3D} EXCLUDE_FROM_ALL pool3d.cpp) +add_executable(${EXAMPLE_POOL_3D} pool3d.cpp) target_include_directories(${EXAMPLE_POOL_3D} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_compile_options(${EXAMPLE_POOL_3D} PRIVATE ${EXAMPLE_POOL_COMPILE_OPTIONS}) - diff --git a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt index 40a4166126..60fab5bd00 100644 --- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt +++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt @@ -7,7 +7,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12") set(EXE_NAME tile_example_gemm_quant) - add_executable(${EXE_NAME} EXCLUDE_FROM_ALL + add_executable(${EXE_NAME} gemm_quant.cpp gemm_aquant_quantgrouped.cpp gemm_aquant_quantgrouped_preshufflequant.cpp diff --git a/example/ck_tile/40_streamk_gemm/CMakeLists.txt b/example/ck_tile/40_streamk_gemm/CMakeLists.txt index 3b285a54b5..30a487ca4c 100644 --- a/example/ck_tile/40_streamk_gemm/CMakeLists.txt +++ b/example/ck_tile/40_streamk_gemm/CMakeLists.txt @@ -1,5 +1,5 @@ if(GPU_TARGETS MATCHES "gfx9") - add_executable(tile_example_streamk_gemm_basic EXCLUDE_FROM_ALL streamk_gemm_basic.cpp) + add_executable(tile_example_streamk_gemm_basic streamk_gemm_basic.cpp) set(EXAMPLE_GEMM_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) diff --git a/example/ck_tile/41_batched_contraction/CMakeLists.txt b/example/ck_tile/41_batched_contraction/CMakeLists.txt index 10b2e48cbf..2ba14f80a4 100644 --- a/example/ck_tile/41_batched_contraction/CMakeLists.txt +++ b/example/ck_tile/41_batched_contraction/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(tile_example_batched_contraction EXCLUDE_FROM_ALL batched_contraction.cpp) +add_executable(tile_example_batched_contraction batched_contraction.cpp) set(EXAMPLE_CONTRACTION_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_CONTRACTION_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) diff --git a/example/ck_tile/41_batched_contraction/run_batched_contraction_example.inc b/example/ck_tile/41_batched_contraction/run_batched_contraction_example.inc index 9ebacdedd3..f1a5f8e9ae 100644 --- a/example/ck_tile/41_batched_contraction/run_batched_contraction_example.inc +++ b/example/ck_tile/41_batched_contraction/run_batched_contraction_example.inc @@ -316,20 +316,20 @@ int run_batched_contraction_example_with_layouts( auto start_time = std::chrono::high_resolution_clock::now(); - calculate_reference_flat_indexing(a_full_dims_host, - b_full_dims_host, - ds_full_dims_host, - e_full_dims_host_ref, - G_total, - M_total, - N_total, - K_total, - CDEElementWise{}); + ck_tile::calculate_reference_flat_indexing(a_full_dims_host, + b_full_dims_host, + ds_full_dims_host, + e_full_dims_host_ref, + G_total, + M_total, + N_total, + K_total, + CDEElementWise{}); auto end_time = std::chrono::high_resolution_clock::now(); auto duration = diff --git a/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp b/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp index 06835d7b48..91be63b803 100644 --- a/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp +++ b/include/ck_tile/ops/pooling/kernel/pool_kernel.hpp @@ -436,12 +436,14 @@ struct PoolKernel // Main reduction loop - with index tracking for(int k_tile = amd_wave_read_first_lane(0); k_tile < num_k_tiles; ++k_tile) { - const auto x_tile = load_tile(x_window); + const auto x_tile = load_tile(x_window); + const auto& in_tensor_padded_ref = + in_tensor_padded; // structured bindings cannot be captured prior to cpp20 auto index_calculator = [&](const auto& x_indices) { // Get global coordinates in the 2D matrix space (M, N) const auto global_M = x_indices.at(number<0>{}) + iM; const auto global_N = (k_tile * S::Block_N) + x_indices.at(number<1>{}); - return in_tensor_padded.get_tensor_descriptor().calculate_offset( + return in_tensor_padded_ref.get_tensor_descriptor().calculate_offset( make_tuple(global_M, global_N)); }; diff --git a/script/infra_helper/capture_build_trace.js b/script/infra_helper/capture_build_trace.js new file mode 100644 index 0000000000..e484a815cc --- /dev/null +++ b/script/infra_helper/capture_build_trace.js @@ -0,0 +1,53 @@ +const puppeteer = require('puppeteer'); + +(async () => { + try { + // Launch the browser + const browser = await puppeteer.launch({ + args: [ + '--no-sandbox', + '--headless', + '--disable-gpu', + '--window-size=1920x1080' + ]}); + const page = await browser.newPage(); + await page.setViewport({ width: 1920, height: 1080 }); + await page.goto('https://ui.perfetto.dev'); + // Wait for the home page to be visible + console.log('Waiting for page to load...'); + await page.waitForSelector('.pf-home-page', { visible: true, timeout: 30000 }); + // Locate and click the Open trace button + const elements = await page.$$('li'); + let element = null; + for (const el of elements) { + const text = await el.evaluate(node => node.textContent); + if (text && text.includes('Open trace file')) { + element = el; + break; + } + } + if (element) { + const [fileChooser] = await Promise.all([ + page.waitForFileChooser(), + element.click() + ]); + await fileChooser.accept(['/workspace/ck_build_trace.json']); + } else { + throw new Error('Element not found'); + } + console.log('Waiting for data to load...'); + // Wait for the timeline element to be visible + await page.waitForSelector('.pf-track', { timeout: 30000 }); + // Wait for the data to finish loading + await page.waitForFunction(() => { + return !document.body.textContent.includes('Loading...'); + }, { timeout: 30000 }); + console.log('Capturing screenshot...'); + await page.screenshot({path: '/workspace/perfetto_snapshot_build.png'}); + console.log('Done capturing screenshot...'); + await browser.close(); + } catch (err) { + console.error(err); + process.exit(1); + } +})(); \ No newline at end of file diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index a72b6c40ab..5e45cecaec 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -64,6 +64,7 @@ function(create_individual_gemm_target datatype layout trait tile_config config_ # Create the executable add_executable(${target_name} + # to save build time, exclude the target from "all" target of "gemm" directory and its ancestors EXCLUDE_FROM_ALL ${GEMM_SOURCE_DIR}/gemm_benchmark_single.cpp ${instance_header} diff --git a/tile_engine/ops/gemm_multi_d/CMakeLists.txt b/tile_engine/ops/gemm_multi_d/CMakeLists.txt index 8d9c087e24..1051c98aec 100644 --- a/tile_engine/ops/gemm_multi_d/CMakeLists.txt +++ b/tile_engine/ops/gemm_multi_d/CMakeLists.txt @@ -67,6 +67,7 @@ function(create_individual_gemm_multi_d_target datatype layout trait tile_config # Create the executable add_executable(${target_name} + # to save build time, exclude the target from "all" target of "gemm_multi_d" directory and its ancestors EXCLUDE_FROM_ALL ${GEMM_MULTI_D_SOURCE_DIR}/gemm_multi_d_benchmark_single.cpp ${instance_header} diff --git a/tile_engine/ops/gemm_preshuffle/CMakeLists.txt b/tile_engine/ops/gemm_preshuffle/CMakeLists.txt index e3bee6ff52..f714b247dd 100644 --- a/tile_engine/ops/gemm_preshuffle/CMakeLists.txt +++ b/tile_engine/ops/gemm_preshuffle/CMakeLists.txt @@ -64,6 +64,7 @@ function(create_individual_gemm_preshuffle_target datatype layout trait tile_con # Create the executable add_executable(${target_name} + # to save build time, exclude the target from "all" target of "gemm_preshuffle" directory and its ancestors EXCLUDE_FROM_ALL ${GEMM_PRESHUFFLE_SOURCE_DIR}/gemm_preshuffle_benchmark_single.cpp ${instance_header} diff --git a/tutorial/ck_tile/00_copy_kernel/CMakeLists.txt b/tutorial/ck_tile/00_copy_kernel/CMakeLists.txt index 91dd036eff..ae1cf43216 100644 --- a/tutorial/ck_tile/00_copy_kernel/CMakeLists.txt +++ b/tutorial/ck_tile/00_copy_kernel/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(tile_tutorial_copy_kernel EXCLUDE_FROM_ALL copy_basic.cpp) +add_executable(tile_tutorial_copy_kernel copy_basic.cpp) # Impact: This flag ensures that the compiler doesn't make # assumptions about memory aliasing that could interfere with Composable Kernel's explicit memory access patterns. diff --git a/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt b/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt index e16977921a..647392fab2 100644 --- a/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt +++ b/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(tile_tutorial_naive_gemm EXCLUDE_FROM_ALL practice_gemm.cpp) +add_executable(tile_tutorial_naive_gemm practice_gemm.cpp) target_compile_options(tile_tutorial_naive_gemm PRIVATE -mllvm -enable-noalias-to-md-conversion=0 diff --git a/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp index 9bb1961cce..dd72f08d99 100644 --- a/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp +++ b/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp @@ -28,9 +28,9 @@ struct PracticeGemmHostPipeline { // Size of the entire problem - const auto M = a_dram.get_tensor_descriptor().get_length(number<0>{}); // M x K - const auto N = c_dram.get_tensor_descriptor().get_length(number<1>{}); // M x N - const auto K = a_dram.get_tensor_descriptor().get_length(number<1>{}); // M x K + const auto M = a_dram.get_tensor_descriptor().get_length(number<0>{}); // M x K + const auto N = c_dram_ref.get_tensor_descriptor().get_length(number<1>{}); // M x N + const auto K = a_dram.get_tensor_descriptor().get_length(number<1>{}); // M x K // Size of the block tile const auto MPerBlock = BlockTile::at(number<0>{}); @@ -83,7 +83,7 @@ struct PracticeGemmHostPipeline __shared__ char p_smem_char[block_gemm_pipeline.GetStaticLDSSize()]; const auto c_block_tile = block_gemm_pipeline(a_block_window, b_block_window, num_loops_k, p_smem_char); - auto c_window = make_tile_window(c_dram, + auto c_window = make_tile_window(c_dram_ref, make_tuple(number{}, number{}), {tile_origin_m, tile_origin_n}); store_tile(c_window, c_block_tile);