mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
Merge commit '79aae7c7f71404bdb80d6db52bc6401e0e221d42' into develop
This commit is contained in:
132
Jenkinsfile
vendored
132
Jenkinsfile
vendored
@@ -72,6 +72,129 @@ def sendFailureNotifications() {
|
||||
}
|
||||
}
|
||||
|
||||
def generateAndArchiveBuildTraceVisualization() {
|
||||
try {
|
||||
def buildTraceFileName = "ck_build_trace.json";
|
||||
|
||||
// Attempt to download the build trace file to check if it exists
|
||||
def traceFileExists = false
|
||||
try {
|
||||
copyArtifacts(
|
||||
projectName: env.JOB_NAME,
|
||||
selector: specific(env.BUILD_NUMBER),
|
||||
filter: buildTraceFileName
|
||||
)
|
||||
traceFileExists = fileExists(buildTraceFileName)
|
||||
} catch (Exception e) {
|
||||
echo "Could not copy artifacts: ${e.getMessage()}"
|
||||
traceFileExists = false
|
||||
}
|
||||
|
||||
sh """
|
||||
echo "post download:"
|
||||
ls -la
|
||||
"""
|
||||
|
||||
if (traceFileExists) {
|
||||
// Move the build trace file to a temporary location to preserve it during checkout
|
||||
sh """
|
||||
mkdir -p /tmp/jenkins_artifacts
|
||||
cp ${buildTraceFileName} /tmp/jenkins_artifacts/${buildTraceFileName}
|
||||
ls -la /tmp/jenkins_artifacts/
|
||||
"""
|
||||
} else {
|
||||
echo "Build trace archive not found"
|
||||
return
|
||||
}
|
||||
|
||||
// Checkout source code to get required files
|
||||
checkout scm
|
||||
|
||||
// Restore the build trace file after checkout
|
||||
sh """
|
||||
ls -la
|
||||
cp /tmp/jenkins_artifacts/${buildTraceFileName} ${buildTraceFileName}
|
||||
ls -la ${buildTraceFileName}
|
||||
"""
|
||||
|
||||
// Pull image
|
||||
def image = "ghcr.io/puppeteer/puppeteer:24.30.0"
|
||||
echo "Pulling image: ${image}"
|
||||
def retimage = docker.image("${image}")
|
||||
retimage.pull()
|
||||
|
||||
// Create a temporary workspace
|
||||
sh """#!/bin/bash
|
||||
ls -la
|
||||
mkdir -p workspace
|
||||
cp ./script/infra_helper/capture_build_trace.js ./workspace
|
||||
cp ${buildTraceFileName} ./workspace/${buildTraceFileName}
|
||||
chmod 777 ./workspace
|
||||
ls -la ./workspace
|
||||
"""
|
||||
|
||||
// Run container to get snapshot
|
||||
def dockerOpts = "--cap-add=SYS_ADMIN -v \"\$(pwd)/workspace:/workspace\" -e NODE_PATH=/home/pptruser/node_modules"
|
||||
// Create unique image name by sanitizing job name
|
||||
def sanitizedJobName = env.JOB_NAME.replaceAll(/[\/\\:*?"<>| ]/, '_')
|
||||
def imageName = "perfetto_snapshot_${sanitizedJobName}_build_${env.BUILD_NUMBER}.png"
|
||||
sh """
|
||||
docker run --rm ${dockerOpts} ${image} node /workspace/capture_build_trace.js
|
||||
mv ./workspace/perfetto_snapshot_build.png ./workspace/${imageName}
|
||||
"""
|
||||
|
||||
// Archive the snapshot
|
||||
sh """
|
||||
mv ./workspace/${imageName} ${imageName}
|
||||
"""
|
||||
archiveArtifacts "${imageName}"
|
||||
|
||||
// Notify the channel
|
||||
withCredentials([string(credentialsId: 'ck_ci_build_perf_webhook_url', variable: 'WEBHOOK_URL')]) {
|
||||
sh '''
|
||||
# Create build trace filename with build number based on the original filename
|
||||
BUILD_TRACE_WITH_NUMBER=$(echo "''' + buildTraceFileName + '''" | sed 's/.json/_''' + sanitizedJobName + '''_''' + env.BUILD_NUMBER + '''.json/')
|
||||
|
||||
# Convert image to base64
|
||||
echo "Converting image to base64..."
|
||||
IMAGE_BASE64=$(base64 -w 0 ''' + imageName + ''')
|
||||
echo "Image base64 length: ${#IMAGE_BASE64}"
|
||||
|
||||
# Convert build trace to base64
|
||||
echo "Converting build trace to base64..."
|
||||
BUILD_TRACE_BASE64=$(base64 -w 0 ''' + buildTraceFileName + ''')
|
||||
echo "Build trace base64 length: ${#BUILD_TRACE_BASE64}"
|
||||
|
||||
# Create JSON payload with base64 data
|
||||
echo "Creating JSON payload..."
|
||||
{
|
||||
printf '{\n'
|
||||
printf ' "jobName": "%s",\n' "''' + env.JOB_NAME + '''"
|
||||
printf ' "buildNumber": "%s",\n' "''' + env.BUILD_NUMBER + '''"
|
||||
printf ' "jobUrl": "%s",\n' "''' + env.RUN_DISPLAY_URL + '''"
|
||||
printf ' "imageName": "%s",\n' "''' + imageName + '''"
|
||||
printf ' "imageData": "%s",\n' "$IMAGE_BASE64"
|
||||
printf ' "buildTraceName": "%s",\n' "$BUILD_TRACE_WITH_NUMBER"
|
||||
printf ' "buildTraceData": "%s"\n' "$BUILD_TRACE_BASE64"
|
||||
printf '}\n'
|
||||
} > webhook_payload.json
|
||||
|
||||
echo "JSON payload created, size: $(wc -c < webhook_payload.json) bytes"
|
||||
|
||||
curl -X POST "${WEBHOOK_URL}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @webhook_payload.json
|
||||
|
||||
# Clean up temporary file
|
||||
rm -f webhook_payload.json
|
||||
'''
|
||||
}
|
||||
} catch (Exception e) {
|
||||
echo "Throwing error exception while generating build trace visualization"
|
||||
echo 'Exception occurred: ' + e.toString()
|
||||
}
|
||||
}
|
||||
|
||||
class Version {
|
||||
int major, minor, patch
|
||||
@Override
|
||||
@@ -1750,6 +1873,15 @@ pipeline {
|
||||
}
|
||||
}
|
||||
post {
|
||||
always {
|
||||
node(rocmnode("nogpu")) {
|
||||
script {
|
||||
// Simulate capture
|
||||
generateAndArchiveBuildTraceVisualization()
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
success {
|
||||
script {
|
||||
// Report the parent stage build ck and run tests status
|
||||
|
||||
@@ -109,6 +109,7 @@ set(FMHA_FWD_INSTANCES "tile_fmha_fwd_instances")
|
||||
set(FMHA_BWD_INSTANCES "tile_fmha_bwd_instances")
|
||||
|
||||
message(DEBUG "adding instances ${FMHA_FWD_INSTANCES}")
|
||||
# to save build time, exclude the target from "all" target of "01_fmha" directory and its ancestors
|
||||
add_library(${FMHA_FWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL)
|
||||
target_include_directories(${FMHA_FWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${FMHA_FWD_INSTANCES} PRIVATE ${FMHA_FWD_GEN_BLOBS})
|
||||
|
||||
@@ -26,7 +26,7 @@ add_custom_command(
|
||||
set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
|
||||
|
||||
message(DEBUG "adding example ${EXAMPLE_LAYERNORM2D_FWD}")
|
||||
add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
|
||||
add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp)
|
||||
target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
|
||||
|
||||
|
||||
@@ -1,20 +1,22 @@
|
||||
add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
|
||||
add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
|
||||
add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp)
|
||||
add_executable(tile_example_gemm_reduce EXCLUDE_FROM_ALL gemm_splitk_two_stage_reduce.cpp)
|
||||
add_executable(tile_example_gemm_splitk_two_stage EXCLUDE_FROM_ALL gemm_splitk_two_stage.cpp)
|
||||
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
|
||||
set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
|
||||
add_executable(tile_example_gemm_basic gemm_basic.cpp)
|
||||
add_executable(tile_example_gemm_universal universal_gemm.cpp)
|
||||
add_executable(tile_example_gemm_weight_preshuffle gemm_weight_preshuffle.cpp)
|
||||
add_executable(tile_example_gemm_reduce gemm_splitk_two_stage_reduce.cpp)
|
||||
add_executable(tile_example_gemm_splitk_two_stage gemm_splitk_two_stage.cpp)
|
||||
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
|
||||
set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
endif()
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
|
||||
target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
endif()
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
|
||||
list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
|
||||
target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
|
||||
@@ -683,7 +683,7 @@ int run_gemm_example_with_layouts_two_stage(ck_tile::ArgParser& arg_parser,
|
||||
|
||||
if constexpr(preshuffle)
|
||||
{
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n);
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host = ck_tile::shuffle_b<GemmConfig>(b_k_n);
|
||||
// shuffled buffer B for device implementation
|
||||
b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
|
||||
}
|
||||
|
||||
@@ -284,12 +284,12 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
|
||||
if constexpr(GemmConfig::TiledMMAPermuteN)
|
||||
{
|
||||
std::cout << "Run with PermuteN" << std::endl;
|
||||
return shuffle_b_permuteN<GemmConfig>(b_k_n);
|
||||
return ck_tile::shuffle_b_permuteN<GemmConfig>(b_k_n);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Run without PermuteN" << std::endl;
|
||||
return shuffle_b<GemmConfig>(b_k_n);
|
||||
return ck_tile::shuffle_b<GemmConfig>(b_k_n);
|
||||
}
|
||||
}();
|
||||
// shuffled buffer B for device implementation
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# not using add_example_executable() to add this target, since we don't want this to have
|
||||
# to be included in "make all/install/check"
|
||||
add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp)
|
||||
add_executable(tile_example_img2col image_to_column.cpp)
|
||||
|
||||
@@ -3,7 +3,7 @@ set(EXAMPLE_REDUCE "tile_example_reduce")
|
||||
# to be included in "make all/install/check"
|
||||
message(DEBUG "adding example ${EXAMPLE_REDUCE}")
|
||||
|
||||
add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp)
|
||||
add_executable(${EXAMPLE_REDUCE} reduce.cpp)
|
||||
target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
set(EXAMPLE_REDUCE_COMPILE_OPTIONS)
|
||||
|
||||
@@ -16,4 +16,4 @@ target_compile_options(${EXAMPLE_REDUCE} PRIVATE ${EXAMPLE_REDUCE_COMPILE_OPTION
|
||||
# by cmake will print too many files, execvp: /bin/sh: Argument list too long
|
||||
# however, this property may affect global
|
||||
# TODO: consider codegen a makefile by us
|
||||
set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
|
||||
set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# not using add_example_executable() to add this target, since we don't want this to have
|
||||
# to be included in "make all/install/check"
|
||||
add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp)
|
||||
add_executable(tile_example_permute permute.cpp)
|
||||
|
||||
if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
|
||||
# set(PERMUTE_USE_ALTERNATIVE_IMPL false)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp)
|
||||
add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp)
|
||||
target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS)
|
||||
|
||||
@@ -26,7 +26,7 @@ add_custom_command(
|
||||
set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")
|
||||
|
||||
message(DEBUG "adding ${TILE_RMSNORM2D_FWD}")
|
||||
add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp)
|
||||
add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp)
|
||||
target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS})
|
||||
|
||||
@@ -38,7 +38,7 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno
|
||||
target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
|
||||
|
||||
set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd")
|
||||
add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp)
|
||||
add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp)
|
||||
target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
|
||||
|
||||
# TODO: we have to turn off this global prop, otherwise the progress bar generated
|
||||
|
||||
@@ -3,7 +3,7 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd")
|
||||
# to be included in "make all/install/check"
|
||||
message(DEBUG "adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}")
|
||||
file(GLOB INSTANCE_SRCS instances/*.cpp)
|
||||
add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp)
|
||||
add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp)
|
||||
target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS})
|
||||
|
||||
@@ -15,7 +15,7 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t
|
||||
target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})
|
||||
|
||||
set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd")
|
||||
add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp)
|
||||
add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp)
|
||||
target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})
|
||||
|
||||
# TODO: we have to turn off this global prop, otherwise the progress bar generated
|
||||
|
||||
@@ -2,7 +2,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC)
|
||||
message(DEBUG "adding ${TARGET_NAME}")
|
||||
# not using add_example_executable() to add target, since we don't want this to have
|
||||
# to be included in "make all/install/check"
|
||||
add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
|
||||
add_executable(${TARGET_NAME} ${MAIN_SRC})
|
||||
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
foreach(source IN LISTS ARGN)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp)
|
||||
add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp)
|
||||
target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)
|
||||
|
||||
@@ -2,7 +2,7 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
|
||||
message(DEBUG "adding ${TARGET_NAME}")
|
||||
# not using add_example_executable() to add target, since we don't want this to have
|
||||
# to be included in "make all/install/check"
|
||||
add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
|
||||
add_executable(${TARGET_NAME} ${MAIN_SRC})
|
||||
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
foreach(source IN LISTS ARGN)
|
||||
@@ -22,4 +22,3 @@ endfunction(add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
|
||||
file(GLOB INSTANCE_SRCS instances/*.cpp)
|
||||
|
||||
add_moe_smoothquant_example(tile_example_moe_smoothquant moe_smoothquant.cpp ${INSTANCE_SRCS})
|
||||
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe")
|
||||
# not using add_example_executable() to add this target, since we don't want this to have
|
||||
# to be included in "make all/install/check"
|
||||
message(DEBUG "adding ${TILE_EXAPMLE_FUSED_MOE}")
|
||||
file(GLOB INSTANCE_SRCS instances/*.cpp)
|
||||
add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp)
|
||||
target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS})
|
||||
if(GPU_TARGETS MATCHES "gfx94|gfx95")
|
||||
set(TILE_EXAMPLE_FUSED_MOE "tile_example_fused_moe")
|
||||
message(DEBUG "adding ${TILE_EXAMPLE_FUSED_MOE}")
|
||||
file(GLOB INSTANCE_SRCS instances/*.cpp)
|
||||
add_executable(${TILE_EXAMPLE_FUSED_MOE} main.cpp)
|
||||
target_include_directories(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_sources(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS})
|
||||
|
||||
set(TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS)
|
||||
set(TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS)
|
||||
|
||||
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
|
||||
list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
|
||||
list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a
|
||||
list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta
|
||||
# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -mllvm -greedy-reverse-local-assignment=1)
|
||||
# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
|
||||
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
|
||||
list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
|
||||
list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a
|
||||
list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta
|
||||
# list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -mllvm -greedy-reverse-local-assignment=1)
|
||||
# list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
|
||||
|
||||
target_compile_options(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS})
|
||||
target_compile_options(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS})
|
||||
endif()
|
||||
|
||||
@@ -402,22 +402,6 @@ float fused_moesorting_mp(fused_moesorting_trait t,
|
||||
using ms_index_t = ck_tile::index_t;
|
||||
using ms_weight_type = float;
|
||||
|
||||
auto maybe_clear_workspace = [=](const ck_tile::stream_config& s_) {
|
||||
if(t.clear_workspace_inside_api)
|
||||
{
|
||||
if(is_local_token)
|
||||
{
|
||||
auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(true, 1024, 1);
|
||||
k(s_);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(false, 1024, 1);
|
||||
k(s_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if(a.tokens < 2048)
|
||||
{
|
||||
if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) >
|
||||
|
||||
@@ -1 +1 @@
|
||||
add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp)
|
||||
add_executable(tile_example_batched_gemm batched_gemm.cpp)
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
|
||||
add_executable(tile_example_quant_grouped_gemm EXCLUDE_FROM_ALL quant_grouped_gemm.cpp)
|
||||
add_executable(tile_example_grouped_gemm_preshuffle EXCLUDE_FROM_ALL grouped_gemm_preshuffle.cpp)
|
||||
add_executable(tile_example_grouped_gemm_multi_d EXCLUDE_FROM_ALL grouped_gemm_multi_d.cpp)
|
||||
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
|
||||
add_executable(tile_example_grouped_gemm grouped_gemm.cpp)
|
||||
add_executable(tile_example_quant_grouped_gemm quant_grouped_gemm.cpp)
|
||||
add_executable(tile_example_grouped_gemm_preshuffle grouped_gemm_preshuffle.cpp)
|
||||
add_executable(tile_example_grouped_gemm_multi_d grouped_gemm_multi_d.cpp)
|
||||
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
endif()
|
||||
target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
endif()
|
||||
target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
@@ -233,7 +233,8 @@ int run_grouped_gemm_example_with_layouts(int argc,
|
||||
// Perform preshuffle for B tensor
|
||||
if constexpr(GemmConfig::Preshuffle)
|
||||
{
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n_tensors[i]);
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host =
|
||||
ck_tile::shuffle_b<GemmConfig>(b_k_n_tensors[i]);
|
||||
b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(b_shuffle_host));
|
||||
}
|
||||
else
|
||||
|
||||
@@ -9,18 +9,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
endforeach()
|
||||
|
||||
if(has_supported_gpu)
|
||||
add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
|
||||
add_executable(tile_example_mixed_prec_flatmm EXCLUDE_FROM_ALL mixed_prec/mixed_prec_flatmm.cpp)
|
||||
add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp)
|
||||
add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp)
|
||||
add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp)
|
||||
|
||||
include(mxgemm/mx_flatmm_instance.cmake)
|
||||
mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
|
||||
message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
|
||||
add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
|
||||
target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
|
||||
|
||||
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
|
||||
# ... because they are auto-generated
|
||||
set(EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template)
|
||||
@@ -30,11 +18,28 @@ if(has_supported_gpu)
|
||||
list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
endif()
|
||||
|
||||
add_executable(tile_example_flatmm_basic flatmm_basic.cpp)
|
||||
target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) # TODO: 950 only
|
||||
endif()
|
||||
|
||||
add_executable(tile_example_moe_flatmm moe_flatmm.cpp)
|
||||
target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_grouped_flatmm grouped_flatmm.cpp)
|
||||
target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
|
||||
if (GPU_TARGETS MATCHES "gfx95")
|
||||
add_executable(tile_example_mixed_prec_flatmm mixed_prec/mixed_prec_flatmm.cpp)
|
||||
target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_a16w4_moe_flatmm mixed_prec/a16w4_moe_flatmm.cpp)
|
||||
target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
|
||||
include(mxgemm/mx_flatmm_instance.cmake)
|
||||
mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
|
||||
message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
|
||||
|
||||
add_executable(tile_example_mx_flatmm mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
|
||||
target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
|
||||
target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -215,7 +215,7 @@ int run_contiguous_grouped_flatmm_example_with_layouts(
|
||||
assert(N % N_Warp_Tile == 0 &&
|
||||
"N must be divisible by N_Warp_Tile for contiguous grouped gemm");
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host =
|
||||
shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
|
||||
ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
|
||||
|
||||
std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
|
||||
std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
|
||||
@@ -431,7 +431,7 @@ int run_masked_grouped_flatmm_example_with_layouts(
|
||||
assert(N % N_Warp_Tile == 0 &&
|
||||
"N must be divisible by N_Warp_Tile for contiguous grouped gemm");
|
||||
ck_tile::HostTensor<BDataType> b_shuffle_host =
|
||||
shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
|
||||
ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
|
||||
|
||||
std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
|
||||
std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
|
||||
|
||||
@@ -302,10 +302,6 @@ int run_moe_gemm_example_with_layouts(int argc,
|
||||
static_cast<float*>(per_token_scale_dev_buf.GetDeviceBuffer()),
|
||||
static_cast<float*>(per_channel_scale_dev_buf.GetDeviceBuffer()));
|
||||
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
|
||||
const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
|
||||
K, 1 /*kbatch*/, max_accumulated_value);
|
||||
c_m_n_ref_buf->FromDevice(c_m_n_host_ref.data());
|
||||
|
||||
const float rtol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_executable(tile_example_gemm_multi_d_fp16 EXCLUDE_FROM_ALL gemm_multi_d_fp16.cpp)
|
||||
add_executable(tile_example_gemm_multi_d_fp16 gemm_multi_d_fp16.cpp)
|
||||
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
|
||||
@@ -1,20 +1,22 @@
|
||||
set(EXAMPLE_CONV_COMPILE_OPTIONS)
|
||||
list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
|
||||
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
|
||||
set(EXAMPLE_CONV_COMPILE_OPTIONS)
|
||||
list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
|
||||
|
||||
add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
add_executable(tile_example_grouped_conv_fwd grouped_convolution_forward.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_grouped_conv_fwd_large_tensor EXCLUDE_FROM_ALL grouped_convolution_forward_large_tensor.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
add_executable(tile_example_grouped_conv_fwd_large_tensor grouped_convolution_forward_large_tensor.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_grouped_conv_fwd_bias_clamp EXCLUDE_FROM_ALL grouped_convolution_forward_bias_clamp.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
add_executable(tile_example_grouped_conv_fwd_bias_clamp grouped_convolution_forward_bias_clamp.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
add_executable(tile_example_grouped_conv_bwd_weight grouped_convolution_backward_weight.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_grouped_conv_bwd_weight_two_stage EXCLUDE_FROM_ALL grouped_convolution_backward_weight_two_stage.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
add_executable(tile_example_grouped_conv_bwd_weight_two_stage grouped_convolution_backward_weight_two_stage.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
|
||||
add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
add_executable(tile_example_grouped_conv_bwd_data grouped_convolution_backward_data.cpp)
|
||||
target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
|
||||
endif()
|
||||
|
||||
@@ -101,7 +101,6 @@ struct GroupedConvolutionForwardInvoker
|
||||
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
|
||||
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
|
||||
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
|
||||
float ave_time{0};
|
||||
|
||||
using TransformType =
|
||||
ck_tile::TransformConvFwdToGemm<NDimSpatial,
|
||||
@@ -242,13 +241,15 @@ struct GroupedConvolutionForwardInvoker
|
||||
// =====================================================================
|
||||
// Kernel launch lambda: Uses EnableSplitImage based on layout support
|
||||
// =====================================================================
|
||||
const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
|
||||
const auto tail_number_,
|
||||
const auto memory_operation_) {
|
||||
const auto Run = [&](const auto has_hot_loop_,
|
||||
const auto tail_number_,
|
||||
const auto memory_operation_,
|
||||
const auto enable_split_image_) {
|
||||
constexpr bool has_hot_loop_v = has_hot_loop_.value;
|
||||
constexpr auto tail_number_v = tail_number_.value;
|
||||
constexpr auto scheduler = ConvConfig::Scheduler;
|
||||
constexpr auto memory_operation = memory_operation_.value;
|
||||
constexpr bool EnableSplitImage = enable_split_image_.value;
|
||||
|
||||
using GroupedConvTraitsType = std::conditional_t<EnableSplitImage,
|
||||
GroupedConvTraitsTypeLargeTensor,
|
||||
@@ -357,11 +358,9 @@ struct GroupedConvolutionForwardInvoker
|
||||
<< ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
|
||||
}
|
||||
|
||||
ave_time = ck_tile::launch_kernel(
|
||||
return ck_tile::launch_kernel(
|
||||
s,
|
||||
ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
|
||||
|
||||
return ave_time;
|
||||
};
|
||||
|
||||
// =====================================================================
|
||||
@@ -369,28 +368,33 @@ struct GroupedConvolutionForwardInvoker
|
||||
// =====================================================================
|
||||
if(use_split_image)
|
||||
{
|
||||
// Use split-image kernel (Kernel<true>)
|
||||
const auto RunSplitImage = [&](const auto has_hot_loop_, const auto tail_number_) {
|
||||
if(args.k_batch == 1)
|
||||
Run.template operator()<true>(has_hot_loop_, tail_number_, MemoryOpSet{});
|
||||
return Run(
|
||||
has_hot_loop_, tail_number_, MemoryOpSet{}, ck_tile::bool_constant<true>{});
|
||||
else
|
||||
Run.template operator()<true>(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
|
||||
return Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
MemoryOpAtomicAdd{},
|
||||
ck_tile::bool_constant<true>{});
|
||||
};
|
||||
BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num);
|
||||
return BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use regular kernel (Kernel<false>)
|
||||
const auto RunRegular = [&](const auto has_hot_loop_, const auto tail_number_) {
|
||||
if(args.k_batch == 1)
|
||||
Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpSet{});
|
||||
return Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
MemoryOpSet{},
|
||||
ck_tile::bool_constant<false>{});
|
||||
else
|
||||
Run.template operator()<false>(
|
||||
has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
|
||||
return Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
MemoryOpAtomicAdd{},
|
||||
ck_tile::bool_constant<false>{});
|
||||
};
|
||||
BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num);
|
||||
return BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num);
|
||||
}
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1 +1 @@
|
||||
add_executable(tile_example_gemm_multi_abd_fp16 EXCLUDE_FROM_ALL gemm_multi_abd_fp16.cpp)
|
||||
add_executable(tile_example_gemm_multi_abd_fp16 gemm_multi_abd_fp16.cpp)
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
set(TARGET_NAME tile_example_batched_transpose)
|
||||
add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp)
|
||||
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
|
||||
list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
|
||||
# list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
|
||||
target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS})
|
||||
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
|
||||
set(TARGET_NAME tile_example_batched_transpose)
|
||||
add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp)
|
||||
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
|
||||
list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
|
||||
# list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
|
||||
target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS})
|
||||
endif()
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
set(EXAMPLE_POOL_3D "tile_example_pool3d")
|
||||
message(DEBUG "adding example ${EXAMPLE_POOL_3D}")
|
||||
|
||||
add_executable(${EXAMPLE_POOL_3D} EXCLUDE_FROM_ALL pool3d.cpp)
|
||||
add_executable(${EXAMPLE_POOL_3D} pool3d.cpp)
|
||||
target_include_directories(${EXAMPLE_POOL_3D} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
|
||||
target_compile_options(${EXAMPLE_POOL_3D} PRIVATE ${EXAMPLE_POOL_COMPILE_OPTIONS})
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion
|
||||
|
||||
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
|
||||
set(EXE_NAME tile_example_gemm_quant)
|
||||
add_executable(${EXE_NAME} EXCLUDE_FROM_ALL
|
||||
add_executable(${EXE_NAME}
|
||||
gemm_quant.cpp
|
||||
gemm_aquant_quantgrouped.cpp
|
||||
gemm_aquant_quantgrouped_preshufflequant.cpp
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_executable(tile_example_streamk_gemm_basic EXCLUDE_FROM_ALL streamk_gemm_basic.cpp)
|
||||
add_executable(tile_example_streamk_gemm_basic streamk_gemm_basic.cpp)
|
||||
set(EXAMPLE_GEMM_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_executable(tile_example_batched_contraction EXCLUDE_FROM_ALL batched_contraction.cpp)
|
||||
add_executable(tile_example_batched_contraction batched_contraction.cpp)
|
||||
set(EXAMPLE_CONTRACTION_COMPILE_OPTIONS)
|
||||
if(CK_USE_OCP_FP8)
|
||||
list(APPEND EXAMPLE_CONTRACTION_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
|
||||
@@ -316,20 +316,20 @@ int run_batched_contraction_example_with_layouts(
|
||||
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
|
||||
calculate_reference_flat_indexing<ADataType,
|
||||
BDataType,
|
||||
DDataType,
|
||||
EDataType,
|
||||
AccDataType,
|
||||
CDEElementWise>(a_full_dims_host,
|
||||
b_full_dims_host,
|
||||
ds_full_dims_host,
|
||||
e_full_dims_host_ref,
|
||||
G_total,
|
||||
M_total,
|
||||
N_total,
|
||||
K_total,
|
||||
CDEElementWise{});
|
||||
ck_tile::calculate_reference_flat_indexing<ADataType,
|
||||
BDataType,
|
||||
DDataType,
|
||||
EDataType,
|
||||
AccDataType,
|
||||
CDEElementWise>(a_full_dims_host,
|
||||
b_full_dims_host,
|
||||
ds_full_dims_host,
|
||||
e_full_dims_host_ref,
|
||||
G_total,
|
||||
M_total,
|
||||
N_total,
|
||||
K_total,
|
||||
CDEElementWise{});
|
||||
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration =
|
||||
|
||||
@@ -436,12 +436,14 @@ struct PoolKernel
|
||||
// Main reduction loop - with index tracking
|
||||
for(int k_tile = amd_wave_read_first_lane(0); k_tile < num_k_tiles; ++k_tile)
|
||||
{
|
||||
const auto x_tile = load_tile(x_window);
|
||||
const auto x_tile = load_tile(x_window);
|
||||
const auto& in_tensor_padded_ref =
|
||||
in_tensor_padded; // structured bindings cannot be captured prior to cpp20
|
||||
auto index_calculator = [&](const auto& x_indices) {
|
||||
// Get global coordinates in the 2D matrix space (M, N)
|
||||
const auto global_M = x_indices.at(number<0>{}) + iM;
|
||||
const auto global_N = (k_tile * S::Block_N) + x_indices.at(number<1>{});
|
||||
return in_tensor_padded.get_tensor_descriptor().calculate_offset(
|
||||
return in_tensor_padded_ref.get_tensor_descriptor().calculate_offset(
|
||||
make_tuple(global_M, global_N));
|
||||
};
|
||||
|
||||
|
||||
53
script/infra_helper/capture_build_trace.js
Normal file
53
script/infra_helper/capture_build_trace.js
Normal file
@@ -0,0 +1,53 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
|
||||
(async () => {
|
||||
try {
|
||||
// Launch the browser
|
||||
const browser = await puppeteer.launch({
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--headless',
|
||||
'--disable-gpu',
|
||||
'--window-size=1920x1080'
|
||||
]});
|
||||
const page = await browser.newPage();
|
||||
await page.setViewport({ width: 1920, height: 1080 });
|
||||
await page.goto('https://ui.perfetto.dev');
|
||||
// Wait for the home page to be visible
|
||||
console.log('Waiting for page to load...');
|
||||
await page.waitForSelector('.pf-home-page', { visible: true, timeout: 30000 });
|
||||
// Locate and click the Open trace button
|
||||
const elements = await page.$$('li');
|
||||
let element = null;
|
||||
for (const el of elements) {
|
||||
const text = await el.evaluate(node => node.textContent);
|
||||
if (text && text.includes('Open trace file')) {
|
||||
element = el;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (element) {
|
||||
const [fileChooser] = await Promise.all([
|
||||
page.waitForFileChooser(),
|
||||
element.click()
|
||||
]);
|
||||
await fileChooser.accept(['/workspace/ck_build_trace.json']);
|
||||
} else {
|
||||
throw new Error('Element not found');
|
||||
}
|
||||
console.log('Waiting for data to load...');
|
||||
// Wait for the timeline element to be visible
|
||||
await page.waitForSelector('.pf-track', { timeout: 30000 });
|
||||
// Wait for the data to finish loading
|
||||
await page.waitForFunction(() => {
|
||||
return !document.body.textContent.includes('Loading...');
|
||||
}, { timeout: 30000 });
|
||||
console.log('Capturing screenshot...');
|
||||
await page.screenshot({path: '/workspace/perfetto_snapshot_build.png'});
|
||||
console.log('Done capturing screenshot...');
|
||||
await browser.close();
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
}
|
||||
})();
|
||||
@@ -64,6 +64,7 @@ function(create_individual_gemm_target datatype layout trait tile_config config_
|
||||
|
||||
# Create the executable
|
||||
add_executable(${target_name}
|
||||
# to save build time, exclude the target from "all" target of "gemm" directory and its ancestors
|
||||
EXCLUDE_FROM_ALL
|
||||
${GEMM_SOURCE_DIR}/gemm_benchmark_single.cpp
|
||||
${instance_header}
|
||||
|
||||
@@ -67,6 +67,7 @@ function(create_individual_gemm_multi_d_target datatype layout trait tile_config
|
||||
|
||||
# Create the executable
|
||||
add_executable(${target_name}
|
||||
# to save build time, exclude the target from "all" target of "gemm_multi_d" directory and its ancestors
|
||||
EXCLUDE_FROM_ALL
|
||||
${GEMM_MULTI_D_SOURCE_DIR}/gemm_multi_d_benchmark_single.cpp
|
||||
${instance_header}
|
||||
|
||||
@@ -64,6 +64,7 @@ function(create_individual_gemm_preshuffle_target datatype layout trait tile_con
|
||||
|
||||
# Create the executable
|
||||
add_executable(${target_name}
|
||||
# to save build time, exclude the target from "all" target of "gemm_preshuffle" directory and its ancestors
|
||||
EXCLUDE_FROM_ALL
|
||||
${GEMM_PRESHUFFLE_SOURCE_DIR}/gemm_preshuffle_benchmark_single.cpp
|
||||
${instance_header}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_executable(tile_tutorial_copy_kernel EXCLUDE_FROM_ALL copy_basic.cpp)
|
||||
add_executable(tile_tutorial_copy_kernel copy_basic.cpp)
|
||||
|
||||
# Impact: This flag ensures that the compiler doesn't make
|
||||
# assumptions about memory aliasing that could interfere with Composable Kernel's explicit memory access patterns.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
add_executable(tile_tutorial_naive_gemm EXCLUDE_FROM_ALL practice_gemm.cpp)
|
||||
add_executable(tile_tutorial_naive_gemm practice_gemm.cpp)
|
||||
|
||||
target_compile_options(tile_tutorial_naive_gemm PRIVATE
|
||||
-mllvm -enable-noalias-to-md-conversion=0
|
||||
|
||||
@@ -28,9 +28,9 @@ struct PracticeGemmHostPipeline
|
||||
{
|
||||
|
||||
// Size of the entire problem
|
||||
const auto M = a_dram.get_tensor_descriptor().get_length(number<0>{}); // M x K
|
||||
const auto N = c_dram.get_tensor_descriptor().get_length(number<1>{}); // M x N
|
||||
const auto K = a_dram.get_tensor_descriptor().get_length(number<1>{}); // M x K
|
||||
const auto M = a_dram.get_tensor_descriptor().get_length(number<0>{}); // M x K
|
||||
const auto N = c_dram_ref.get_tensor_descriptor().get_length(number<1>{}); // M x N
|
||||
const auto K = a_dram.get_tensor_descriptor().get_length(number<1>{}); // M x K
|
||||
|
||||
// Size of the block tile
|
||||
const auto MPerBlock = BlockTile::at(number<0>{});
|
||||
@@ -83,7 +83,7 @@ struct PracticeGemmHostPipeline
|
||||
__shared__ char p_smem_char[block_gemm_pipeline.GetStaticLDSSize()];
|
||||
const auto c_block_tile =
|
||||
block_gemm_pipeline(a_block_window, b_block_window, num_loops_k, p_smem_char);
|
||||
auto c_window = make_tile_window(c_dram,
|
||||
auto c_window = make_tile_window(c_dram_ref,
|
||||
make_tuple(number<MPerBlock>{}, number<NPerBlock>{}),
|
||||
{tile_origin_m, tile_origin_n});
|
||||
store_tile(c_window, c_block_tile);
|
||||
|
||||
Reference in New Issue
Block a user