diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index f4d0c0063c..86d134e456 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -53,8 +53,8 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit path: "TheRock" + ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit - name: Setup ccache run: | @@ -77,6 +77,8 @@ jobs: - name: Patch rocm-libraries run: | git config --global --add safe.directory '*' + # Remove patches here if they cannot be applied cleanly, and they have not been deleted from TheRock repo + rm -f ./TheRock/patches/amd-mainline/rocm-libraries/0008-Revert-remove-options-no-enumerate-966.patch git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch - name: Install python deps @@ -128,7 +130,7 @@ jobs: run: | python3 TheRock/build_tools/github_actions/post_build_upload.py \ --run-id ${{ github.run_id }} \ - --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \ + --artifact-group ${{ env.AMDGPU_FAMILIES }} \ --build-dir TheRock/build \ --upload diff --git a/.github/workflows/therock-test-component.yml b/.github/workflows/therock-test-component.yml index 1ccc1d57bc..27eff4fdb0 100644 --- a/.github/workflows/therock-test-component.yml +++ b/.github/workflows/therock-test-component.yml @@ -51,13 +51,13 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: "ROCm/TheRock" - ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit + ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' with: ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} - AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + ARTIFACT_GROUP: ${{ inputs.amdgpu_families }} OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} VENV_DIR: ${{ env.VENV_DIR }} FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }} diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml index efb5a6b1a0..81632fce48 100644 --- a/.github/workflows/therock-test-packages.yml +++ b/.github/workflows/therock-test-packages.yml @@ -27,7 +27,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit + ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit - name: "Configuring CI options" env: diff --git a/CHANGELOG.md b/CHANGELOG.md index 213631721f..8fd3c36255 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,10 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/). -## (Unreleased) Composable Kernel for ROCm - -### Added +## Composable Kernel 1.2.0 for ROCm 7.2.0 ### Added +* Added support for bf16 data type to grouped_gemm and grouped_gemm_preshuffle. * Added support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM * Added a compute async pipeline in the CK TILE universal GEMM on gfx950 * Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM. @@ -32,6 +31,16 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added an optional template parameter `Arch` (`gfx9_t`, `gfx12_t` etc.) to `make_kernel` to support linking multiple object files that have the same kernel compiled for different architectures. * FMHA examples and tests can be built for multiple architectures (gfx9, gfx950, gfx12) at the same time. +### Upcoming changes + +* Composable Kernel will be adopting C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment complies with this requirement to facilitate a seamless transition. + +## Composable Kernel 1.1.0 for ROCm 7.1.1 + +### Upcoming changes + +* Composable Kernel will be adopting C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment complies with this requirement to facilitate a seamless transition. + ## Composable Kernel 1.1.0 for ROCm 7.1.0 ### Added diff --git a/CMakeLists.txt b/CMakeLists.txt index 049da5637f..45db703b82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,7 +122,7 @@ add_compile_options( # Recent change in compiler makes this warning ON by default, which led to compile errors. add_compile_options(-Wno-nrvo) -if(NOT DISABLE_DL_KERNELS) +if(NOT DISABLE_DL_KERNELS AND GPU_TARGETS MATCHES "gfx101|gfx103|gfx10-1|gfx10-3") add_definitions(-DDL_KERNELS) set(DL_KERNELS "ON") set(CK_ENABLE_DL_KERNELS "ON") @@ -683,6 +683,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY) PACKAGE_NAME examples ) add_subdirectory(example) + + add_subdirectory(tutorial) + rocm_package_setup_component(tutorials + LIBRARY_NAME composablekernel + PACKAGE_NAME tutorials + ) add_subdirectory(tile_engine) if(BUILD_TESTING) add_subdirectory(test) diff --git a/Dockerfile.aiter b/Dockerfile.aiter index dab3f9588d..94591f9012 100644 --- a/Dockerfile.aiter +++ b/Dockerfile.aiter @@ -17,4 +17,6 @@ RUN pip install pandas zmq einops ninja && \ useradd -u 1001 -g 1001 -m -s /bin/bash jenkins && \ chown -R jenkins:jenkins /home/jenkins && \ chmod -R a+rwx /home/jenkins && \ + chown -R jenkins:jenkins /tmp && \ + chmod -R a+rwx /tmp && \ sudo usermod -aG irc jenkins diff --git a/Jenkinsfile b/Jenkinsfile index 80db99684a..cec317af63 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -213,39 +213,6 @@ def check_host() { } } -def build_compiler(){ - def compiler - compiler = "${params.BUILD_COMPILER}" - return compiler -} - -def check_arch(){ - def arch_type = 0 - sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx90a" rocminfo.log') ){ - arch_type = 1 - } - else if ( runShell('grep -n "gfx942" rocminfo.log') ) { - arch_type = 2 - } - else if ( runShell('grep -n "gfx10" rocminfo.log') ) { - arch_type = 3 - } - else if ( runShell('grep -n "gfx11" rocminfo.log') ) { - arch_type = 4 - } - else if ( runShell('grep -n "gfx12" rocminfo.log') ) { - arch_type = 5 - } - else if ( runShell('grep -n "gfx908" rocminfo.log') ) { - arch_type = 6 - } - else if ( runShell('grep -n "gfx950" rocminfo.log') ) { - arch_type = 7 - } - return arch_type -} - def check_arch_name(){ def arch_name = "" sh 'rocminfo | tee rocminfo.log' @@ -261,7 +228,7 @@ def check_arch_name(){ else if ( runShell('grep -n "gfx11" rocminfo.log') ) { arch_name = "gfx11" } - else if ( runShell('grep -n "gfx12" rocminfo.log') ) { + else if ( runShell('grep -n "gfx120" rocminfo.log') ) { arch_name = "gfx12" } else if ( runShell('grep -n "gfx908" rocminfo.log') ) { @@ -274,21 +241,8 @@ def check_arch_name(){ } def getDockerImage(Map conf=[:]){ - env.DOCKER_BUILDKIT=1 - def prefixpath = conf.get("prefixpath", "/opt/rocm") - def no_cache = conf.get("no_cache", false) - def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' " - if(no_cache) - { - dockerArgs = dockerArgs + " --no-cache " - } - echo "Docker Args: ${dockerArgs}" def image - if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){ - image = conf.get("docker_name", "") - echo "Using legacy docker: ${image}" - } - else if ( (params.BUILD_GFX950 || params.RUN_CK_TILE_FMHA_TESTS) && conf.get("docker_name", "") != "" ){ + if ( conf.get("docker_name", "") != "" ){ image = conf.get("docker_name", "") echo "Using special docker: ${image}" } @@ -361,11 +315,51 @@ def buildDocker(install_prefix){ } } +def get_docker_options(){ + def dockerOpts + if ( params.BUILD_INSTANCES_ONLY ){ + dockerOpts = "--network=host --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + } + else{ //only add kfd and dri paths if you actually going to run somthing on GPUs + dockerOpts = "--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + } + if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ + // the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with + // newer clang22 compilers and running with older hip runtima libraries + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 " + } + // on some machines the group ids for video and render groups may not be the same as in the docker image! + def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') + def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') + dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " + echo "Docker flags: ${dockerOpts}" + return dockerOpts +} + +def build_client_examples(String arch){ + def cmd = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="${arch}" \ + -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ + -DCMAKE_HIP_COMPILER="${params.BUILD_COMPILER}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + return cmd +} + +def build_and_run_fmha(String arch){ + def cmd = """ cmake -G Ninja -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="${arch}" \ + -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ + -DCMAKE_HIP_COMPILER="${params.BUILD_COMPILER}" .. && \ + ninja -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \ + cd ../ && + example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" "${arch}" """ + return cmd +} + def cmake_build(Map conf=[:]){ - def compiler = build_compiler() def config_targets = conf.get("config_targets","check") - def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","") def prefixpath = conf.get("prefixpath","/opt/rocm") def setup_args = conf.get("setup_args","") @@ -376,10 +370,8 @@ def cmake_build(Map conf=[:]){ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " } - def build_type_debug = (conf.get("build_type",'release') == 'debug') - //cmake_env can overwrite default CXX variables. - def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") + def cmake_envs = "CXX=${params.BUILD_COMPILER} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") if(conf.get("build_install","") == "true") { @@ -392,15 +384,10 @@ def cmake_build(Map conf=[:]){ setup_args = setup_args + " -DDISABLE_DL_KERNELS=ON " } - if(build_type_debug){ - setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args - }else{ - setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args - } + setup_args = " -DCMAKE_BUILD_TYPE=release " + setup_args def pre_setup_cmd = """ #!/bin/bash - echo \$HSA_ENABLE_SDMA ulimit -c unlimited rm -rf build mkdir build @@ -478,13 +465,12 @@ def cmake_build(Map conf=[:]){ def build_cmd def execute_cmd = conf.get("execute_cmd", "") if(!setup_args.contains("NO_CK_BUILD")){ - def cmake_flags = params.NINJA_FTIME_TRACE ? "-O3 -ftime-trace" : "-O3" if (params.NINJA_BUILD_TRACE) { echo "running ninja build trace" } setup_cmd = conf.get( "setup_cmd", - """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" ${cmake_flags} " .. """ + """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" -O3 " .. """ ) build_cmd = conf.get( "build_cmd", @@ -566,16 +552,11 @@ def cmake_build(Map conf=[:]){ } //check the node gpu architecture - def arch = check_arch() + def arch_name = check_arch_name() if (params.RUN_CK_TILE_FMHA_TESTS){ try{ archiveArtifacts "perf_fmha_*.log" - if (arch == 1){ - stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" - } - else if (arch == 2){ - stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" - } + stash includes: "perf_fmha_**.log", name: "perf_fmha_log_${arch_name}" } catch(Exception err){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." @@ -585,40 +566,15 @@ def cmake_build(Map conf=[:]){ def buildHipClangJob(Map conf=[:]){ show_node_info() - - env.HSA_ENABLE_SDMA=0 checkout scm def prefixpath = conf.get("prefixpath", "/opt/rocm") - - // Jenkins is complaining about the render group - def dockerOpts - if ( params.BUILD_INSTANCES_ONLY ){ - dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - } - else{ - dockerOpts = "--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - } - if (conf.get("enforce_xnack_on", false)) { - dockerOpts = dockerOpts + " --env HSA_XNACK=1 " - } - def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ - // the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with - // newer clang22 compilers and running with older hip runtima libraries - dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 " - } - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" - - def variant = env.STAGE_NAME + def dockerOpts = get_docker_options() def image def retimage (retimage, image) = getDockerImage(conf) - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { - withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') { + withDockerContainer(image: image, args: dockerOpts) { timeout(time: 20, unit: 'HOURS') { cmake_build(conf) @@ -628,10 +584,6 @@ def buildHipClangJob(Map conf=[:]){ return retimage } -def reboot(){ - build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] -} - def buildHipClangJobAndReboot(Map conf=[:]){ try{ buildHipClangJob(conf) @@ -641,45 +593,17 @@ def buildHipClangJobAndReboot(Map conf=[:]){ echo 'Exception occurred: ' + e.toString() throw e } - finally{ - if (!conf.get("no_reboot", false)) { - reboot() - } - } } def Build_CK(Map conf=[:]){ show_node_info() - - env.HSA_ENABLE_SDMA=0 - env.DOCKER_BUILDKIT=1 checkout scm def prefixpath = conf.get("prefixpath", "/opt/rocm") - - // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - if (conf.get("enforce_xnack_on", false)) { - dockerOpts = dockerOpts + " --env HSA_XNACK=1 " - } - def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){ - // the --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 env variable is required when building code with offload-compress flag with - // newer clang22 compilers and running with older hip runtima libraries - dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' --env COMPRESSED_BUNDLE_FORMAT_VERSION=2 " - } - if(params.BUILD_LEGACY_OS){ - dockerOpts = dockerOpts + " --env LD_LIBRARY_PATH='/opt/Python-3.8.13/lib' " - } - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" - - def variant = env.STAGE_NAME + def dockerOpts=get_docker_options() def image def retimage - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: dockerOpts) { @@ -698,11 +622,11 @@ def Build_CK(Map conf=[:]){ echo "The job was cancelled or aborted" throw e } - withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + withDockerContainer(image: image, args: dockerOpts) { timeout(time: 20, unit: 'HOURS') { //check whether to run performance tests on this node - def arch = check_arch() + def arch = check_arch_name() cmake_build(conf) if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch == 1 ){ echo "Run inductor codegen tests" @@ -717,94 +641,50 @@ def Build_CK(Map conf=[:]){ // run performance tests, stash the logs, results will be processed on the master node dir("script"){ if (params.RUN_PERFORMANCE_TESTS){ - if (params.RUN_FULL_QA && arch == 1){ - // run full tests on gfx90a + if (params.RUN_FULL_QA && (arch == "gfx90a" || arch == "gfx942")){ + // run full tests on gfx90a or gfx942 echo "Run full performance tests" - sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx90a" - archiveArtifacts "perf_gemm_gfx90a.log" - archiveArtifacts "perf_resnet50_N256_gfx90a.log" - archiveArtifacts "perf_resnet50_N4_gfx90a.log" - archiveArtifacts "perf_batched_gemm_gfx90a.log" - archiveArtifacts "perf_grouped_gemm_gfx90a.log" - archiveArtifacts "perf_grouped_conv_fwd_gfx90a.log" - archiveArtifacts "perf_grouped_conv_bwd_data_gfx90a.log" - archiveArtifacts "perf_grouped_conv_bwd_weight_gfx90a.log" - archiveArtifacts "perf_gemm_bilinear_gfx90a.log" - archiveArtifacts "perf_reduction_gfx90a.log" - archiveArtifacts "perf_splitK_gemm_gfx90a.log" - archiveArtifacts "perf_onnx_gemm_gfx90a.log" - archiveArtifacts "perf_mixed_gemm_gfx90a.log" - stash includes: "perf_**.log", name: "perf_log_gfx90a" + sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} ${arch}" + archiveArtifacts "perf_*.log" + stash includes: "perf_**.log", name: "perf_log_${arch}" } - if (params.RUN_FULL_QA && arch == 2){ - // run full tests on gfx942 - echo "Run full performance tests" - sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx942" - archiveArtifacts "perf_gemm_gfx942.log" - archiveArtifacts "perf_resnet50_N256_gfx942.log" - archiveArtifacts "perf_resnet50_N4_gfx942.log" - archiveArtifacts "perf_batched_gemm_gfx942.log" - archiveArtifacts "perf_grouped_gemm_gfx942.log" - archiveArtifacts "perf_grouped_conv_fwd_gfx942.log" - archiveArtifacts "perf_grouped_conv_bwd_data_gfx942.log" - archiveArtifacts "perf_grouped_conv_bwd_weight_gfx942.log" - archiveArtifacts "perf_gemm_bilinear_gfx942.log" - archiveArtifacts "perf_reduction_gfx942.log" - archiveArtifacts "perf_splitK_gemm_gfx942.log" - archiveArtifacts "perf_onnx_gemm_gfx942.log" - archiveArtifacts "perf_mixed_gemm_gfx942.log" - stash includes: "perf_**.log", name: "perf_log_gfx942" - } - else if ( arch == 1 ){ - // run standard tests on gfx90a + else if (!params.RUN_FULL_QA && (arch == "gfx90a" || arch == "gfx942")){ + // run standard tests on gfx90a or gfx942 echo "Run performance tests" - sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx90a" - archiveArtifacts "perf_gemm_gfx90a.log" - archiveArtifacts "perf_onnx_gemm_gfx90a.log" - archiveArtifacts "perf_resnet50_N256_gfx90a.log" - archiveArtifacts "perf_resnet50_N4_gfx90a.log" - stash includes: "perf_**.log", name: "perf_log_gfx90a" - } - else if ( arch == 2 ){ - // run standard tests on gfx942 - echo "Run performance tests" - sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx942" - archiveArtifacts "perf_gemm_gfx942.log" - archiveArtifacts "perf_onnx_gemm_gfx942.log" - archiveArtifacts "perf_resnet50_N256_gfx942.log" - archiveArtifacts "perf_resnet50_N4_gfx942.log" - stash includes: "perf_**.log", name: "perf_log_gfx942" + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} ${arch}" + archiveArtifacts "perf_*.log" + stash includes: "perf_**.log", name: "perf_log_${arch}" } // disable performance tests on gfx1030 for now. - //else if ( arch == 3){ + //else if ( arch == "gfx10"){ // run basic tests on gfx1030 // echo "Run gemm performance tests" // sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10" // archiveArtifacts "perf_onnx_gemm_gfx10.log" // stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10" //} - else if ( arch == 4){ + else if ( arch == "gfx11"){ // run basic tests on gfx11 echo "Run gemm performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11" archiveArtifacts "perf_onnx_gemm_gfx11.log" stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11" } - else if ( arch == 5 ){ + else if ( arch == "gfx120" ){ // run basic tests on gfx12 echo "Run gemm performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12" archiveArtifacts "perf_onnx_gemm_gfx12.log" stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12" } - else if ( arch == 6 ){ + else if ( arch == "gfx908" ){ // run basic tests on gfx908 echo "Run performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908" archiveArtifacts "perf_onnx_gemm_gfx908.log" stash includes: "perf_onnx_gemm_gfx908.log", name: "perf_log_gfx908" } - else if ( arch == 7 ){ + else if ( arch == "gfx950" ){ // run basic tests on gfx950 echo "Run performance tests" sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx950" @@ -813,7 +693,7 @@ def Build_CK(Map conf=[:]){ } } } - if (params.hipTensor_test && arch == 1 ){ + if (params.hipTensor_test && arch == "gfx90a" ){ // build and test hipTensor on gfx90a node sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip @@ -846,34 +726,18 @@ def Build_CK_and_Reboot(Map conf=[:]){ echo 'Exception occurred: ' + e.toString() throw e } - finally{ - if (!conf.get("no_reboot", false)) { - reboot() - } - } } def process_results(Map conf=[:]){ - env.HSA_ENABLE_SDMA=0 checkout scm //use older image that has user jenkins def image = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm6.3" - def prefixpath = "/opt/rocm" - // Jenkins is complaining about the render group - def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - if (conf.get("enforce_xnack_on", false)) { - dockerOpts = dockerOpts + " --env HSA_XNACK=1 " - } - - def variant = env.STAGE_NAME - def retimage - - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') { try { echo "Pulling image: ${image}" - retimage = docker.image("${image}") + def retimage = docker.image("${image}") withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { retimage.pull() } @@ -884,7 +748,7 @@ def process_results(Map conf=[:]){ } } - withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + withDockerContainer(image: image, args: '--cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v=/var/jenkins/:/var/jenkins') { timeout(time: 15, unit: 'MINUTES'){ try{ dir("script"){ @@ -998,19 +862,12 @@ def process_results(Map conf=[:]){ def run_aiter_tests(Map conf=[:]){ show_node_info() - env.HSA_ENABLE_SDMA=0 checkout scm //use the latest pytorch image def image = "${env.CK_DOCKERHUB_PRIVATE}:ck_aiter" - def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins" - def variant = env.STAGE_NAME - def retimage - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" + def dockerOpts=get_docker_options() - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') { try { echo "Pulling image: ${image}" @@ -1057,19 +914,12 @@ def run_aiter_tests(Map conf=[:]){ def run_pytorch_tests(Map conf=[:]){ show_node_info() - env.HSA_ENABLE_SDMA=0 checkout scm //use the latest pytorch-nightly image def image = "${env.CK_DOCKERHUB}:ck_pytorch" - def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins" - def variant = env.STAGE_NAME - def retimage - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" + def dockerOpts=get_docker_options() - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') { try { echo "Pulling image: ${image}" @@ -1343,7 +1193,7 @@ pipeline { --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) + buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd) archiveArtifacts "build/ck_cppcheck.log" cleanWs() } @@ -1369,7 +1219,7 @@ pipeline { | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\')" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) + buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd) cleanWs() } } @@ -1453,7 +1303,7 @@ pipeline { ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases && ./bin/test_grouped_convnd_fwd_bias_clamp_large_cases""" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1489,7 +1339,7 @@ pipeline { ./bin/test_grouped_convnd_fwd_dataset_xdl""" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1512,11 +1362,11 @@ pipeline { agent{ label rocmnode("gfx90a")} environment{ setup_args = "NO_CK_BUILD" - execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \ + execute_args = """ cmake -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" ../codegen && \ make -j64 check""" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1539,13 +1389,10 @@ pipeline { agent{ label rocmnode("gfx90a") } environment{ setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ - make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \ - cd ../ && - example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ + execute_args = build_and_run_fmha("gfx90a") } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1558,13 +1405,10 @@ pipeline { agent{ label rocmnode("gfx942") } environment{ setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ - make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \ - cd ../ && - example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ + execute_args = build_and_run_fmha("gfx942") } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1577,13 +1421,10 @@ pipeline { agent{ label rocmnode("gfx950") } environment{ setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \ - make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \ - cd ../ && - example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """ + execute_args = build_and_run_fmha("gfx950") } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1596,13 +1437,10 @@ pipeline { agent{ label rocmnode("gfx1201") } environment{ setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx12-generic && \ - make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \ - cd ../ && - example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx1201 """ + execute_args = build_and_run_fmha("gfx1201") } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1626,7 +1464,7 @@ pipeline { environment{ setup_args = "NO_CK_BUILD" execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ - -D CMAKE_CXX_COMPILER="${build_compiler()}" \ + -D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_TARGETS="gfx90a" \ -D GEMM_DATATYPE="fp8;fp16" \ @@ -1634,20 +1472,14 @@ pipeline { -D GEMM_MULTI_D_DATATYPE="fp16" \ -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \ -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \ - -D GEMM_PRESHUFFLE_LAYOUT="rcr" \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && \ - ninja -j64 benchmark_gemm_all && \ - python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json && \ - ninja -j64 benchmark_gemm_preshuffle_all && \ - python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json && \ - ninja -j64 benchmark_gemm_multi_d_all && \ - python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json """ + -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \ + ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \ + python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1661,7 +1493,7 @@ pipeline { environment{ setup_args = "NO_CK_BUILD" execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ - -D CMAKE_CXX_COMPILER="${build_compiler()}" \ + -D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_TARGETS="gfx942" \ -D GEMM_DATATYPE="fp8;fp16" \ @@ -1669,20 +1501,14 @@ pipeline { -D GEMM_MULTI_D_DATATYPE="fp16" \ -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \ -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \ - -D GEMM_PRESHUFFLE_LAYOUT="rcr" \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && \ - ninja -j64 benchmark_gemm_all && \ - python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json && \ - ninja -j64 benchmark_gemm_preshuffle_all && \ - python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json && \ - ninja -j64 benchmark_gemm_multi_d_all && \ - python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json """ + -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \ + ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \ + python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1696,18 +1522,16 @@ pipeline { environment{ setup_args = "NO_CK_BUILD" execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ - -D CMAKE_CXX_COMPILER="${build_compiler()}" \ + -D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_TARGETS="gfx1201" \ -D GEMM_DATATYPE="fp16" \ - -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && \ + -D GEMM_LAYOUT="rcr;rrr;crr;ccr" .. && \ ninja -j64 benchmark_gemm_all && \ - python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \ - --warmup 5 --repeat 5 --verbose --json results.json """ + python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1730,15 +1554,11 @@ pipeline { } agent{ label rocmnode("gfx90a") } environment{ - def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3" - setup_args = """ -DGPU_TARGETS="gfx942" \ - -DCMAKE_CXX_FLAGS=" -O3 " \ - -DCK_CXX_STANDARD="17" \ - -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ + setup_args = """ -DGPU_TARGETS="gfx942" -DCK_CXX_STANDARD="17" -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ execute_args = " " } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name) + Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", build_type: 'Release', docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3") cleanWs() } } @@ -1750,14 +1570,11 @@ pipeline { } agent{ label rocmnode("gfx90a") } environment{ - def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_sles15_rocm6.3" - setup_args = """ -DGPU_TARGETS="gfx942" \ - -DCMAKE_CXX_FLAGS=" -O3 " \ - -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ + setup_args = """ -DGPU_TARGETS="gfx942" -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ execute_args = " " } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name) + Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", build_type: 'Release', docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_sles15_rocm6.3") cleanWs() } } @@ -1769,18 +1586,11 @@ pipeline { } agent{ label rocmnode("gfx942") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ - -DGPU_TARGETS="gfx942" \ - -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx942" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx942" """ + execute_args = build_client_examples("gfx942") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1792,18 +1602,11 @@ pipeline { } agent{ label rocmnode("gfx950") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ - -DGPU_TARGETS="gfx950" \ - -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx950" \ - -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx950" """ + execute_args = build_client_examples("gfx950") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1815,16 +1618,11 @@ pipeline { } agent{ label rocmnode("gfx908") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908" -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx908" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908" """ + execute_args = build_client_examples("gfx908") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1836,17 +1634,11 @@ pipeline { } agent{ label rocmnode("gfx90a") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCK_CXX_STANDARD="17" -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx90a" \ - -DCK_CXX_STANDARD="17" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCK_CXX_STANDARD="17" """ + execute_args = build_client_examples("gfx90a") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1859,17 +1651,12 @@ pipeline { agent{ label rocmnode("gfx942") } steps{ script { - def execute_args = params.NINJA_FTIME_TRACE ? - """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ - -D CMAKE_CXX_COMPILER="${build_compiler()}" \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_CXX_FLAGS=" -O3 -ftime-trace" .. && ninja -j64 """ : - """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ - -D CMAKE_CXX_COMPILER="${build_compiler()}" \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """ + def execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ + -DCMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \ + -DCMAKE_HIP_COMPILER="${params.BUILD_COMPILER}" \ + -D CMAKE_BUILD_TYPE=Release .. && ninja -j64 """ - buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1") + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", build_type: 'Release', execute_cmd: execute_args) } cleanWs() } @@ -1882,16 +1669,11 @@ pipeline { } agent{ label rocmnode("gfx1030") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx10-3-generic" -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx10-3-generic" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx10-3-generic" """ + execute_args = build_client_examples("gfx10-3-generic") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1903,16 +1685,11 @@ pipeline { } agent{ label 'miopen && (gfx1101 || gfx1100)' } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx11-generic" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" """ + execute_args = build_client_examples("gfx11-generic") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1924,16 +1701,11 @@ pipeline { } agent{ label rocmnode("gfx1201") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx12-generic" -DCMAKE_CXX_FLAGS=" -O3 " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ - cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ - -DGPU_TARGETS="gfx12-generic" \ - -DCMAKE_CXX_COMPILER="${build_compiler()}" \ - -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx12-generic" """ + execute_args = build_client_examples("gfx12-generic") } steps{ - Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } @@ -1942,8 +1714,7 @@ pipeline { success { script { // Report the parent stage build ck and run tests status - def variant = env.STAGE_NAME - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${env.STAGE_NAME}", account: 'ROCm', repo: 'composable_kernel') { echo "Reporting success status for build ck and run tests" } } @@ -1970,13 +1741,11 @@ pipeline { success { script { // Report the skipped parent's stage status - def parentVariant = "Process Performance Test Results" - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${parentVariant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Process Performance Test Results", account: 'ROCm', repo: 'composable_kernel') { echo "Process Performance Test Results stage skipped." } // Report the skipped stage's status - def variant = "Process results" - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Process results", account: 'ROCm', repo: 'composable_kernel') { echo "Process Performance Test Results stage skipped." } } diff --git a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp index 98f41dc7fb..9b0ee27f38 100644 --- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp +++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp @@ -24,6 +24,8 @@ #include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp" #include "ck/library/utility/host_tensor.hpp" +using ::ck::HostTensorDescriptor; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu; using ConvScale = ck::tensor_operation::element_wise::ScaleScalePass; diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp index 0bf748cdbb..b43f89698c 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_bias_fastgelu_xdl_bf16_i8.cpp @@ -15,6 +15,8 @@ #include "ck/library/tensor_operation_instance/gpu/grouped_gemm_multi_abd_fixed_nk.hpp" +using ::ck::hip_check_error; + template using S = ck::Sequence; diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_fastgelu_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_fastgelu_xdl_bf16_i8.cpp index f300583d13..cf12ace19c 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_fastgelu_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_fastgelu_xdl_bf16_i8.cpp @@ -17,6 +17,8 @@ #include "ck/host_utility/hip_check_error.hpp" +using ::ck::hip_check_error; + template using S = ck::Sequence; diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp index 47d3e0abf9..d69233fef1 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_bias_fastgelu_xdl_bf16_i8.cpp @@ -17,6 +17,8 @@ #include "ck/host_utility/hip_check_error.hpp" +using ::ck::hip_check_error; + template using S = ck::Sequence; diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp index 8c705d3bcc..9f20133689 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_multiply_xdl_bf16_i8.cpp @@ -17,6 +17,8 @@ #include "ck/host_utility/hip_check_error.hpp" +using ::ck::hip_check_error; + template using S = ck::Sequence; diff --git a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_xdl_bf16_i8.cpp b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_xdl_bf16_i8.cpp index 557dea7676..87dfc2a5f9 100644 --- a/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_xdl_bf16_i8.cpp +++ b/client_example/31_grouped_gemm_bf16Aint8B/grouped_gemm_xdl_bf16_i8.cpp @@ -17,6 +17,8 @@ #include "ck/host_utility/hip_check_error.hpp" +using ::ck::hip_check_error; + template using S = ck::Sequence; diff --git a/docs/conf.py b/docs/conf.py index fe8a1c1d79..58e78f3d1d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 03bde86421..a9ae0b2a6a 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -105,7 +105,7 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -list(APPEND gpu_list_tf32 gfx942) +list(APPEND gpu_list_tf32 gfx942 gfx950) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0) diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index e482953e46..300f33b164 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -25,6 +25,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; struct ProblemSize final { diff --git a/example/01_gemm/gemm_wmma_fp8_v3.cpp b/example/01_gemm/gemm_wmma_fp8_v3.cpp index 0376820b7b..2f8eac113b 100644 --- a/example/01_gemm/gemm_wmma_fp8_v3.cpp +++ b/example/01_gemm/gemm_wmma_fp8_v3.cpp @@ -13,7 +13,7 @@ using CDataType = ck::bhalf_t; using ComputeTypeA = ck::f8_t; using ComputeTypeB = ck::f8_t; -using ALayout = Row; +using ALayout = Col; using BLayout = Col; using CLayout = Row; @@ -30,13 +30,13 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 64, - 8, 8, + 16, 16, // AK1, BK1 16, 16, 4, 2, + S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, + 1, 4, 16, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, - 2, 8, 8, 0, - S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, - 2, 8, 8, 0, + 2, 16, 16, 0, 1, 1, S<1, 32, 1, 4>, 8, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ComputeTypeA, ComputeTypeB>; diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 10dd4eaa1f..08c186856e 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/check_err.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + struct AlphaBetaAdd { AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp index 556aa90f3d..27b166ad80 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_int8.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/check_err.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + struct AlphaBetaAdd { AlphaBetaAdd(int alpha, int beta) : alpha_(alpha), beta_(beta){}; diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp index 8f8b2e80fe..0f27e95cb1 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + struct AlphaBetaAdd { AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp index 17e9ceccec..becbed5fd2 100644 --- a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp +++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp index 91d17df95f..55837f6eab 100644 --- a/example/04_gemm_add_add_fastgelu/common.hpp +++ b/example/04_gemm_add_add_fastgelu/common.hpp @@ -23,6 +23,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index 4f174bfcbb..791d81e264 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -21,7 +21,7 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -list(APPEND gpu_list_tf32 gfx942) +list(APPEND gpu_list_tf32 gfx942 gfx950) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0) diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp index d82b56ec00..001e2ea72f 100644 --- a/example/09_convnd_fwd/convnd_fwd_common.hpp +++ b/example/09_convnd_fwd/convnd_fwd_common.hpp @@ -19,6 +19,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + void print_helper_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" @@ -77,7 +81,7 @@ inline __host__ __device__ constexpr double get_atol() { if constexpr(std::is_same_v && std::is_same_v) { - return 1e-2; + return 1e-3; } else if constexpr(std::is_same_v) { diff --git a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp index aeddd4fc59..3b43dfa4be 100644 --- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp +++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp @@ -19,6 +19,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + void print_helper_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp index 7142521c55..30a2297a28 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp @@ -26,6 +26,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using BF16 = ck::bhalf_t; using FP16 = ck::half_t; using FP32 = float; diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index 9e125c4e5d..5e09229663 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_common_util.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using namespace ck; using namespace ck::tensor_operation::device; diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp index abbf1b29f7..0d3a60db53 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -19,6 +19,10 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp index a3023997a1..f06bf7304c 100644 --- a/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp +++ b/example/14_gemm_quantization/gemm_wmma_quantization_int8.cpp @@ -20,6 +20,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp index 8f68ac6b05..221fbb0dbc 100644 --- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp +++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp index db7f3cb091..e9a8e89625 100644 --- a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp +++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/CMakeLists.txt b/example/15_grouped_gemm/CMakeLists.txt index 20cbc5fdca..20d9bab7e1 100644 --- a/example/15_grouped_gemm/CMakeLists.txt +++ b/example/15_grouped_gemm/CMakeLists.txt @@ -33,3 +33,13 @@ if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp) add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4) endif() + +list(APPEND gpu_list_tf32 gfx942 gfx950) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list_tf32 AND target EQUAL 0) + add_example_executable(example_grouped_gemm_xdl_fp32_tf32 grouped_gemm_xdl_fp32_tf32.cpp) + add_example_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32_tf32) + set(target 1) + endif() +endforeach() diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp index 3e1f7f0893..b9fca2df87 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_dl_fp16.cpp @@ -25,6 +25,11 @@ #include "ck/utility/tuple.hpp" #include "ck/utility/sequence.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp index 117a18e3bd..248252ded9 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_splitk_xdl_fp16.cpp @@ -23,6 +23,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp index c8de51f550..5940c820d8 100644 --- a/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_multiple_d_xdl_fp16.cpp @@ -23,6 +23,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp index ac64a468a4..5bfdbd3c83 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp @@ -19,6 +19,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp index 2fcc0e3cb1..abfa8fd491 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_bias_fp16.cpp @@ -20,6 +20,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp index fb611fd444..8aa73e491d 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp @@ -20,6 +20,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp index 47eb6637bd..fd40aa0410 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp @@ -20,6 +20,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 85ea8c2f2c..3856372eb2 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -19,6 +19,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp index 236e5e4fa2..ce57c01fab 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp @@ -19,6 +19,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp new file mode 100644 index 0000000000..ad6a1bf4e3 --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32_tf32.cpp @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#define EXAMPLE_WITH_COMPUTE_DATATYPE + +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + +template +using S = ck::Sequence; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; +using ComputeDataType = ck::tf32_t; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4, ck::LoopScheduler::Default, ComputeDataType>; +// clang-format on + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } + +#undef EXAMPLE_WITH_COMPUTE_DATATYPE diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp index 60c4a71a35..3bfacb210b 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp @@ -19,6 +19,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp index 6b8de05f73..b69f0b3878 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp @@ -19,6 +19,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp index 16d018936b..10f9f9db31 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_splitk_fp16.cpp @@ -19,6 +19,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 87ccebc3c4..62f0f3673d 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -3,6 +3,11 @@ #pragma once +// use macro to minimize code change +#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE +using ComputeDataType = AccDataType; +#endif + struct ProblemSize final { std::vector Ms; @@ -231,7 +236,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co AccDataType, AElementOp, BElementOp, - CDEElementOp>; + CDEElementOp, + ComputeDataType>; for(std::size_t i = 0; i < gemm_descs.size(); i++) { @@ -253,7 +259,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]); #else - pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]); + pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]); #endif } } diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp index 3cc38b381b..9268b64bed 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp index 3de32e9a6d..273de555e4 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp @@ -8,6 +8,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = INT8; using BDataType = INT8; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp index 0290c1829d..6a2935746f 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = BF16; using BDataType = BF16; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp index e211a63b0b..98fa0ba861 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = F16; using BDataType = F16; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp index 90c2cdcdaa..f7573ca7d6 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = F32; using BDataType = F32; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp index e8594e206b..69bf7e431c 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using ADataType = INT4; using ADataKernelType = INT8; using BDataType = INT4; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp index ee274f3a55..ba23a41049 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using ADataType = INT8; using BDataType = INT8; using GemmAccDataType = INT32; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp index 3ee3037179..af5002f9f1 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = BF16; using BDataType = BF16; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp index 9ce1e76cf5..3eec9b81c1 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = F16; using BDataType = F16; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp index 7815d2beea..0247beb976 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp @@ -7,6 +7,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // DataType using ADataType = F32; using BDataType = F32; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp index 3e3c586dba..d95fe4b716 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp @@ -18,6 +18,10 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp index d219df0245..80918c4a1e 100644 --- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp +++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp @@ -18,6 +18,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + void print_helper_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index f4e6b4d6e3..0806a245c0 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp index 1e7899b35d..007c6d7037 100644 --- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -14,6 +14,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp index 5f321e6284..52b8bda3f1 100644 --- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp +++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp @@ -14,6 +14,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index 90e2d28d95..040809e7c1 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -12,6 +12,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp index 797521dcb4..fc34bd714f 100644 --- a/example/19_binary_elementwise/elementwise_add_4d.cpp +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -14,6 +14,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/20_grouped_conv_bwd_weight/common.hpp b/example/20_grouped_conv_bwd_weight/common.hpp index 9159e51eaf..6b603d2e48 100644 --- a/example/20_grouped_conv_bwd_weight/common.hpp +++ b/example/20_grouped_conv_bwd_weight/common.hpp @@ -20,6 +20,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using BF16 = ck::bhalf_t; using F16 = ck::half_t; using F32 = float; diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp index abbc7a946c..eed3ef2e73 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp index ae5e3f36ad..918b66ec2d 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp index 23c602c39e..b3ac78d3c0 100644 --- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp index 10d90b795c..92aa74371e 100644 --- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // This example demonstrate a single kernel that runs GEMM layer and laynorm in one fused kernel // // The GEMM + Layernorm implementation is a specialized kernel which allows fusing both layers diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/example/22_cgemm/cgemm_xdl_common.hpp index 26137a7c2e..1caad311e7 100644 --- a/example/22_cgemm/cgemm_xdl_common.hpp +++ b/example/22_cgemm/cgemm_xdl_common.hpp @@ -14,6 +14,10 @@ #include "ck/library/utility/literals.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp index d09e434bcf..8c52e0d184 100644 --- a/example/23_softmax/softmax_blockwise.cpp +++ b/example/23_softmax/softmax_blockwise.cpp @@ -18,6 +18,10 @@ #include "ck/library/utility/host_common_util.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using namespace ck::tensor_operation::device; using InDataType = ck::half_t; diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp index d155b2c411..a1be6d7e75 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp index bd4be91103..e9bbcbce76 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp index 80f6b1d663..05136daa61 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp index 46e452005d..7b8e50b12b 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_fp16int4_b_scale_v3.cpp @@ -18,6 +18,11 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp index 19a63ff841..fd650d9666 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp index 2a4610bf90..12fe9bc803 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp index 4adb79ed29..cb42f0c775 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp index 06b8c3c0cd..9d95e02720 100644 --- a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp +++ b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp @@ -18,6 +18,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp index 4f4003809b..0ede6c40cc 100644 --- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp @@ -19,6 +19,11 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp index e4881da0f5..f1e2bf76f2 100644 --- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp @@ -17,6 +17,11 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/numeric.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/26_contraction/run_contraction_bilinear_example.inc b/example/26_contraction/run_contraction_bilinear_example.inc index ddb6d678e2..a71476043f 100644 --- a/example/26_contraction/run_contraction_bilinear_example.inc +++ b/example/26_contraction/run_contraction_bilinear_example.inc @@ -15,6 +15,10 @@ #include "ck/library/utility/numeric.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; int run_contraction_bilinear_example(int argc, char* argv[]) diff --git a/example/26_contraction/run_contraction_scale_example.inc b/example/26_contraction/run_contraction_scale_example.inc index 0a7287edf9..2c7c849601 100644 --- a/example/26_contraction/run_contraction_scale_example.inc +++ b/example/26_contraction/run_contraction_scale_example.inc @@ -15,6 +15,10 @@ #include "ck/library/utility/numeric.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; int run_contraction_scale_example(int argc, char* argv[]) diff --git a/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp b/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp index 20db24f56d..2952c93fd1 100644 --- a/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp +++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using XDataType = ck::half_t; using GammaDataType = ck::half_t; using BetaDataType = ck::half_t; diff --git a/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp b/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp index 5a57082c79..909fd2c687 100644 --- a/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp +++ b/example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using XDataType = ck::half_t; using GammaDataType = ck::half_t; using BetaDataType = ck::half_t; diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp index 70c4a01185..a94bae4f30 100644 --- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp @@ -18,6 +18,11 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/numeric.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index c4cb7a13a2..60b2569c13 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -17,6 +17,11 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/numeric.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp index 2b9afc342e..aea466626c 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp @@ -17,6 +17,11 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/numeric.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp index 92fdaa7f33..4e8f08ca36 100644 --- a/example/30_grouped_conv_fwd_multiple_d/common.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp @@ -24,6 +24,11 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using BF16 = ck::bhalf_t; using FP16 = ck::half_t; using FP32 = float; diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp index 56b0e6d1dc..78de7ccc0b 100644 --- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp @@ -24,6 +24,11 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using BF16 = ck::bhalf_t; using FP16 = ck::half_t; using FP32 = float; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp index b1d732d95b..db65cf524a 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_bf16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using BF16 = ck::bhalf_t; using F32 = float; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp index e3f22a3186..ad8dc90376 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp index e58333b2f3..a59012f69f 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_fp8.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ADataType = ck::f8_t; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp index d534306115..37ba7c8496 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_wmma_cshuffle_v3_int8.cpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ADataType = int8_t; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp index 9afd199f24..0dc2944e07 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp @@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp index 6a8aa8a721..ed0e513388 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp @@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp index 8a27fb2b3c..bff8bdedbe 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp @@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp index bf55e2fd84..d6fbf970e5 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp @@ -28,6 +28,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp index e5789ba4b0..3145c6d324 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp @@ -26,6 +26,10 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp index 69ab5c5c0b..5e9fbba935 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -29,6 +29,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp index 2604a50a76..e4191577bc 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -28,6 +28,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index f5cedb14c9..2cc1ba94c8 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -29,6 +29,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp index 331bfe99c2..53a9db1029 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp @@ -27,6 +27,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp index cd321c0da3..c87e718ce3 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -28,6 +28,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp index f30ec3fd03..74781e2d45 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp @@ -26,6 +26,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp index e403ba7f66..54365aae5b 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp @@ -27,6 +27,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp index 41c6dff2df..90dbc67e92 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp @@ -29,6 +29,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp index 7738a6b6d4..36aef659b8 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -27,6 +27,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp index b59498829e..7b88e0c530 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -28,6 +28,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp index 955c25f0d1..65fa1885ac 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp @@ -30,6 +30,10 @@ Example is GQA-4 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp index 112be07c49..4fe9ddc3eb 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp @@ -28,6 +28,10 @@ Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.” #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp index 9ec1bc933f..8a45f4b5bb 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp @@ -29,6 +29,10 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/host_utility/device_prop.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp index cd21790be6..66f877ffb4 100644 --- a/example/33_multiple_reduce/dual_reduce_common.hpp +++ b/example/33_multiple_reduce/dual_reduce_common.hpp @@ -19,6 +19,11 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_common_util.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::joinable_thread; +using ::ck::Tensor; + static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/example/34_batchnorm/batchnorm_backward_nhwc.cpp index 9737b0d99b..4344ec5525 100644 --- a/example/34_batchnorm/batchnorm_backward_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_backward_nhwc.cpp @@ -14,6 +14,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp" #include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, diff --git a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp index 1ffbabd04b..5f65de1b2c 100644 --- a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp @@ -20,6 +20,10 @@ #include "batchnorm_infer_impl.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp index 06441be860..a29b58412d 100644 --- a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/host_common_util.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp index 8f2b7613b5..2e5f7a3086 100644 --- a/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp +++ b/example/34_batchnorm/batchnorm_forward_training_nhwc_obsolete.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/host_common_util.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, {"verify", required_argument, nullptr, 'v'}, {"help", no_argument, nullptr, '?'}, diff --git a/example/35_splitK_gemm/common.hpp b/example/35_splitK_gemm/common.hpp index 325cc37731..6ed9214a55 100644 --- a/example/35_splitK_gemm/common.hpp +++ b/example/35_splitK_gemm/common.hpp @@ -23,6 +23,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + struct ProblemSizeSplitK final { ck::index_t M = 256; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp index 1b8194f838..97608418f9 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp index 8628e8770c..d7240396c5 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp index 8091a5b448..f3b9f66346 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16_fp8.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp index 4257451754..66ceb55a79 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp index f0d4e28ad2..229d2b8709 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp index d800443932..17bb675d34 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp index ef27c7bb9f..41596019ca 100644 --- a/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp +++ b/example/35_splitK_gemm/splitK_gemm_xdl_lds_direct_load_fp16.cpp @@ -26,6 +26,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp index ff7acea48d..232a0fcf86 100644 --- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp +++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + // clang-format off using EmbType = ck::half_t; using IndexType = int64_t; diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index 4934f74393..51344f5572 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -22,6 +22,10 @@ Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1 #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp index 1823d4fc0a..3297e818b8 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp +++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp @@ -22,6 +22,11 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/39_permute/common.hpp b/example/39_permute/common.hpp index b23128a536..be1b824341 100644 --- a/example/39_permute/common.hpp +++ b/example/39_permute/common.hpp @@ -26,6 +26,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; using F64 = double; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp index f9a7d9f638..2b89958342 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using BiasDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp index 333987edd6..06fa50689c 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using BiasDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp index 4b94045421..aa31cb2de8 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using BiasDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp index b74e06b10a..48e27370cd 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using BiasDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp index c3ac40a1bc..8ce49d86db 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using RequantScaleDataType = float; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp index 437fd6f4c2..b58e25cb53 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using AccDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp index d9cfae2898..8aa0a43356 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using BiasDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp index 9d3024fce7..22a5521dd3 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using BiasDataType = int32_t; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp index 2d4ae1f837..c01e9d9bb0 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using RequantScaleDataType = float; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp index 79b0c00fa5..5091065c30 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp @@ -4,6 +4,11 @@ #include "common.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = int8_t; using WeiDataType = int8_t; using AccDataType = int32_t; diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp index ba589ec044..fd746cecfa 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using In0DataType = ck::bhalf_t; using Wei0DataType = ck::bhalf_t; using Acc0DataType = float; diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp index 847859068f..541967fc8e 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using In0DataType = ck::half_t; using Wei0DataType = ck::half_t; using Acc0DataType = float; diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp index 9a104dbfab..f8d6827ed0 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using In0DataType = float; using Wei0DataType = float; using Acc0DataType = float; diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp index cf7b1ce3a8..a010a88b19 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp @@ -22,6 +22,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using In0DataType = ck::int4_t; using Wei0DataType = ck::int4_t; using KernelIn0DataType = int8_t; diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp index 3ade6c811a..2e59a36b01 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp @@ -20,6 +20,10 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using In0DataType = int8_t; using Wei0DataType = int8_t; using Acc0DataType = int32_t; diff --git a/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp b/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp index 15c02b8213..35aaa21e38 100644 --- a/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp +++ b/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr int Rank = 5; constexpr int NumReduceDim = 3; diff --git a/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp b/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp index 37fdf1b253..18e3fdf9ba 100644 --- a/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp +++ b/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr int Rank = 5; constexpr int NumReduceDim = 3; diff --git a/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp b/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp index 6d17264cef..3259fdf78d 100644 --- a/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp +++ b/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr int Rank = 5; constexpr int NumReduceDim = 3; diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp index 873982227b..03f9812557 100644 --- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp @@ -16,6 +16,11 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp index 32a7e4a76e..9462f95e8a 100644 --- a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp +++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp @@ -16,6 +16,11 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp index bd9a8c151b..8688451dec 100644 --- a/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_binary_4D_fp16.cpp @@ -16,6 +16,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp index 2d689648f2..8a7b4a5e45 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -16,6 +16,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp index 6e70a306d3..84e97f0a62 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp @@ -17,6 +17,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp index 632d88e88a..50ebcd25f8 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp @@ -16,6 +16,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp index bd54f1c19c..72891a2249 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp @@ -16,6 +16,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp index 9621d591a9..84b593e9ed 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp @@ -16,6 +16,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp index 1ce797e4dd..6785ccefb8 100644 --- a/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp +++ b/example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/utility/reduction_enums.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; using F8 = ck::f8_t; diff --git a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp index be4014f636..764b11dfa7 100644 --- a/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_trinary_4D_fp16.cpp @@ -16,6 +16,10 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using F16 = ck::half_t; using F32 = float; diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp index 21c5ff8d5a..4dcac14a8c 100644 --- a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp +++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using ADataType = ck::half_t; // Input 1 using BDataType = ck::half_t; // Input 2 using XDataType = ck::half_t; diff --git a/example/46_gemm_add_multiply/common.hpp b/example/46_gemm_add_multiply/common.hpp index 2c656cf441..83796a4424 100644 --- a/example/46_gemm_add_multiply/common.hpp +++ b/example/46_gemm_add_multiply/common.hpp @@ -22,6 +22,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp index 3e69caf51e..464e3b2cf3 100644 --- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp +++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute_xdl.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Col = ck::tensor_layout::gemm::ColumnMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/48_pool3d_fwd/pool3d_fwd_common.hpp b/example/48_pool3d_fwd/pool3d_fwd_common.hpp index ef64dd167d..cfdc366ba0 100644 --- a/example/48_pool3d_fwd/pool3d_fwd_common.hpp +++ b/example/48_pool3d_fwd/pool3d_fwd_common.hpp @@ -18,6 +18,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template std::vector f_tensor_strides_ncdhw(ck::index_t N_, ck::index_t C_, diff --git a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp index 51b7f0015d..abfbc39e1a 100644 --- a/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp +++ b/example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp @@ -19,6 +19,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template std::vector f_tensor_strides_ncdhw(ck::index_t N_, ck::index_t C_, diff --git a/example/52_im2col_col2im/column_to_image_f32.cpp b/example/52_im2col_col2im/column_to_image_f32.cpp index 047f2a2118..76a13ab42c 100644 --- a/example/52_im2col_col2im/column_to_image_f32.cpp +++ b/example/52_im2col_col2im/column_to_image_f32.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = FP32; // ck::bhalf_t;//FP32; using OutDataType = FP32; // ck::bhalf_t;//FP32; diff --git a/example/52_im2col_col2im/image_to_column_f32.cpp b/example/52_im2col_col2im/image_to_column_f32.cpp index 140bdf8062..90157bdc52 100644 --- a/example/52_im2col_col2im/image_to_column_f32.cpp +++ b/example/52_im2col_col2im/image_to_column_f32.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using InDataType = FP32; using OutDataType = FP32; diff --git a/example/53_layernorm2d_bwd/layernorm2d_bwd_fp32.cpp b/example/53_layernorm2d_bwd/layernorm2d_bwd_fp32.cpp index 0b0a3e72ad..79c9c627dd 100644 --- a/example/53_layernorm2d_bwd/layernorm2d_bwd_fp32.cpp +++ b/example/53_layernorm2d_bwd/layernorm2d_bwd_fp32.cpp @@ -19,6 +19,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using DYDataType = float; using XDataType = float; using GammaDataType = float; diff --git a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp index dcbb472118..b5245a5785 100644 --- a/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp +++ b/example/54_groupnorm_bwd/groupnorm_bwd_fp32.cpp @@ -19,6 +19,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using DYDataType = float; using XDataType = float; using GammaDataType = float; diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp index 63343df3a8..bad76fc4f4 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp @@ -20,6 +20,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; @@ -221,8 +226,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b0_tensors_device.emplace_back(std::make_unique( sizeof(B0DataType) * problem_size.Ns[i] * problem_size.Ks[i])); - b1_tensors_device.emplace_back( - std::make_unique(sizeof(B1DataType) * problem_size.Ns[i])); + b1_tensors_device.emplace_back(std::make_unique( + sizeof(B1DataType) * problem_size.Ns[i] * problem_size.Ks[i])); d0_tensors_device.emplace_back( std::make_unique(sizeof(D0DataType) * problem_size.Ns[i])); diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp index 78f7d954f0..83f7eab9b2 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_fp16.cpp @@ -20,6 +20,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp index d40d09540f..45c61a7a79 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp @@ -20,6 +20,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp index 102b7f50de..9a245f574f 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp @@ -20,6 +20,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp index aeaa5fe776..edadaac3db 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp index 9363953a6e..0cecbf8cc7 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp @@ -20,6 +20,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp index a599f9d032..1240be87d2 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp @@ -20,6 +20,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp index d7e316e1e0..6530d8cb45 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fastgelu_bf16_i8.cpp @@ -20,6 +20,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp index 2a44c8ad2a..6bae806b99 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_fp16.cpp @@ -18,6 +18,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp index 83cc61284e..199cb6288f 100644 --- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp +++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp @@ -20,6 +20,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp index a9a30b4c27..c522fd56fa 100644 --- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp +++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp16.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/numeric.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp index 4f7414abfa..7eed258191 100644 --- a/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp +++ b/example/61_contraction_multi_ABD/contraction_multi_ABD_xdl_fp8.cpp @@ -19,6 +19,10 @@ #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/numeric.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using Row = ck::tensor_layout::gemm::RowMajor; using Bypass = ck::tensor_layout::BypassLayoutVerification; diff --git a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp index 2710dd6b63..125d592835 100644 --- a/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp +++ b/example/62_convnd_activ/binary/convnd_bwd_data_xdl_bilinear_residual_fp16.cpp @@ -22,6 +22,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp index cb37ebf575..382640fdc3 100644 --- a/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp +++ b/example/62_convnd_activ/binary/convnd_bwd_weight_xdl_bilinear_residual_fp16.cpp @@ -21,6 +21,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp b/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp index 616d0cc9e8..610a805816 100644 --- a/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp +++ b/example/62_convnd_activ/binary/convnd_fwd_xdl_bilinear_residual_fp16.cpp @@ -21,6 +21,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -263,4 +267,8 @@ bool run_grouped_conv(bool do_verification, #include "../run_convnd_activ_example.inc" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); } diff --git a/example/62_convnd_activ/convinvscale/convnd_fwd_convinvscale_common.hpp b/example/62_convnd_activ/convinvscale/convnd_fwd_convinvscale_common.hpp index 4b2ebf8484..91a73b56a3 100644 --- a/example/62_convnd_activ/convinvscale/convnd_fwd_convinvscale_common.hpp +++ b/example/62_convnd_activ/convinvscale/convnd_fwd_convinvscale_common.hpp @@ -22,6 +22,10 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ConvInvscale = ck::tensor_operation::element_wise::ConvInvscale; +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + void print_helper_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" diff --git a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp index 0a802ee27d..2aae48d994 100644 --- a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp +++ b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp @@ -22,6 +22,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp index 3266c55d7c..17be982924 100644 --- a/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp +++ b/example/62_convnd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp @@ -21,6 +21,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp index bf560f8a43..79f14992c2 100644 --- a/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp +++ b/example/62_convnd_activ/convscale/convnd_fwd_convscale_common.hpp @@ -22,6 +22,10 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ConvScale = ck::tensor_operation::element_wise::ConvScale; +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + void print_helper_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" diff --git a/example/62_convnd_activ/convscale_add/convnd_fwd_convscale_add_common.hpp b/example/62_convnd_activ/convscale_add/convnd_fwd_convscale_add_common.hpp index b588d71087..c9b59c1e6d 100644 --- a/example/62_convnd_activ/convscale_add/convnd_fwd_convscale_add_common.hpp +++ b/example/62_convnd_activ/convscale_add/convnd_fwd_convscale_add_common.hpp @@ -17,6 +17,10 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ConvScaleAdd = ck::tensor_operation::element_wise::ConvScaleAdd; +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + void print_helper_msg() { std::cout << "arg1: verification (0=no, 1=yes)\n" diff --git a/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp b/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp index f521c51d67..b99c91de1c 100644 --- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp +++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp @@ -23,6 +23,10 @@ #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/utility/type.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + namespace ew = ck::tensor_operation::element_wise; using PassThrough = ew::PassThrough; diff --git a/example/62_convnd_activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp b/example/62_convnd_activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp index d2dacc2052..17c162305e 100644 --- a/example/62_convnd_activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp +++ b/example/62_convnd_activ/convscale_relu/convnd_fwd_convscale_relu_common.hpp @@ -15,6 +15,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ConvScaleRelu = ck::tensor_operation::element_wise::ConvScaleRelu; diff --git a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp index 4af7f4535a..de5cd96951 100644 --- a/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp +++ b/example/62_convnd_activ/dynamic_unary/convnd_fwd_activ_dynamic_unary_common.hpp @@ -23,6 +23,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp b/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp index 566aa50d23..edbca0c7a1 100644 --- a/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp +++ b/example/62_convnd_activ/multi_AB/convnd_fwd_activ_multi_ab_common.hpp @@ -21,6 +21,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; template diff --git a/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp b/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp index dd171d9ed3..67f8b3e1a7 100644 --- a/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp +++ b/example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp @@ -23,6 +23,10 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + constexpr ck::index_t NDimSpatial = 3; using InDataType = ck::half_t; using WeiDataType = ck::half_t; diff --git a/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp b/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp index 659cc3554d..4453cb1ddf 100644 --- a/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp +++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using XDataType = ck::half_t; using GammaDataType = ck::half_t; using BetaDataType = ck::half_t; diff --git a/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp b/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp index 415635a0e3..e9bd64d9ec 100644 --- a/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp +++ b/example/63_layernorm4d_fwd/layernorm4d_fwd_splitk_fp16.cpp @@ -3,6 +3,10 @@ #include "common.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + using XDataType = ck::half_t; using GammaDataType = ck::half_t; using BetaDataType = ck::half_t; diff --git a/example/64_fpAintB_gemm/common.hpp b/example/64_fpAintB_gemm/common.hpp index 4fb4c41d05..65b2b1b6a3 100644 --- a/example/64_fpAintB_gemm/common.hpp +++ b/example/64_fpAintB_gemm/common.hpp @@ -22,6 +22,11 @@ #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::make_ParallelTensorFunctor; +using ::ck::Tensor; + struct ProblemSize final { ck::index_t M = 3840; diff --git a/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp b/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp index 54abab2f60..8b71caa5cd 100644 --- a/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp +++ b/example/65_gemm_multiply_multiply/gemm_add_add_wmma_fp16.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp b/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp index fe8fd9c100..6c8c711da1 100644 --- a/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp +++ b/example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp index 8b8cee9e52..24b69fdd8d 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp16_bpreshuffle.cpp @@ -22,6 +22,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp index 43637e4a1f..0982ef9830 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp index 3b21f95119..6fde02553a 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_ab_scale.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp index d64266bccf..3fd2a2ef4f 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_blockscale_bpreshuffle.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp index 3ee4955ae4..3c2499eafa 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp index cc01d01e64..3530ca828b 100644 --- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp +++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_int8.cpp @@ -21,6 +21,11 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::hip_check_error; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp index 2cb2dc17f4..aeb34d88fd 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp index bca5ffec78..eb568950c3 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8_blockscale.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp index d14885e7f2..f31ecae7e7 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp index d80c75abe8..974b0ba466 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp index 02369f344e..66118fb1d1 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp index cafea72559..617bb2461d 100644 --- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp +++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp @@ -21,6 +21,10 @@ #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc index b08d12de86..9a0736fe33 100644 --- a/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc +++ b/example/66_complex_contraction_bilinear/run_complex_contraction_bilinear_example.inc @@ -15,6 +15,10 @@ #include "ck/library/utility/numeric.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + int run_complex_contraction_bilinear_example(int argc, char* argv[]) { bool do_verification = true; diff --git a/example/67_gemm_microscaling/gemm_mx_common.hpp b/example/67_gemm_microscaling/gemm_mx_common.hpp index 2d0585c880..b056e7358d 100644 --- a/example/67_gemm_microscaling/gemm_mx_common.hpp +++ b/example/67_gemm_microscaling/gemm_mx_common.hpp @@ -20,6 +20,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/library/utility/host_tensor.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp index 0c51a24679..9962e80fca 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4.cpp @@ -21,6 +21,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp index b6d5d8f211..8354c25e22 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bns.cpp @@ -21,6 +21,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp index ebb73ca7e0..74006909ee 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp @@ -21,6 +21,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp index 61a63b47ac..d3ecbd226a 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4.cpp @@ -21,6 +21,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp index 2670468c4b..62d9868881 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bns.cpp @@ -21,6 +21,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp index c3454be84a..310737eaa8 100644 --- a/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp +++ b/example/67_gemm_microscaling/moe_gemm2_xdl_mx_fp4_bpreshuffle.cpp @@ -21,6 +21,10 @@ #include "ck/library/utility/fill.hpp" #include "ck/utility/blkgemmpipe_scheduler.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/68_gemm_add/common.hpp b/example/68_gemm_add/common.hpp index 38e77a160f..a83e540cc6 100644 --- a/example/68_gemm_add/common.hpp +++ b/example/68_gemm_add/common.hpp @@ -29,6 +29,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/69_gemm_add_relu/common.hpp b/example/69_gemm_add_relu/common.hpp index 311cbb2dfb..9d2bb0ba5c 100644 --- a/example/69_gemm_add_relu/common.hpp +++ b/example/69_gemm_add_relu/common.hpp @@ -29,6 +29,10 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" +using ::ck::DeviceMem; +using ::ck::HostTensorDescriptor; +using ::ck::Tensor; + template using S = ck::Sequence; diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index d5f164c40f..1c57a03c97 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -2,6 +2,7 @@ // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once #include "ck_tile/host/permute_pk_int4.hpp" +#include "ck_tile/host/tensor_shuffle_utils.hpp" template static constexpr inline auto is_row_major(Layout layout_) @@ -172,69 +173,6 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, return ave_time; } -template -auto shuffle_b(const ck_tile::HostTensor& t) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - - if(ck_tile::is_gfx12_supported()) - { - constexpr int divisor = 2; - constexpr int kABK1PerLane = 8; - constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane; - ck_tile::HostTensor t_view({n_ / GemmConfig::N_Warp_Tile, - GemmConfig::N_Warp_Tile, - k_ / GemmConfig::K_Warp_Tile, - kABK0PerLane, - divisor, - kABK1PerLane}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5}); - } - else - { - int divisor = 1; - if(ck_tile::is_gfx11_supported()) - { - divisor = 1; - } - else - { - assert(is_wave32() == false); - divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4; - } - ck_tile::HostTensor t_view({n_ / GemmConfig::N_Warp_Tile, - GemmConfig::N_Warp_Tile, - k_ / GemmConfig::K_Warp_Tile, - divisor, - GemmConfig::K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); - } -} - -template -auto shuffle_b_permuteN(const ck_tile::HostTensor& t) -{ - assert(t.get_lengths().size() == 2); - - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4; - constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp; - ck_tile::HostTensor t_view({n_ / GemmConfig::N_Tile, - GemmConfig::N_Warp, - GemmConfig::N_Warp_Tile, - NRepeat, - k_ / GemmConfig::K_Warp_Tile, - divisor, - GemmConfig::K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6}); -} - template bool do_verify(const ck_tile::HostTensor& c_m_n_dev_result, const ck_tile::HostTensor& c_m_n_ref, @@ -309,8 +247,8 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser, if(init_method == 0) { - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); - ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + ck_tile::FillUniformDistribution{-2.f, 2.f}(a_m_k); + ck_tile::FillUniformDistribution{-2.f, 2.f}(b_k_n); } else if(init_method == 1) { diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md index 32d7d0516f..c3fd7d4f82 100644 --- a/example/ck_tile/17_grouped_gemm/README.md +++ b/example/ck_tile/17_grouped_gemm/README.md @@ -26,7 +26,7 @@ Multi-D operations extend the standard GEMM operation by supporting additional e - **Implementation**: Available in `grouped_gemm_multi_d.cpp` - **Operation**: E = C × D₀ × D₁ (where C = A × B is the standard GEMM result) - **Configuration**: Uses `GemmConfigV3`, `GemmConfigV4`, `GemmConfigMemory` template configuration with 2 D tensors -- **Data Types**: Supports fp16, fp8 +- **Data Types**: Supports fp16, bf16, fp8 - **Benefits**: Enables complex operations like scaling, activation functions, or other elementwise transformations in a single kernel call - **Build Target**: `make tile_example_grouped_gemm_multi_d -j` @@ -61,7 +61,7 @@ args: -a_layout A tensor data layout - (Default: Row). -b_layout B tensor data layout - (Default: Col). -c_layout C tensor data layout - (Default: Row). - -prec data type. fp16/fp8 - (Default: fp16). + -prec data type. fp16/bf16/fp8 - (Default: fp16). -validate 0. No validation, 1. Validation on CPU. (Default: 1). -warmup Number of iterations before benchmark the kernel. (Default: 10). -repeat Number of iterations to benchmark the kernel. (Default: 100). diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index f5335c3ec0..52391cde62 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -333,6 +333,11 @@ int run_grouped_gemm_example(int argc, char* argv[]) return run_gemm_example_prec_type, ck_tile::half_t>( a_layout, b_layout, argc, argv); } + else if(data_type == "bf16") + { + return run_gemm_example_prec_type, ck_tile::bf16_t>( + a_layout, b_layout, argc, argv); + } else if(data_type == "fp8") { return run_gemm_example_prec_type, ck_tile::fp8_t>( diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp index 049957cbfd..df8fa8fde0 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp @@ -66,6 +66,15 @@ struct GemmTypeConfig using CDataType = ck_tile::half_t; }; +template <> +struct GemmTypeConfig +{ + using ADataType = ck_tile::bf16_t; + using BDataType = ck_tile::bf16_t; + using AccDataType = float; + using CDataType = ck_tile::bf16_t; +}; + struct GemmConfigBase { static constexpr bool kPadM = false; @@ -323,49 +332,6 @@ inline std::size_t get_workspace_size(const std::vector& gem return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<>); } -template -auto shuffle_b(const ck_tile::HostTensor& t) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - - if(ck_tile::is_gfx12_supported()) - { - constexpr int divisor = 2; - constexpr int kABK1PerLane = 8; - constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane; - ck_tile::HostTensor t_view({n_ / GemmConfig::N_Warp_Tile, - GemmConfig::N_Warp_Tile, - k_ / GemmConfig::K_Warp_Tile, - kABK0PerLane, - divisor, - kABK1PerLane}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5}); - } - else - { - int divisor = 1; - if(ck_tile::is_gfx11_supported()) - { - divisor = 1; - } - else - { - assert(is_wave32() == false); - divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4; - } - ck_tile::HostTensor t_view({n_ / GemmConfig::N_Warp_Tile, - GemmConfig::N_Warp_Tile, - k_ / GemmConfig::K_Warp_Tile, - divisor, - GemmConfig::K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); - } -} - template , ck_tile::half_t>( a_layout, b_layout, argc, argv); } + else if(data_type == "bf16") + { + return run_gemm_example_prec_type, ck_tile::bf16_t>( + a_layout, b_layout, argc, argv); + } else if(data_type == "fp8") { return run_gemm_example_prec_type, ck_tile::fp8_t>( diff --git a/example/ck_tile/18_flatmm/CMakeLists.txt b/example/ck_tile/18_flatmm/CMakeLists.txt index d2ad442248..43789750d0 100644 --- a/example/ck_tile/18_flatmm/CMakeLists.txt +++ b/example/ck_tile/18_flatmm/CMakeLists.txt @@ -14,9 +14,16 @@ if(has_supported_gpu) add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp) add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp) add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp) - add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp) # TODO: 950 only - set(EXAMPLE_FLATMM_COMPILE_OPTIONS) + include(mxgemm/mx_flatmm_instance.cmake) + mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES) + message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}") + add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES}) + target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm) + + # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations + # ... because they are auto-generated + set(EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template) set(EXAMPLE_MOE_FLATMM_COMPILE_OPTIONS) if(CK_USE_OCP_FP8) diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp index 9155b27dba..cf05abd51c 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.cpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -47,7 +47,7 @@ static constexpr inline auto is_row_major(Layout layout_) // mfma_type, 0:32x32, 1:16x16 template -auto shuffle_b(const ck_tile::HostTensor& t) +auto shuffle_b_v0(const ck_tile::HostTensor& t) { assert(t.get_lengths().size() == 2); int n_ = t.get_lengths()[1]; diff --git a/example/ck_tile/18_flatmm/moe_flatmm.cpp b/example/ck_tile/18_flatmm/moe_flatmm.cpp index 4db6a1171f..064522a360 100644 --- a/example/ck_tile/18_flatmm/moe_flatmm.cpp +++ b/example/ck_tile/18_flatmm/moe_flatmm.cpp @@ -29,7 +29,7 @@ static constexpr inline auto is_row_major(Layout layout_) } template -auto shuffle_b(const ck_tile::HostTensor& t) +auto flatmm_shuffle_b(const ck_tile::HostTensor& t) { assert(t.get_lengths().size() == 2); int n_ = t.get_lengths()[1]; diff --git a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp index 0474e4b1d6..14976b8093 100644 --- a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp +++ b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp @@ -20,211 +20,6 @@ static constexpr inline auto is_row_major(Layout layout_) ck_tile::tensor_layout::gemm::RowMajor>>{}; } -template -float mx_flatmm_calc(const ck_tile::ScaleFlatmmHostArgs& args, - const ck_tile::stream_config& s) -{ - using CodegenFlatmmShape = ck_tile::TileGemmShape< - ck_tile::sequence, - ck_tile::sequence, - ck_tile::sequence>; - - using TilePartitioner = - ck_tile::GemmSpatiallyLocalTilePartitioner; - - using Traits = ck_tile::TileGemmTraits; - - using CodegenGemmTraits = ck_tile::TileGemmUniversalTraits; - - using ComputeDataType = ADataType; - static_assert(sizeof(ComputeDataType) >= sizeof(BDataType), - "mixed_prec_flatmm requires ADataType is a wider type than BDataType"); - - using GemmPipelineProblem = ck_tile::GemmPipelineProblem; - - using BaseGemmPipeline = ck_tile::BaseFlatmmPipelineAGmemBGmemCRegV1; - - const ck_tile::index_t k_grain = args.k_batch * FlatmmConfig::K_Tile; - const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * FlatmmConfig::K_Tile; - const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); - const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); - const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - float ave_time{0}; - - const auto Run = [&](const auto has_hot_loop_, - const auto tail_number_, - const auto memory_operation_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr auto scheduler = FlatmmConfig::Scheduler; - constexpr auto memory_operation = memory_operation_.value; - - constexpr int BlockedXDLN_PerWarp = 2; // determined by scale shuffle pattern - - using CodegenPipelineProblem = ck_tile::MXFlatmmPipelineProblem; - - using CodegenMXFlatmmPipeline = - ck_tile::MXF4FlatmmPipelineAGmemBGmemCRegV1; - - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem>; - - using Kernel = - ck_tile::MXFlatmmKernel; - - auto kargs = Kernel::MakeKernelArgs(args); - - const dim3 grids = Kernel::GridSize(kargs); - constexpr dim3 blocks = Kernel::BlockSize(); - - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); - } - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args:" << CodegenFlatmmShape::GetName() << "\n" - << "Shape: " << CodegenFlatmmShape::GetName() << "\n" - << "problem: " << CodegenPipelineProblem::GetName() << "\n" - << "pipeline: " << CodegenMXFlatmmPipeline::GetName() << "\n" - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << std::endl; - } - - // Declare rotating_mem_ptr here so it stays in scope until it is needed - std::unique_ptr> rotating_mem_ptr; - std::function preprocess; - - auto clear_gemm_output = [&]() { - if(args.k_batch > 1) - hipGetErrorString(hipMemsetAsync( - args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); - }; - - if(s.flush_cache_) - { - std::cout << "Flushing cache..." << std::endl; - constexpr ck_tile::index_t APackedSize = ck_tile::numeric_traits::PackedSize; - constexpr ck_tile::index_t BPackedSize = ck_tile::numeric_traits::PackedSize; - - ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( - args.M, args.K, args.stride_A, is_row_major(ALayout{}))); - ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( - args.K, args.N, args.stride_B, is_row_major(BLayout{}))); - - auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; - auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; - - rotating_mem_ptr = std::make_unique>( - kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer); - rotating_mem_ptr->Print(); - - preprocess = [&]() { - ck_tile::flush_icache(); - rotating_mem_ptr->Next(); - clear_gemm_output(); - }; - } - else - { - preprocess = clear_gemm_output; - } - - ave_time = ck_tile::launch_kernel_time_mask( - s, - preprocess, - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - return ave_time; - }; - - const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { - if(args.k_batch == 1) - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } - else - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } - }; - BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); - return ave_time; -} - template ( - args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50}); + using FlatmmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using Traits = ck_tile::TileGemmTraits; + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; + + using BaseFlatmmPipeline = ck_tile::BaseFlatmmPipelineAGmemBGmemCRegV1; + + const ck_tile::index_t k_grain = args.k_batch * FlatmmConfig::K_Tile; + const ck_tile::index_t k_split = (K + k_grain - 1) / k_grain * FlatmmConfig::K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(k_split); + const bool has_hot_loop = BaseFlatmmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseFlatmmPipeline::GetBlockLoopTailNum(num_loop); + + float ave_time = BaseFlatmmPipeline::template TailHandler( + [&](auto has_hot_loop_, auto tail_num_) { + constexpr auto has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_num_v = tail_num_.value; + auto invoke_splitk_path = [&](auto split_k_) { + return mx_flatmm_calc( + args, + ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50}); + }; + return (args.k_batch == 1) ? invoke_splitk_path(std::false_type{}) + : invoke_splitk_path(std::true_type{}); + }, + has_hot_loop, + tail_num); constexpr int APackedSize = ck_tile::numeric_traits::PackedSize; constexpr int BPackedSize = ck_tile::numeric_traits::PackedSize; @@ -296,9 +136,9 @@ float invoke_mx_flatmm(ck_tile::DeviceMem& a_dev_buf, float tflops = static_cast(flop) / 1.E9 / ave_time; float gb_per_sec = num_byte / 1.E6 / ave_time; - std::cout << "Run MXFP4_Flatmm kernel " // - << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A - << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time + std::cout << "Run " << ck_tile::gemm_prec_str() << " Flatmm kernel " // + << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << stride_A + << " StrideB = " << stride_B << " StrideC = " << stride_C << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl; return ave_time; @@ -332,42 +172,47 @@ auto create_args(int argc, char* argv[]) return std::make_tuple(result, arg_parser); } -template -void preShuffleWeight(const IterSrc src, IterDst dst, int N, int K) +template +auto preShuffleWeight(ck_tile::HostTensor& src) { - int KPack = 16; - int NLane = FlatmmConfig::N_Warp_Tile; - int KLane = 64 / NLane; - int K_pk = K / 2; - int K0 = K_pk / (KLane * KPack); + auto src_lengths = src.get_lengths(); + const int K = src_lengths[0]; + const int N = src_lengths[1]; + constexpr int packed_size = ck_tile::numeric_traits::PackedSize; + int KPack = 16 * packed_size; // fp4:32 or fp8:16 + int NLane = N_Warp_Tile; + int KLane = 64 / NLane; + int K0 = K / (KLane * KPack); + + ck_tile::HostTensor shuffled(ck_tile::HostTensorDescriptor({N * K}, {1})); + // K -> K0 KLane KPack // N -> N0 NLane // N, K -> N0 K0 KLane NLane KPack - int tempk; for(int n = 0; n < N; ++n) { - for(int k = 0; k < K_pk; ++k) + for(int k = 0; k < K; k += packed_size) { int n0 = n / NLane; int n1 = n % NLane; - int k0 = k / (KLane * KPack); - tempk = k % (KLane * KPack); - int k1 = tempk / KPack; - int k2 = tempk % KPack; + int k0 = k / (KLane * KPack); + int tempk = k % (KLane * KPack); + int k1 = tempk / KPack; + int k2 = tempk % KPack; int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane + k1 * KPack * NLane + n1 * KPack + k2; - dst[outputIndex] = src[n * K_pk + k]; + shuffled(outputIndex) = src(k, n); } } + return shuffled; } -template -auto preShuffleScale(Src& src) +template +auto preShuffleScale(ck_tile::HostTensor& src) { - using dtype = typename Src::Data::value_type; auto src_lengths = src.get_lengths(); const auto MN = KLast ? src_lengths[0] : src_lengths[1]; const auto K = KLast ? src_lengths[1] : src_lengths[0]; @@ -421,7 +266,6 @@ auto preShuffleScale(Src& src) #include "run_mx_flatmm.inc" -template int run_mx_flatmm_example(int argc, char* argv[]) { auto [result, arg_parser] = create_args(argc, argv); @@ -438,32 +282,31 @@ int run_mx_flatmm_example(int argc, char* argv[]) if(a_layout == "R" && b_layout == "C") { - if(mx_prec == "fp4xfp4") + if(mx_prec == "fp4" || mx_prec == "fp4xfp4") { if(persistent_opt == 0) - { - run_mx_flatmm_with_layouts(argc, argv, Row{}, Col{}, Row{}); - } + return run_mx_flatmm_with_layouts(argc, argv, Row{}, Col{}, Row{}); else - { - run_mx_flatmm_with_layouts(argc, argv, Row{}, Col{}, Row{}); - } + throw std::runtime_error("Only non-persistent kernels are supported currently!"); } - else if(mx_prec == "fp6xfp6") + else if(mx_prec == "fp6" || mx_prec == "fp6xfp6") { - throw std::runtime_error("Only support fp4xfp4 now!"); + throw std::runtime_error("fp6xfp6 is not supported."); } - else if(mx_prec == "fp8xfp8") + else if(mx_prec == "fp8" || mx_prec == "fp8xfp8") { - throw std::runtime_error("Only support fp4xfp4 now!"); + if(persistent_opt == 0) + return run_mx_flatmm_with_layouts(argc, argv, Row{}, Col{}, Row{}); + else + throw std::runtime_error("Only support non-persistent kernel now!"); } else { @@ -474,7 +317,6 @@ int run_mx_flatmm_example(int argc, char* argv[]) { throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!"); } - return -1; } int main(int argc, char* argv[]) @@ -487,7 +329,7 @@ int main(int argc, char* argv[]) int warp_tile = arg_parser.get_int("warp_tile"); if(warp_tile == 0) { - return !run_mx_flatmm_example(argc, argv); + return run_mx_flatmm_example(argc, argv); } else if(warp_tile == 1) { diff --git a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.hpp b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.hpp index b47d3a95ab..248cf28341 100644 --- a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.hpp +++ b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.hpp @@ -12,4 +12,87 @@ #include "ck_tile/ops/flatmm.hpp" #include "ck_tile/ops/gemm.hpp" -#include "mxfp4_flatmm.hpp" +// GEMM config with 16x16 warp tile +struct MXfp4_FlatmmConfig16 +{ + static constexpr ck_tile::index_t M_Tile = 128; + static constexpr ck_tile::index_t N_Tile = 512; + static constexpr ck_tile::index_t K_Tile = 256; + + static constexpr ck_tile::index_t M_Warp = 1; + static constexpr ck_tile::index_t N_Warp = 4; + static constexpr ck_tile::index_t K_Warp = 1; + + static constexpr ck_tile::index_t M_Warp_Tile = 16; + static constexpr ck_tile::index_t N_Warp_Tile = 16; + static constexpr ck_tile::index_t K_Warp_Tile = 128; + + static constexpr bool kPadM = false; + static constexpr bool kPadN = false; + static constexpr bool kPadK = false; + + static constexpr bool TransposeC = false; + static constexpr bool UseStructuredSparsity = false; + + static constexpr int kBlockPerCu = 1; + static constexpr int TileParitionerGroupNum = 8; + static constexpr int TileParitionerM01 = 4; + static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Default; + static constexpr ck_tile::index_t NumWaveGroups = 1; + static constexpr bool DoubleSmemBuffer = false; + + static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp; + static constexpr bool TiledMMAPermuteN = false; +}; + +struct MXfp8_FlatmmConfig16 +{ + static constexpr ck_tile::index_t M_Tile = 128; + static constexpr ck_tile::index_t N_Tile = 256; + static constexpr ck_tile::index_t K_Tile = 256; + + static constexpr ck_tile::index_t M_Warp = 1; + static constexpr ck_tile::index_t N_Warp = 4; + static constexpr ck_tile::index_t K_Warp = 1; + + static constexpr ck_tile::index_t M_Warp_Tile = 16; + static constexpr ck_tile::index_t N_Warp_Tile = 16; + static constexpr ck_tile::index_t K_Warp_Tile = 128; + + static constexpr bool kPadM = false; + static constexpr bool kPadN = false; + static constexpr bool kPadK = false; + + static constexpr bool TransposeC = false; + static constexpr bool UseStructuredSparsity = false; + + static constexpr int kBlockPerCu = 1; + static constexpr int TileParitionerGroupNum = 8; + static constexpr int TileParitionerM01 = 4; + static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Default; + static constexpr ck_tile::index_t NumWaveGroups = 1; + static constexpr bool DoubleSmemBuffer = false; + + static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp; + static constexpr bool TiledMMAPermuteN = false; +}; + +template +float mx_flatmm_calc(const ck_tile::ScaleFlatmmHostArgs& args, + const ck_tile::stream_config& s); diff --git a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.cmake b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.cmake new file mode 100644 index 0000000000..63158b807f --- /dev/null +++ b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.cmake @@ -0,0 +1,32 @@ +function(mx_flatmm_instance_generate FILE_LIST) + set(C_DATA_TYPE FP16) + set(A_LAYOUT ROW) + set(B_LAYOUT COL) + set(C_LAYOUT ROW) + set(FLATMM_CONFIG_FP4 "MXfp4_FlatmmConfig16") + set(FLATMM_CONFIG_FP8 "MXfp8_FlatmmConfig16") + + # foreach(PERSISTENT false true) + # TODO: Persistent kernels are disabled due to compilation failures with some LLVM versions. + foreach(PERSISTENT false) + foreach(DATA_TYPE FP4 FP8) + set(FLATMM_CONFIG ${FLATMM_CONFIG_${DATA_TYPE}}) + set(A_DATA_TYPE ${DATA_TYPE}) + set(B_DATA_TYPE ${DATA_TYPE}) + foreach(SPLIT_K false true) + foreach(HAS_HOT_LOOP false true) + foreach(TAIL_NUMBER ODD EVEN) + set(KERNEL_FILE mxgemm/mx_flatmm_instance_${PERSISTENT}_${DATA_TYPE}_${SPLIT_K}_${HAS_HOT_LOOP}_${TAIL_NUMBER}.cpp) + string(TOLOWER ${KERNEL_FILE} KERNEL_FILE) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/mxgemm/mx_flatmm_instance.cpp.in + ${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_FILE} + @ONLY) + list(APPEND ${FILE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_FILE}) + endforeach() + endforeach() + endforeach() + endforeach() + endforeach() + set(${FILE_LIST} ${${FILE_LIST}} PARENT_SCOPE) +endfunction() diff --git a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.cpp.in b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.cpp.in new file mode 100644 index 0000000000..d9fe78b701 --- /dev/null +++ b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.cpp.in @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "mx_flatmm_instance.hpp" + +// clang-format off +#define FLATMM_CONFIG @FLATMM_CONFIG@ +#define A_DATA_TYPE @A_DATA_TYPE@ +#define B_DATA_TYPE @B_DATA_TYPE@ +#define C_DATA_TYPE @C_DATA_TYPE@ +#define A_LAYOUT @A_LAYOUT@ +#define B_LAYOUT @B_LAYOUT@ +#define C_LAYOUT @C_LAYOUT@ +#define PERSISTENT @PERSISTENT@ +#define SPLIT_K @SPLIT_K@ +#define HAS_HOT_LOOP @HAS_HOT_LOOP@ +#define TAIL_NUMBER @TAIL_NUMBER@ +// clang-format on + +using FP4 = ck_tile::pk_fp4_t; +using FP8 = ck_tile::fp8_t; +using FP16 = ck_tile::fp16_t; +using BF16 = ck_tile::bf16_t; + +using ROW = ck_tile::tensor_layout::gemm::RowMajor; +using COL = ck_tile::tensor_layout::gemm::ColumnMajor; + +inline constexpr auto ODD = ck_tile::TailNumber::Odd; +inline constexpr auto EVEN = ck_tile::TailNumber::Even; + +inline constexpr int ScaleGranularityM = 1; +inline constexpr int ScaleGranularityN = 1; +inline constexpr int ScaleGranularityK = 32; +using ScaleM = ck_tile::FlatmmScalePointer; +using ScaleN = ck_tile::FlatmmScalePointer; + +template float mx_flatmm_calc, + /*AccDataType*/ float, + C_DATA_TYPE, + A_LAYOUT, + B_LAYOUT, + /*DsLayout*/ ck_tile::tuple<>, + C_LAYOUT, + ScaleM, + ScaleN, + PERSISTENT, + /*CDEElementWise*/ ck_tile::element_wise::PassThrough, + SPLIT_K, + HAS_HOT_LOOP, + TAIL_NUMBER>(const ck_tile::ScaleFlatmmHostArgs& args, + const ck_tile::stream_config& s); diff --git a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.hpp b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.hpp new file mode 100644 index 0000000000..c7614e9bd4 --- /dev/null +++ b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm_instance.hpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "mx_flatmm.hpp" + +template +using is_row_major_t = ck_tile::bool_constant< + std::is_same_v, ck_tile::tensor_layout::gemm::RowMajor>>; + +template +float mx_flatmm_calc(const ck_tile::ScaleFlatmmHostArgs& args, + const ck_tile::stream_config& s) +{ + using FlatmmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile::sequence>; + + using MXGemmTraits = ck_tile::TileGemmUniversalTraits; + + using ComputeDataType = ADataType; + static_assert(sizeof(ComputeDataType) >= sizeof(BDataType), + "mixed_prec_flatmm requires ADataType is a wider type than BDataType"); + + constexpr auto scheduler = FlatmmConfig::Scheduler; + constexpr auto memory_operation = + Splitk ? ck_tile::memory_operation_enum::atomic_add : ck_tile::memory_operation_enum::set; + + constexpr int BlockedXDLN_PerWarp = 2; // determined by scale shuffle pattern + + using MXPipelineProblem = ck_tile::MXFlatmmPipelineProblem; + + using MXFlatmmPipeline = ck_tile::MXF4FlatmmPipelineAGmemBGmemCRegV1; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + using GemmEpilogue = + ck_tile::CShuffleEpilogue>; + + using Kernel = ck_tile::MXFlatmmKernel; + + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(kargs); + constexpr dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args:" << FlatmmShape::GetName() << "\n" + << "Shape: " << FlatmmShape::GetName() << "\n" + << "problem: " << MXPipelineProblem::GetName() << "\n" + << "pipeline: " << MXFlatmmPipeline::GetName() << "\n" + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + // Declare rotating_mem_ptr here so it stays in scope until it is needed + std::unique_ptr> rotating_mem_ptr; + std::function preprocess; + + auto clear_gemm_output = [&]() { + if(args.k_batch > 1) + hipGetErrorString( + hipMemsetAsync(args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); + }; + + if(s.flush_cache_) + { + std::cout << "Flushing cache..." << std::endl; + constexpr ck_tile::index_t APackedSize = ck_tile::numeric_traits::PackedSize; + constexpr ck_tile::index_t BPackedSize = ck_tile::numeric_traits::PackedSize; + + ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( + args.M, args.K, args.stride_A, is_row_major_t{})); + ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( + args.K, args.N, args.stride_B, is_row_major_t{})); + + auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; + auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; + + rotating_mem_ptr = std::make_unique>( + kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer); + rotating_mem_ptr->Print(); + + preprocess = [&]() { + ck_tile::flush_icache(); + rotating_mem_ptr->Next(); + clear_gemm_output(); + }; + } + else + { + preprocess = clear_gemm_output; + } + + return ck_tile::launch_kernel_time_mask( + s, + preprocess, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} diff --git a/example/ck_tile/18_flatmm/mxgemm/mxfp4_flatmm.hpp b/example/ck_tile/18_flatmm/mxgemm/mxfp4_flatmm.hpp deleted file mode 100644 index 4ef627969c..0000000000 --- a/example/ck_tile/18_flatmm/mxgemm/mxfp4_flatmm.hpp +++ /dev/null @@ -1,40 +0,0 @@ - -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck_tile/core.hpp" - -// GEMM config with 16x16 warp tile -struct MXfp4_FlatmmConfig16 -{ - static constexpr ck_tile::index_t M_Tile = 128; - static constexpr ck_tile::index_t N_Tile = 512; - static constexpr ck_tile::index_t K_Tile = 256; - - static constexpr ck_tile::index_t M_Warp = 1; - static constexpr ck_tile::index_t N_Warp = 4; - static constexpr ck_tile::index_t K_Warp = 1; - - static constexpr ck_tile::index_t M_Warp_Tile = 16; - static constexpr ck_tile::index_t N_Warp_Tile = 16; - static constexpr ck_tile::index_t K_Warp_Tile = 128; - - static constexpr bool kPadM = false; - static constexpr bool kPadN = false; - static constexpr bool kPadK = false; - - static constexpr bool TransposeC = false; - static constexpr bool UseStructuredSparsity = false; - - static constexpr int kBlockPerCu = 1; - static constexpr int TileParitionerGroupNum = 8; - static constexpr int TileParitionerM01 = 4; - static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Default; - static constexpr ck_tile::index_t NumWaveGroups = 1; - static constexpr bool DoubleSmemBuffer = false; - - static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp; - static constexpr bool TiledMMAPermuteN = false; -}; diff --git a/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc b/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc index bc24427780..dd522bbcb6 100644 --- a/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc +++ b/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc @@ -88,10 +88,7 @@ int run_mx_flatmm_with_layouts(int argc, throw std::runtime_error("wrong! Unexpected init_method"); } - ck_tile::HostTensor b_shuffled_host( - ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout))); - preShuffleWeight(b_origin_host.begin(), b_shuffled_host.begin(), N, K); - + const auto b_shuffled_host = preShuffleWeight(b_origin_host); const auto scale_a_shuffled = preShuffleScale(scale_a); const auto scale_b_shuffled = preShuffleScale(scale_b); @@ -163,5 +160,5 @@ int run_mx_flatmm_with_layouts(int argc, std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl; } - return pass; + return pass ? 0 : -1; } diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 69bf39f670..4063fe284e 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -103,7 +103,7 @@ int run_flatmm_example_with_layouts(int argc, } else { - return shuffle_b(b_origin_host); + return shuffle_b_v0(b_origin_host); } }(); ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes()); diff --git a/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc b/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc index fbab5b6d0e..3bb039aae8 100644 --- a/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc @@ -3,23 +3,6 @@ #pragma once -// mfma_type, 0:32x32, 1:16x16 -template -auto shuffle_b(const ck_tile::HostTensor& t) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - constexpr int divisor = FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4; - ck_tile::HostTensor t_view({n_ / FlatmmConfig::N_Warp_Tile, - FlatmmConfig::N_Warp_Tile, - k_ / FlatmmConfig::K_Warp_Tile, - divisor, - FlatmmConfig::K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); -} - template auto calculate_rtol_atol(const ck_tile::index_t K, const ck_tile::index_t kbatch, diff --git a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc index 9e0cbda0c0..d898ed2f29 100644 --- a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc @@ -114,7 +114,7 @@ int run_moe_gemm_example_with_layouts(int argc, ck_tile::FillUniformDistribution{0.0f, 1.0f}(a_m_k_tensor); ck_tile::FillUniformDistribution{-.5f, .5f}(b_k_n_tensor); - auto b_shuffle_host = shuffle_b(b_k_n_tensor); + auto b_shuffle_host = flatmm_shuffle_b(b_k_n_tensor); std::cout << "moe_flatmm:" // << "\n num_experts: " << experts << "\n num_tokens: " << num_tokens diff --git a/example/ck_tile/36_pooling/README.md b/example/ck_tile/36_pooling/README.md index ab49b57095..4417e03734 100644 --- a/example/ck_tile/36_pooling/README.md +++ b/example/ck_tile/36_pooling/README.md @@ -2,6 +2,116 @@ This folder contains example for the pooling operator using ck_tile tile-programming implementation. Currently the pooling kernel only supports 2D and 3D pooling. +## Tensor Descriptor Transformations + +The pooling kernel transforms the input tensor into 2D format suitable for reduction. This section explains the transformation pipeline for both 2D and 3D pooling operations. + +### 3D Pooling Transformations + +For 3D pooling, the input tensor has shape `(N, D, H, W, C)` where: +- `N`: batch size +- `D`: depth dimension +- `H`: height dimension +- `W`: width dimension +- `C`: channel dimension + +The transformations convert this 5D tensor into a 2D tensor where rows represent output positions (M) and columns represent pooling window elements (K). + +```mermaid +graph TD + %% Input Tensor: (N, D, H, W, C) + Input["Input Tensor
(N, D, H, W, C)"] + style Input fill:#e1f5fe + + %% Pass-through N dimension + PassN["Pass-through N
(batch size)"] + style PassN fill:#f3e5f5 + Input --> PassN + + %% Pad spatial dimensions + PadD["Pad D
(depth with left/right padding)"] + style PadD fill:#fff9c4 + Input --> PadD + + PadH["Pad H
(height with left/right padding)"] + style PadH fill:#fff9c4 + Input --> PadH + + PadW["Pad W
(width with left/right padding)"] + style PadW fill:#fff9c4 + Input --> PadW + + %% Pass-through C dimension + PassC["Pass-through C
(channels)"] + style PassC fill:#f3e5f5 + Input --> PassC + + %% Embed sliding windows + EmbedD["Embed D
window(Z) × output_positions(Dₒ)"] + style EmbedD fill:#fff3e0 + PadD --> EmbedD + + EmbedH["Embed H
window(Y) × output_positions(Hₒ)"] + style EmbedH fill:#fff3e0 + PadH --> EmbedH + + EmbedW["Embed W
window(X) × output_positions(Wₒ)"] + style EmbedW fill:#fff3e0 + PadW --> EmbedW + + %% Merge into 2D matrix + MergeM["Merge M
(N, Dₒ, Hₒ, Wₒ, C)
→ output positions"] + style MergeM fill:#e8f5e9 + PassN --> MergeM + EmbedD --> MergeM + EmbedH --> MergeM + EmbedW --> MergeM + PassC --> MergeM + + MergeK["Merge K
(Z, Y, X)
→ window elements"] + style MergeK fill:#e8f5e9 + EmbedD --> MergeK + EmbedH --> MergeK + EmbedW --> MergeK + + %% Final padding for block alignment + PadM["Right-pad M
(for block alignment)"] + style PadM fill:#fff9c4 + MergeM --> PadM + + PadK["Right-pad K
(for block alignment)"] + style PadK fill:#fff9c4 + MergeK --> PadK + + %% Result + Result["2D Matrix
(M × K)"] + style Result fill:#c8e6c9 + PadM --> Result + PadK --> Result +``` + +**Transformation Steps:** +1. **Padding**: Apply left and right padding to spatial dimensions (D, H, W) to handle boundary conditions +2. **Sliding Windows**: Use embed transforms to create sliding windows across each spatial dimension, expanding each dimension into (window_size, output_positions) +3. **Reshaping**: Merge all dimensions into a 2D matrix where: + - M dimension = N × Dₒ × Hₒ × Wₒ × C (total output positions) + - K dimension = Z × Y × X (elements per pooling window) +4. **Block Alignment**: Apply right padding to ensure M and K dimensions are aligned to block size + +### 2D Pooling Transformations + +2D pooling follows the same transformation pipeline but operates on 4D tensors with shape `(N, H, W, C)`. The process is identical except: +- Only H and W dimensions are padded and embedded +- K dimension merges only (Y, X) window elements +- M dimension merges (N, Hₒ, Wₒ, C) + +### Output Tensor Transformations + +The output tensor transformations are simpler: +- Merge all output dimensions (N, Dₒ/Hₒ, Wₒ, C) into a single M dimension +- Apply right padding for block alignment +- The result is a 1D tensor that maps directly to the M dimension of the computation matrix + ## build ``` # in the root of ck_tile diff --git a/example/ck_tile/36_pooling/pool3d.cpp b/example/ck_tile/36_pooling/pool3d.cpp index 092020c4ae..498694e2f5 100644 --- a/example/ck_tile/36_pooling/pool3d.cpp +++ b/example/ck_tile/36_pooling/pool3d.cpp @@ -31,8 +31,8 @@ auto create_args(int argc, char* argv[]) .insert("RightPy", "1", "right padding h") .insert("RightPx", "1", "right padding w") .insert("v", "1", "cpu validation or not") - .insert("warmup", "0", "cold iter") - .insert("repeat", "1", "hot iter"); + .insert("warmup", "20", "cold iter") + .insert("repeat", "100", "hot iter"); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -120,10 +120,10 @@ bool run(const ck_tile::ArgParser& arg_parser) in_buf.ToDevice(in.data()); using ReduceOp = ck_tile::ReduceOp::Max; - using BlockWarps = ck_tile::sequence<4, 1>; - using BlockTile = ck_tile::sequence<128, 128>; - using WarpTile = ck_tile::sequence<32, 128>; - using ThreadTile = ck_tile::sequence<8, 8>; + using BlockWarps = ck_tile::sequence<1, 1>; + using BlockTile = ck_tile::sequence<128, 1>; + using WarpTile = ck_tile::sequence<128, 1>; + using ThreadTile = ck_tile::sequence<2, 1>; using Shape = ck_tile::PoolShape; using Problem = ck_tile::PoolProblem +using GemmConfig = GemmConfigQuant; + +void aquant_quantgrouped_instance_factory( + std::unordered_map>& lut) +{ + using QuantGroupSize = ck_tile::QuantGroupShape>; + lut[hash_multiple_strings({"fp8", "aquant", "1x1x128"})] = [](const ck_tile::ArgParser& + arg_parser) { + using TypeConfig = + decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::AQuantGrouped>(arg_parser); + }; + lut[hash_multiple_strings({"bf8", "aquant", "1x1x128"})] = [](const ck_tile::ArgParser& + arg_parser) { + using TypeConfig = + decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::AQuantGrouped>(arg_parser); + }; + lut[hash_multiple_strings({"fp8i4", "aquant", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using TypeConfig = decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::AQuantGrouped>(arg_parser); + }; + lut[hash_multiple_strings({"bf8i4", "aquant", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using TypeConfig = decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::AQuantGrouped>(arg_parser); + }; +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8.cpp new file mode 100644 index 0000000000..cb9f8b62cf --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. + +#include "run_gemm_quant_example.inc" + +template +using GemmConfig = GemmConfigBQuantPrefill; + +#define RUN_GEMM_EXAMPLE_PREC_TYPE \ + run_gemm_example_prec_type, \ + TypeConfig, \ + QuantGroupSize, \ + ck_tile::QuantType::BQuantGrouped>(arg_parser); + +void bquant_quantgrouped_bf8_instance_factory( + std::unordered_map>& lut) +{ + using TypeConfig = + decltype(GemmQuantTypeConfig{}); +#ifndef CK_GFX950_SUPPORT + lut[hash_multiple_strings({"bf8", "bquant", "non-preshuffleb", "1x1x64"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +#endif + lut[hash_multiple_strings({"bf8", "bquant", "non-preshuffleb", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"bf8", "bquant", "non-preshuffleb", "1x8x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"bf8", "bquant", "non-preshuffleb", "1x32x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"bf8", "bquant", "non-preshuffleb", "1x64x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8i4.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8i4.cpp new file mode 100644 index 0000000000..33ae3bc4a9 --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_bf8i4.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. + +#include "run_gemm_quant_example.inc" + +template +using GemmConfig = GemmConfigBQuantPrefill; + +#define RUN_GEMM_EXAMPLE_PREC_TYPE \ + run_gemm_example_prec_type, \ + TypeConfig, \ + QuantGroupSize, \ + ck_tile::QuantType::BQuantGrouped>(arg_parser); + +void bquant_quantgrouped_bf8i4_instance_factory( + std::unordered_map>& lut) +{ + using TypeConfig = decltype(GemmQuantTypeConfig{}); +#ifndef CK_GFX950_SUPPORT + lut[hash_multiple_strings({"bf8i4", "bquant", "non-preshuffleb", "1x1x64"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +#endif + lut[hash_multiple_strings({"bf8i4", "bquant", "non-preshuffleb", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"bf8i4", "bquant", "non-preshuffleb", "1x8x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"bf8i4", "bquant", "non-preshuffleb", "1x32x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"bf8i4", "bquant", "non-preshuffleb", "1x64x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8.cpp new file mode 100644 index 0000000000..526c35b081 --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. + +#include "run_gemm_quant_example.inc" + +template +using GemmConfig = GemmConfigBQuantPrefill; + +#define RUN_GEMM_EXAMPLE_PREC_TYPE \ + run_gemm_example_prec_type, \ + TypeConfig, \ + QuantGroupSize, \ + ck_tile::QuantType::BQuantGrouped>(arg_parser); + +void bquant_quantgrouped_fp8_instance_factory( + std::unordered_map>& lut) +{ + using TypeConfig = + decltype(GemmQuantTypeConfig{}); +#ifndef CK_GFX950_SUPPORT + lut[hash_multiple_strings({"fp8", "bquant", "non-preshuffleb", "1x1x64"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +#endif + lut[hash_multiple_strings({"fp8", "bquant", "non-preshuffleb", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"fp8", "bquant", "non-preshuffleb", "1x8x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"fp8", "bquant", "non-preshuffleb", "1x32x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"fp8", "bquant", "non-preshuffleb", "1x64x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8i4.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8i4.cpp new file mode 100644 index 0000000000..4b2a8efb14 --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_prefill_fp8i4.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. + +#include "run_gemm_quant_example.inc" + +template +using GemmConfig = GemmConfigBQuantPrefill; + +#define RUN_GEMM_EXAMPLE_PREC_TYPE \ + run_gemm_example_prec_type, \ + TypeConfig, \ + QuantGroupSize, \ + ck_tile::QuantType::BQuantGrouped>(arg_parser); + +void bquant_quantgrouped_fp8i4_instance_factory( + std::unordered_map>& lut) +{ + using TypeConfig = decltype(GemmQuantTypeConfig{}); +#ifndef CK_GFX950_SUPPORT + lut[hash_multiple_strings({"fp8i4", "bquant", "non-preshuffleb", "1x1x64"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +#endif + lut[hash_multiple_strings({"fp8i4", "bquant", "non-preshuffleb", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"fp8i4", "bquant", "non-preshuffleb", "1x8x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"fp8i4", "bquant", "non-preshuffleb", "1x32x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; + lut[hash_multiple_strings({"fp8i4", "bquant", "non-preshuffleb", "1x64x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using QuantGroupSize = ck_tile::QuantGroupShape>; + return RUN_GEMM_EXAMPLE_PREC_TYPE; + }; +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_preshuffleb_prefill.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_preshuffleb_prefill.cpp new file mode 100644 index 0000000000..d9591bb588 --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_preshuffleb_prefill.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. + +#include "run_gemm_quant_example.inc" + +template +using GemmConfig = GemmConfigPreshuffleB_Bquant_prefill; + +void bquant_quantgrouped_preshuffleb_instance_factory( + std::unordered_map>& lut) +{ + using QuantGroupSize = ck_tile::QuantGroupShape>; + lut[hash_multiple_strings( + {"fp8", "bquant", "preshuffleb", "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) { + using TypeConfig = + decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::BQuantGrouped>(arg_parser); + }; + lut[hash_multiple_strings( + {"bf8", "bquant", "preshuffleb", "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) { + using TypeConfig = + decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::BQuantGrouped>(arg_parser); + }; + lut[hash_multiple_strings({"fp8i4", "bquant", "preshuffleb", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using TypeConfig = decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::BQuantGrouped>(arg_parser); + }; + lut[hash_multiple_strings({"bf8i4", "bquant", "preshuffleb", "1x1x128"})] = + [](const ck_tile::ArgParser& arg_parser) { + using TypeConfig = decltype(GemmQuantTypeConfig{}); + return run_gemm_example_prec_type, + TypeConfig, + QuantGroupSize, + ck_tile::QuantType::BQuantGrouped>(arg_parser); + }; +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp b/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp new file mode 100644 index 0000000000..a35f867f5d --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) , Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck_tile/core/config.hpp" +#include "ck_tile/host.hpp" +#include "ck_tile/host/permute_pk_int4.hpp" +#include "ck_tile/host/tensor_shuffle_utils.hpp" +#include "gemm_utils.hpp" + +auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("h", "false", "Print help message") + .insert("m", "3840", "m dimension") + .insert("n", "4096", "n dimension") + .insert("k", "2048", "k dimension") + .insert("a_layout", "R", "A tensor data layout - Row or Column") + .insert("b_layout", "C", "B tensor data layout - Row or Column") + .insert("bq_layout", "C", "Bq tensor data layout - Row or Column") + .insert("c_layout", "R", "C tensor data layout - Row or Column") + .insert("stride_a", "0", "Tensor A stride") + .insert("stride_q", "0", "Tensor AQ stride") + .insert("stride_b", "0", "Tensor B stride") + .insert("stride_c", "0", "Tensor C stride") + .insert("v", "1", "0: No validation, 1: Validation on CPU, 2: Validation on GPU") + .insert("prec", + "fp8", + "Data type. For AQuant: fp8, bf8, i4fp8, or i4bf8; for Bquant: fp8, bf8, fp8i4, " + "or bf8i4") + .insert("warmup", "50", "Number of iterations before benchmarking the kernel") + .insert("repeat", "1000", "Number of iterations to benchmark the kernel") + .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") + .insert("split_k", "1", "SplitK value") + .insert("device", "0", "Device id that will be used to run the kernel") + .insert("init", "0", "0:random, 1:linear, 2:constant(1)") + .insert("flush_cache", "true", "Flush cache before running the kernel") + .insert("rotating_count", "1000", "Rotating count") + .insert("quant_mode", "bquant", "Choose aquant, bquant, tensor or rowcol") + .insert("preshuffleb", "false", "Enable preshuffle of tensor B") + .insert("group_size", + "1x1x128", + "Quantization group size as MxNxK, e.g., 1x1x128, 1x32x128, 1x64x128"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +auto gen_lut_key(const ck_tile::ArgParser& arg_parser) +{ + std::string data_type = arg_parser.get_str("prec"); + std::string quant_mode = arg_parser.get_str("quant_mode"); + + std::vector params = {data_type, quant_mode}; + + if(quant_mode == "bquant") + { + std::string preshuffleb = + arg_parser.get_bool("preshuffleb") ? "preshuffleb" : "non-preshuffleb"; + params.push_back(preshuffleb); + } + if(quant_mode != "rowcol" && quant_mode != "tensor") + { + // NOTE: rowcol and tensor pipeline do not use group size + std::string group_size_str = arg_parser.get_str("group_size"); + params.push_back(group_size_str); + } + + return hash_multiple_strings(params); +} + +void aquant_quantgrouped_instance_factory( + std::unordered_map>& lut); +void bquant_quantgrouped_fp8_instance_factory( + std::unordered_map>& lut); +void bquant_quantgrouped_bf8_instance_factory( + std::unordered_map>& lut); +void bquant_quantgrouped_fp8i4_instance_factory( + std::unordered_map>& lut); +void bquant_quantgrouped_bf8i4_instance_factory( + std::unordered_map>& lut); +void bquant_quantgrouped_preshuffleb_instance_factory( + std::unordered_map>& lut); +void quant_rowcol_instance_factory( + std::unordered_map>& lut); +void quant_tensor_instance_factory( + std::unordered_map>& lut); + +int main(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result || arg_parser.get_bool("h")) + { + arg_parser.print(); + return -1; + } + + auto device_id = arg_parser.get_int("device"); + std::cout << "Device ID: " << device_id << std::endl; + ck_tile::hip_check_error(hipSetDevice(device_id)); + + std::unordered_map> lut; + aquant_quantgrouped_instance_factory(lut); + bquant_quantgrouped_fp8_instance_factory(lut); + bquant_quantgrouped_bf8_instance_factory(lut); + bquant_quantgrouped_fp8i4_instance_factory(lut); + bquant_quantgrouped_bf8i4_instance_factory(lut); + bquant_quantgrouped_preshuffleb_instance_factory(lut); + quant_rowcol_instance_factory(lut); + quant_tensor_instance_factory(lut); + + auto key = gen_lut_key(arg_parser); + + if(lut.find(key) != lut.end()) + { + return lut[key](arg_parser); + } + else + { + std::cerr + << "Error: Combination of prec, quant_mode, preshuffleb, and group_size not supported." + << std::endl; + return -1; + } +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp deleted file mode 100644 index b22596537f..0000000000 --- a/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp +++ /dev/null @@ -1,424 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -// This example demonstrates 2D block scale quantization (N×K) for BQuant -// using non-preshuffled configuration. -// NOTE: Once more 2d support is ready, we can migrate all 2d quant types to this example -// This is currently done separately to avoid too verbose dispatching. - -#include -#include -#include -#include -#include -#include - -#include "ck_tile/core/config.hpp" -#include "ck_tile/host.hpp" -#include "gemm_utils.hpp" - -template -float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::stream_config& s) -{ - static_assert(std::is_same_v); - using ComputeDataType = std::conditional_t; - - using GemmShape = ck_tile::TileGemmShape< - ck_tile::sequence, - ck_tile::sequence, - ck_tile:: - sequence>; - - using TilePartitioner = ck_tile::GemmTile1DPartitioner; - - using GemmTraits = ck_tile::TileGemmQuantTraits; - - using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase; - - // This example only supports BQuant (no AQuant) - // For non-preshuffled BQuant, use BaseBQuantGemmPipelineAgBgCrCompV3 - using BaseGemmPipeline = std::conditional_t< - GemmConfig::PreshuffleB == true, - ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2, - ck_tile::BaseBQuantGemmPipelineAgBgCrCompV3>; - - const ck_tile::index_t K_split = - (args.K + GemmConfig::K_Tile - 1) / GemmConfig::K_Tile * GemmConfig::K_Tile; - const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); - const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); - const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - - const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; - constexpr bool transpose_c = false; - - // row-col and tensor quants use the regular pipeline, A/B quants use their own - using PipelineProblem = std::conditional_t< - QuantMode == ck_tile::QuantType::RowColQuant || - QuantMode == ck_tile::QuantType::TensorQuant, - ck_tile::GemmRowColTensorQuantPipelineProblem, - std::conditional_t, - ck_tile::GemmBQuantPipelineProblem>>; - - using GemmPipeline = std::conditional_t< - QuantMode == ck_tile::QuantType::RowColQuant || - QuantMode == ck_tile::QuantType::TensorQuant, - ck_tile::GemmPipelineAgBgCrCompV3, - std::conditional_t< - QuantMode == ck_tile::QuantType::AQuantGrouped, - ck_tile::AQuantGemmPipelineAgBgCrMem, // memory pipeline hardcoded - // for aquant - std::conditional_t, - ck_tile::BQuantGemmPipelineAgBgCrCompV3>>>; - - using GemmEpilogue = ck_tile::CShuffleEpilogue< - ck_tile::CShuffleEpilogueProblem, - typename TypeConfig::AccDataType, - typename TypeConfig::CDataType, - ck_tile::tuple<>, - CLayout, - CDEElementWise, - TilePartitioner::MPerBlock, - TilePartitioner::NPerBlock, - GemmConfig::M_Warp, - GemmConfig::N_Warp, - GemmConfig::M_Warp_Tile, - GemmConfig::N_Warp_Tile, - GemmConfig::K_Warp_Tile, - transpose_c, - ck_tile::memory_operation_enum::set, - 1, - false, - 1, - GemmConfig::TiledMMAPermuteN>>; - using Kernel = - ck_tile::QuantGemmKernel; - - auto kargs = Kernel::MakeKernelArgs(args); - - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - const dim3 blocks = Kernel::BlockSize(); - - if(args.k_batch != 1) - { - throw std::runtime_error("split-k is not supported yet!"); - } - - if(!Kernel::IsSupportedArgument(kargs)) - { - throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); - } - - if(s.log_level_ > 0) - { - std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' - << "shape: " << GemmShape::GetName() << '\n' - << "problem: " << PipelineProblem::GetName() << '\n' - << "pipeline: " << GemmPipeline::GetName() << '\n' - << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" - << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" - << std::endl; - } - float ave_time = 0; - if(s.flush_cache_) - { - std::cout << "Flushing cache..." << std::endl; - - ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( - args.M, args.K, args.stride_A, is_row_major(ALayout{}))); - ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( - args.K, args.N, args.stride_B, is_row_major(BLayout{}))); - - auto size_a_buffer = a_m.get_element_space_size_in_bytes(); - auto size_b_buffer = b_n.get_element_space_size_in_bytes(); - - ck_tile::RotatingMemWrapper - rotating_mem( - kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck_tile::flush_icache(); - // rotating mem - rotating_mem.Next(); - // clear c mem - if(args.k_batch > 1) - hipGetErrorString( - hipMemsetAsync(args.c_ptr, - 0, - args.M * args.N * sizeof(typename TypeConfig::CDataType), - s.stream_id_)); - }; - ave_time = ck_tile::launch_kernel_time_mask( - s, - run_flush_cache, - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - } - else - { - ave_time = ck_tile::launch_kernel( - s, - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - } - - return ave_time; - }; - return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num); -} - -#include "run_gemm_quant_example.inc" - -template -int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[]) -{ - using Row = ck_tile::tensor_layout::gemm::RowMajor; - using Col = ck_tile::tensor_layout::gemm::ColumnMajor; - - if((QuantMode == ck_tile::QuantType::AQuantGrouped || - QuantMode == ck_tile::QuantType::RowColQuant) && - GemmConfig::PreshuffleB) - { - throw std::runtime_error( - "Preshuffling weight matrix is not supported for AQuant or RowColQuant"); - } - - if constexpr(std::is_same_v || - std::is_same_v || - std::is_same_v) - { - if(a_layout == "R" && b_layout == "C") - { - return run_gemm_example_with_layouts( - argc, argv, Row{}, Row{}, Col{}, Col{}, Row{}); - } - else - { - throw std::runtime_error("Unsupported memory layout for the input matrices!"); - } - } - else - { - throw std::runtime_error("Unsupported data type for A."); - } - - return 0; -} - -// Forward declaration for dispatch function -template