diff --git a/.github/scripts/therock_configure_ci.py b/.github/scripts/therock_configure_ci.py new file mode 100644 index 0000000000..557afe2d84 --- /dev/null +++ b/.github/scripts/therock_configure_ci.py @@ -0,0 +1,112 @@ +import fnmatch +import json +import os +from pathlib import Path +import subprocess +import sys +from typing import Iterable, Optional, Mapping + +def gha_set_output(vars: Mapping[str, str | Path]): + """Sets values in a step's output parameters. + + This appends to the file located at the $GITHUB_OUTPUT environment variable. + + See + * https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter + * https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs + """ + print(f"Setting github output:\n{vars}") + + step_output_file = os.getenv("GITHUB_OUTPUT") + if not step_output_file: + print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs") + return + + with open(step_output_file, "a") as f: + f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items()) + +def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]: + """Returns the paths of modified files relative to the base reference.""" + try: + return subprocess.run( + ["git", "diff", "--name-only", base_ref], + stdout=subprocess.PIPE, + check=True, + text=True, + timeout=60, + ).stdout.splitlines() + except subprocess.TimeoutExpired: + print( + "Computing modified files timed out. Not using PR diff to determine" + " jobs to run.", + file=sys.stderr, + ) + return None + +# Paths matching any of these patterns are considered to have no influence over +# build or test workflows so any related jobs can be skipped if all paths +# modified by a commit/PR match a pattern in this list. 
+SKIPPABLE_PATH_PATTERNS = [ + "docs/*", + "*.gitignore", + "*.md", + "*.pre-commit-config.*", + "*LICENSE", + 'Jenkinsfile', + '.github/ISSUE_TEMPLATE/*', + '.github/CODEOWNERS', + '.github/*.md', + '.github/dependabot.yml', +] + +def is_path_skippable(path: str) -> bool: + """Determines if a given relative path to a file matches any skippable patterns.""" + return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS) + +def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool: + """Returns true if at least one path is not in the skippable set.""" + if paths is None: + return False + return any(not is_path_skippable(p) for p in paths) + +def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool: + """Returns true if CI workflows should run given a list of modified paths.""" + + if paths is None: + print("No files were modified, skipping TheRock CI jobs") + return False + + paths_set = set(paths) + github_workflows_paths = set( + [p for p in paths if p.startswith(".github/workflows")] + ) + other_paths = paths_set - github_workflows_paths + + contains_other_non_skippable_files = check_for_non_skippable_path(other_paths) + + print("should_ci_run_given_modified_paths findings:") + print(f"  contains_other_non_skippable_files: {contains_other_non_skippable_files}") + + if contains_other_non_skippable_files: + print("Enabling TheRock CI jobs since a non-skippable path was modified") + return True + else: + print( + "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs" + ) + return False + +def main(args): + base_ref = args.get("base_ref") + modified_paths = get_modified_paths(base_ref) + print("modified_paths (max 200):", (modified_paths or [])[:200]) + enable_jobs = should_ci_run_given_modified_paths(modified_paths) + output = { + 'enable_therock_ci': json.dumps(enable_jobs) + } + gha_set_output(output) + +if __name__ == "__main__": + args = {} + args["base_ref"] = os.environ.get("BASE_REF", 
"HEAD^1") + main(args) diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index 645a91c030..7db124d2a1 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -21,9 +21,11 @@ jobs: id-token: write container: image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:044b113562629f4bd2ec5d2e64b32eee11562d48fb1a75d7493daec9dd8d8292 + options: -v /runner/config:/home/awsconfig/ env: AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} TEATIME_FORCE_INTERACTIVE: 0 + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini steps: - name: Checkout composable_kernel repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -83,9 +85,9 @@ jobs: echo "----------" du -h -d 1 TheRock/build/artifacts - - name: Configure AWS Credentials - if: always() - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + - name: Configure AWS Credentials for non-forked repos + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1 with: aws-region: us-east-2 role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml index 18411baa09..3232652b6b 100644 --- a/.github/workflows/therock-ci.yml +++ b/.github/workflows/therock-ci.yml @@ -5,6 +5,15 @@ on: branches: - develop workflow_dispatch: + pull_request: + types: + - opened + - synchronize + branches: + - mainline + - release/* + - release-staging/* + - develop permissions: contents: read @@ -18,8 +27,29 @@ concurrency: cancel-in-progress: true jobs: + setup: + runs-on: ubuntu-24.04 + env: + # The commit being checked out is the merge commit for a PR. Its first + # parent will be the tip of the base branch. 
+ BASE_REF: HEAD^ + outputs: + enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }} + steps: + - name: "Checking out repository" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + # We need the parent commit to do a diff + fetch-depth: 2 + + - name: "Configuring CI options" + id: configure + run: python .github/scripts/therock_configure_ci.py + therock-ci-linux: name: TheRock CI Linux + needs: setup + if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }} permissions: contents: read id-token: write @@ -34,6 +64,7 @@ jobs: name: TheRock CI Summary if: always() needs: + - setup - therock-ci-linux runs-on: ubuntu-24.04 steps: diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml index 439135743c..37ddd399ad 100644 --- a/.github/workflows/therock-test-packages.yml +++ b/.github/workflows/therock-test-packages.yml @@ -68,6 +68,7 @@ jobs: VENV_DIR: ${{ env.VENV_DIR }} FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }} PLATFORM: ${{ inputs.platform }} + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} - name: Test timeout-minutes: ${{ matrix.components.timeout_minutes }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c09271edc..8ae97b3d61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/). -## Composable Kernel 1.1.0 for ROCm 7.0.0 +## Composable Kernel 1.2.0 for ROCm 7.0.0 ### Added @@ -27,6 +27,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj * Added int8 support for CK_TILE GEMM. * Added support for elementwise kernel. * Added benchmarking support for tile engine GEMM Multi D. +* Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands. 
### Optimized @@ -48,6 +49,7 @@ None * Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced. * Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced. * Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced. +* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK_TILE (#2594) ### Known issues diff --git a/CMakeLists.txt b/CMakeLists.txt index 07d2e166bb..52bb2ccd2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,12 +16,21 @@ else() "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.") endif() +# Allow user to specify the C++ standard. +# We must support C++17 builds until downstream users are migrated to C++20, but we default to C++20. +set(CK_CXX_STANDARD "20" CACHE STRING "C++ standard to use (e.g. 17 or 20)") +set(valid_cxx_standards 17 20) +set_property(CACHE CK_CXX_STANDARD PROPERTY STRINGS ${valid_cxx_standards}) +if(NOT CK_CXX_STANDARD IN_LIST valid_cxx_standards) + message(FATAL_ERROR "CK_CXX_STANDARD must be one of ${valid_cxx_standards}") +endif() + # Default installation path if(NOT WIN32) set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "") endif() -set(version 1.1.0) +set(version 1.2.0) # Check support for CUDA/HIP in Cmake project(composable_kernel VERSION ${version} LANGUAGES CXX HIP) include(CTest) @@ -221,11 +230,20 @@ if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9 add_definitions(-DCK_USE_GFX94) set(CK_USE_GFX94 "ON") endif() + +# new macro CK_TILE_USE_WMMA in order to separately compile examples for MFMA/WMMA +set(CK_TILE_USE_WMMA 0) + if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") message(STATUS "Enabling WMMA instances") add_definitions(-DCK_USE_WMMA) set(CK_USE_WMMA "ON") + set(CK_TILE_USE_WMMA 1) endif() + +# define the macro 
with the current value (0 or 1) +add_definitions(-DCK_TILE_USE_WMMA=${CK_TILE_USE_WMMA}) + if (SUPPORTED_GPU_TARGETS MATCHES "gfx12") message(STATUS "Enabling WMMA FP8 gemms on native architectures") add_definitions(-DCK_USE_WMMA_FP8) @@ -324,32 +342,19 @@ if(USE_BITINT_EXTENSION_INT4) message(STATUS "CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}") endif() -if(USE_OPT_GFX11) - add_compile_options(-mcumode) - add_compile_options(-mno-wavefrontsize64) - add_compile_definitions(CK_TILE_WAVE32_ENABLED) - message(STATUS "CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}") -endif() - if(ENABLE_ASM_DUMP) add_compile_options(--save-temps) add_compile_options(-Wno-gnu-line-marker) message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}") endif() -if(USE_OPT_GFX12 AND (SUPPORTED_GPU_TARGETS MATCHES "gfx12")) - add_compile_options(-mno-wavefrontsize64) - add_compile_definitions(CK_TILE_WAVE32_ENABLED) - message(STATUS "CK compiled with USE_OPT_GFX12 set to ${USE_OPT_GFX12}") -endif() - ## Threads set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) link_libraries(Threads::Threads) ## C++ -set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD ${CK_CXX_STANDARD}) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") diff --git a/Dockerfile.pytorch b/Dockerfile.pytorch new file mode 100644 index 0000000000..1b71b00fbb --- /dev/null +++ b/Dockerfile.pytorch @@ -0,0 +1,23 @@ +ARG BASE_DOCKER="rocm/pytorch-nightly:latest" +FROM $BASE_DOCKER +ARG CK_PYTORCH_BRANCH="develop" +RUN groupadd -g 109 render && \ + usermod -u 1001 jenkins && \ + groupmod -g 1001 jenkins && \ + cd /tmp/pytorch && \ + rm -rf build && \ + cd /tmp/pytorch/third_party && \ + rm -rf composable_kernel && \ + git clone -b "$CK_PYTORCH_BRANCH" https://github.com/ROCm/composable_kernel.git && \ + cd /tmp/pytorch/third_party/aiter/3rdparty && \ + rm -rf composable_kernel && \ + git 
clone -b "$CK_PYTORCH_BRANCH" https://github.com/ROCm/composable_kernel.git && \ + cd /tmp/pytorch/third_party/fbgemm/external && \ + rm -rf composable_kernel && \ + git clone -b "$CK_PYTORCH_BRANCH" https://github.com/ROCm/composable_kernel.git && \ + cd /tmp/pytorch/third_party/flash-attention/csrc && \ + rm -rf composable_kernel && \ + git clone -b "$CK_PYTORCH_BRANCH" https://github.com/ROCm/composable_kernel.git && \ + chown -R jenkins:jenkins /tmp/pytorch && \ + chmod -R a+rwx /tmp/pytorch && \ + sudo usermod -aG irc jenkins diff --git a/Jenkinsfile b/Jenkinsfile index d1f1baf15f..e7e57aded9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -192,12 +192,16 @@ def buildDocker(install_prefix){ image_name = "rocm/composable_kernel:ck_aiter" dockerArgs = dockerArgs + " --no-cache -f Dockerfile.aiter --build-arg AITER_BRANCH='${params.aiter_branch}' --build-arg CK_AITER_BRANCH='${params.ck_aiter_branch}' . " } - else{ + else if(params.RUN_PYTORCH_TESTS){ + image_name = "rocm/composable_kernel:ck_pytorch" + dockerArgs = dockerArgs + " --no-cache -f Dockerfile.pytorch --build-arg CK_PYTORCH_BRANCH='${params.ck_pytorch_branch}' . " + } + else{ dockerArgs = dockerArgs + " -f Dockerfile . 
" } echo "Build Args: ${dockerArgs}" try{ - if(params.BUILD_DOCKER || params.RUN_AITER_TESTS){ + if(params.BUILD_DOCKER || params.RUN_AITER_TESTS || params.RUN_PYTORCH_TESTS){ //force building the new docker if that parameter is true echo "Building image: ${image_name}" retimage = docker.build("${image_name}", dockerArgs) @@ -400,8 +404,9 @@ def cmake_build(Map conf=[:]){ echo "Build packages" sh 'ninja -j64 package' archiveArtifacts artifacts: 'composablekernel-dev*.deb' - sh 'mv composablekernel-dev_*.deb composablekernel-dev_all_targets_1.1.0_amd64.deb' - stash includes: "composablekernel-dev_all_targets_1.1.0_amd64.deb", name: "packages" + sh 'mv composablekernel-dev_*.deb composablekernel-dev_all_targets_1.2.0_amd64.deb' + sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.2.0_amd64.deb' + stash includes: "composablekernel-**.deb", name: "packages" } } else{ @@ -571,50 +576,66 @@ def Build_CK(Map conf=[:]){ python3 -m pytest python/test/test_gen_instances.py """ } - dir("build"){ - if (params.RUN_FULL_QA && arch == 2 ){ - // build deb packages - echo "Build packages" - sh 'ninja package' - archiveArtifacts artifacts: 'composablekernel*.deb' - sh 'mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.1.0_amd64.deb' - sh 'mv composablekernel-dev_*.deb composablekernel-dev_1.1.0_amd64.deb' - sh 'mv composablekernel-examples_*.deb composablekernel-examples_1.1.0_amd64.deb' - sh 'mv composablekernel-tests_*.deb composablekernel-tests_1.1.0_amd64.deb' - stash includes: "composablekernel-**.deb", name: "packages" - } - } // run performance tests, stash the logs, results will be processed on the master node dir("script"){ if (params.RUN_PERFORMANCE_TESTS){ if (params.RUN_FULL_QA && arch == 1){ // run full tests on gfx90a echo "Run full performance tests" - sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - 
archiveArtifacts "perf_resnet50_N4.log" - archiveArtifacts "perf_batched_gemm.log" - archiveArtifacts "perf_grouped_gemm.log" - archiveArtifacts "perf_grouped_conv_fwd.log" - archiveArtifacts "perf_grouped_conv_bwd_data.log" - archiveArtifacts "perf_grouped_conv_bwd_weight.log" - archiveArtifacts "perf_gemm_bilinear.log" - archiveArtifacts "perf_reduction.log" - archiveArtifacts "perf_splitK_gemm.log" - archiveArtifacts "perf_onnx_gemm.log" - archiveArtifacts "perf_mixed_gemm.log" - stash includes: "perf_**.log", name: "perf_log" + sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx90a" + archiveArtifacts "perf_gemm_gfx90a.log" + archiveArtifacts "perf_resnet50_N256_gfx90a.log" + archiveArtifacts "perf_resnet50_N4_gfx90a.log" + archiveArtifacts "perf_batched_gemm_gfx90a.log" + archiveArtifacts "perf_grouped_gemm_gfx90a.log" + archiveArtifacts "perf_grouped_conv_fwd_gfx90a.log" + archiveArtifacts "perf_grouped_conv_bwd_data_gfx90a.log" + archiveArtifacts "perf_grouped_conv_bwd_weight_gfx90a.log" + archiveArtifacts "perf_gemm_bilinear_gfx90a.log" + archiveArtifacts "perf_reduction_gfx90a.log" + archiveArtifacts "perf_splitK_gemm_gfx90a.log" + archiveArtifacts "perf_onnx_gemm_gfx90a.log" + archiveArtifacts "perf_mixed_gemm_gfx90a.log" + stash includes: "perf_**.log", name: "perf_log_gfx90a" + } + if (params.RUN_FULL_QA && arch == 2){ + // run full tests on gfx942 + echo "Run full performance tests" + sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx942" + archiveArtifacts "perf_gemm_gfx942.log" + archiveArtifacts "perf_resnet50_N256_gfx942.log" + archiveArtifacts "perf_resnet50_N4_gfx942.log" + archiveArtifacts "perf_batched_gemm_gfx942.log" + archiveArtifacts "perf_grouped_gemm_gfx942.log" + archiveArtifacts "perf_grouped_conv_fwd_gfx942.log" + archiveArtifacts "perf_grouped_conv_bwd_data_gfx942.log" + archiveArtifacts "perf_grouped_conv_bwd_weight_gfx942.log" + 
archiveArtifacts "perf_gemm_bilinear_gfx942.log" + archiveArtifacts "perf_reduction_gfx942.log" + archiveArtifacts "perf_splitK_gemm_gfx942.log" + archiveArtifacts "perf_onnx_gemm_gfx942.log" + archiveArtifacts "perf_mixed_gemm_gfx942.log" + stash includes: "perf_**.log", name: "perf_log_gfx942" } else if ( arch == 1 ){ // run standard tests on gfx90a echo "Run performance tests" - sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_onnx_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - stash includes: "perf_**.log", name: "perf_log" + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx90a" + archiveArtifacts "perf_gemm_gfx90a.log" + archiveArtifacts "perf_onnx_gemm_gfx90a.log" + archiveArtifacts "perf_resnet50_N256_gfx90a.log" + archiveArtifacts "perf_resnet50_N4_gfx90a.log" + stash includes: "perf_**.log", name: "perf_log_gfx90a" + } + else if ( arch == 2 ){ + // run standard tests on gfx942 + echo "Run performance tests" + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx942" + archiveArtifacts "perf_gemm_gfx942.log" + archiveArtifacts "perf_onnx_gemm_gfx942.log" + archiveArtifacts "perf_resnet50_N256_gfx942.log" + archiveArtifacts "perf_resnet50_N4_gfx942.log" + stash includes: "perf_**.log", name: "perf_log_gfx942" } // disable performance tests on gfx1030 for now. //else if ( arch == 3){ @@ -732,29 +753,64 @@ def process_results(Map conf=[:]){ if (params.RUN_CK_TILE_FMHA_TESTS){ try{ unstash "perf_fmha_log_gfx942" + } + catch(Exception err){ + echo "could not locate the FMHA performance logs for gfx942: ${err.getMessage()}." + } + try{ unstash "perf_fmha_log_gfx90a" } catch(Exception err){ - echo "could not locate the FMHA performance logs: ${err.getMessage()}." 
+ echo "could not locate the FMHA performance logs for gfx90a: ${err.getMessage()}." } } - if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){ + if (params.BUILD_INSTANCES_ONLY){ // unstash deb packages unstash "packages" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no composablekernel-*.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" } else{ // unstash perf files to master - unstash "perf_log" + try{ + unstash "perf_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the gfx90a performance logs: ${err.getMessage()}." + } + try{ + unstash "perf_log_gfx942" + } + catch(Exception err){ + echo "could not locate the gfx942 performance logs: ${err.getMessage()}." + } + try{ + unstash "perf_log_gfx950" + } + catch(Exception err){ + echo "could not locate the gfx950 performance logs: ${err.getMessage()}." + } + try{ + unstash "perf_log_gfx908" + } + catch(Exception err){ + echo "could not locate the gfx908 performance logs: ${err.getMessage()}." + } try{ unstash "perf_log_gfx11" + } + catch(Exception err){ + echo "could not locate the gfx11 performance logs: ${err.getMessage()}." + } + try{ + unstash "perf_log_gfx12" } catch(Exception err){ - echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." + echo "could not locate the gfx12 performance logs: ${err.getMessage()}." 
} - sh "./process_perf_data.sh" } + // process the logs + sh "./process_perf_data.sh" } } catch(e){ @@ -819,13 +875,64 @@ def run_aiter_tests(Map conf=[:]){ } } + +def run_pytorch_tests(Map conf=[:]){ + show_node_info() + env.HSA_ENABLE_SDMA=0 + checkout scm + //use the latest pytorch-nightly image + def image = "rocm/composable_kernel:ck_pytorch" + def dockerOpts="--network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add render --group-add irc --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --user=jenkins -v=/var/jenkins/:/var/jenkins" + def variant = env.STAGE_NAME + def retimage + def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') + def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') + dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " + echo "Docker flags: ${dockerOpts}" + + gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { + try + { + echo "Pulling image: ${image}" + retimage = docker.image("${image}") + withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) { + retimage.pull() + } + } + catch(Exception ex) + { + error "Unable to locate image: ${image}" + } + } + + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 45, unit: 'MINUTES'){ + try{ + sh "rocminfo" + sh "python3 --version" + sh "python3 /tmp/pytorch/tools/amd_build/build_amd.py" + sh "USE_ROCM_CK_SDPA=1 PYTORCH_ROCM_ARCH=gfx942 python /tmp/pytorch/setup.py develop" + } + catch(e){ + echo "Throwing error exception while building Pytorch" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + echo "Finished building Pytorch" + } + } + } +} + //launch develop branch daily jobs CRON_SETTINGS = BRANCH_NAME == "develop" ? 
'''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 13 * * * % RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false''' : "" + 0 13 * * * % RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false + 0 11 * * * % RUN_PYTORCH_TESTS=true;RUN_CODEGEN_TESTS=false;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX10=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false''' : "" pipeline { agent none @@ -960,6 +1067,14 @@ pipeline { name: "RUN_ALL_UNIT_TESTS", defaultValue: false, description: "Run all unit tests (default: OFF)") + booleanParam( + name: "RUN_PYTORCH_TESTS", + defaultValue: false, + description: "Try building PYTORCH with latest CK develop branch (default: OFF)") + string( + name: 'ck_pytorch_branch', + defaultValue: 'develop', + description: 'Specify which branch of CK to test with Pytorch (default: develop)') booleanParam( name: "RUN_AITER_TESTS", defaultValue: false, @@ -1051,6 +1166,24 @@ pipeline { } } } + } + stage("Run Pytorch Tests") + { + parallel + { + stage("Run Pytorch Tests on gfx942") + { + when { + beforeAgent true + expression { params.RUN_PYTORCH_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx942")} + steps{ + run_pytorch_tests() + cleanWs() + } + } + 
} } stage("Run AITER Tests") { @@ -1107,11 +1240,16 @@ pipeline { agent{ label rocmnode("gfx90a")} environment{ setup_args = "NO_CK_BUILD" - execute_args = """ cd test_data && \ - ./generate_test_dataset.sh && \ - cd ../script && \ + execute_args = """ cd ../build && \ ../script/cmake-ck-dev.sh ../ gfx90a && \ make -j64 test_grouped_convnd_fwd_dataset_xdl && \ + cd ../test_data && \ + # Dataset generation modes: + # - small: ~60 test cases (minimal, quick testing - 3 models, 2 batch sizes, 2 image sizes) + # - half: ~300 test cases (moderate coverage - 16 models, 3 batch sizes, 5 image sizes), ~ 17 hours testing time + # - full: ~600 test cases (comprehensive - 16 models, 5 batch sizes, 9 image sizes), ~ 40 hours testing time + ./generate_test_dataset.sh half && \ + cd ../build && \ ./bin/test_grouped_convnd_fwd_dataset_xdl""" } steps{ @@ -1306,6 +1444,7 @@ pipeline { def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3" setup_args = """ -DGPU_TARGETS="gfx942" \ -DCMAKE_CXX_FLAGS=" -O3 " \ + -DCK_CXX_STANDARD="17" \ -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ execute_args = " " } @@ -1440,7 +1579,7 @@ pipeline { -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_CXX_FLAGS=" -O3 " .. 
&& ninja -j64 """ - buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0") } cleanWs() } @@ -1517,7 +1656,7 @@ pipeline { stage("Process results"){ when { beforeAgent true - expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } + expression { (params.RUN_PERFORMANCE_TESTS.toBoolean() || params.BUILD_INSTANCES_ONLY.toBoolean() || params.RUN_CK_TILE_FMHA_TESTS.toBoolean()) && !params.BUILD_LEGACY_OS.toBoolean() } } agent { label 'mici' } steps{ diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst index 3788ba609c..1b978ed63e 100644 --- a/docs/Contributors_Guide.rst +++ b/docs/Contributors_Guide.rst @@ -19,7 +19,6 @@ Getting started build the library. You can also find some of this information in the `README file `_ on the project's GitHub page. -#. **Additional reading:** The blog post `AMD Composable Kernel library: efficient fused kernels for AI apps with just a few lines of code `_ provides a deeper understanding of the CK library and showcases its performance capabilities. `_ from the AMD Community portal. It offers a deeper understanding of the library's objectives and showcases its performance capabilities. #. **General information:** For broader information about AMD products, consider exploring the diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 6c5d9f9fba..3e018aad1e 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -1,7 +1,8 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once +#include "ck/library/utility/validation_common.hpp" template bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) @@ -53,6 +54,17 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) StrideB = f_get_default_stride(K, N, StrideB, BLayout{}); StrideC = f_get_default_stride(M, N, StrideC, CLayout{}); + try + { + ck::utils::validate_gemm_strides_abc( + M, N, K, StrideA, StrideB, StrideC); + } + catch(const std::runtime_error& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return false; + } + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); diff --git a/example/01_gemm/run_gemm_example_v2.inc b/example/01_gemm/run_gemm_example_v2.inc index 4adb6f896b..3d8cf32221 100644 --- a/example/01_gemm/run_gemm_example_v2.inc +++ b/example/01_gemm/run_gemm_example_v2.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md index 72109a660b..f72d7afa02 100644 --- a/example/ck_tile/01_fmha/README.md +++ b/example/ck_tile/01_fmha/README.md @@ -7,7 +7,7 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ make tile_example_fmha_fwd -j ``` This will result in an executable `build/bin/tile_example_fmha_fwd` diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py index 5d55e8bc36..0d8f366d8a 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py @@ -110,9 +110,9 @@ float fmha_batch_prefill_(const ck_tile::stream_config& s, fmha_b if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_batch_prefill_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} """ diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index bb3a0587e7..0391191fb2 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -136,10 +136,10 @@ float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config& s if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const 
dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} template <> @@ -148,9 +148,9 @@ void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_co {{ using k_ = fmha_bwd_dq_dk_dv_kernel_{F_idx}; auto [kargs, grids] = fmha_bwd_dq_dk_dv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( ck_tile::stream_config{{s.stream_id_}}); }} @@ -425,10 +425,10 @@ float fmha_bwd_dot_do_o_(const ck_tile::stream_config& s if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} template <> @@ -436,9 +436,9 @@ void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_co {{ using k_ = fmha_bwd_dot_do_o_kernel_{F_idx}; auto [kargs, grids] = fmha_bwd_dot_do_o_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( ck_tile::stream_config{{s.stream_id_}}); }} @@ -530,10 +530,10 @@ float fmha_bwd_convert_dq_(const ck_tile::stream_confi if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 
blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} template <> @@ -542,9 +542,9 @@ void fmha_bwd_convert_dq_oneshot_(const ck_tile::strea {{ using k_ = fmha_bwd_convert_dq_kernel_{F_idx}; auto [kargs, grids] = fmha_bwd_convert_dq_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)( ck_tile::stream_config{{s.stream_id_}}); }} diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index f614f42e6b..d9452206e7 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -110,9 +110,9 @@ float fmha_fwd_(const ck_tile::stream_config& s, fmha_fwd_args a) if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_fwd_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} """ @@ -385,7 +385,7 @@ class FmhaFwdApiPool: for i, dtype in enumerate(self.pool.keys()): per_hdim_case=str() for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()): - traits=self.pool[dtype][(hdim, hdim_v)] + traits=[t for t in self.pool[dtype][(hdim, hdim_v)] if tr_load == t.tr_load] inners=str() for k, trait in enumerate(traits): if_k = 'if' if k == 0 else 'else if' diff --git 
a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py index 2e5bc2bd3d..0ebeaddf9c 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py @@ -60,9 +60,9 @@ float fmha_fwd_appendkv_(const ck_tile::stream_config& s, fmha_fw if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_fwd_appendkv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} """ diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index b2d962cd74..1dd8f0e3c6 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -108,9 +108,9 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{ using k_ = fmha_kernel; auto [kargs, grids] = fmha_fwd_splitkv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}}); + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}}); }} }}; }} @@ -208,9 +208,9 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) {{ using k_ = fmha_kernel; auto [kargs, grids] = fmha_fwd_splitkv_combine_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - 
ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}}); + ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)(ck_tile::stream_config{{s.stream_id_}}); }} }}; }} diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py index 650ebaf80e..e468e82ed5 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py @@ -109,9 +109,9 @@ float fmha_fwd_pagedkv_(const ck_tile::stream_config& s, fmha_fwd if(s.log_level_ > 0) std::cout << ", " << k_::GetName() << std::flush; auto [kargs, grids] = fmha_fwd_pagedkv_create_kargs_and_grids(a); - constexpr dim3 blocks = k_::BlockSize(); + const dim3 blocks = k_::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu; - return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); + return ck_tile::launch_kernel(s, ck_tile::make_kernel(k_{{}}, grids, blocks, 0, kargs)); }} """ diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp index 9c2907778f..9f1e0f6948 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.cpp +++ b/example/ck_tile/01_fmha/fmha_bwd.cpp @@ -809,20 +809,6 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::stream_config stream_config_v{ nullptr, true, 0, 0, 1, arg_parser.get_str("timer") == std::string("gpu")}; - - printf("\nfmha_bwd_traits: hdim_q=%d, hdim_v=%d, data_type=%s, is_group_mode=%d, mask_type=%d, " - "bias_type=%d, has_dbias=%d, has_dropout=%d, is_store_randval=%d, is_deterministic=%d\n", - fmha_traits.hdim_q, - fmha_traits.hdim_v, - fmha_traits.data_type.c_str(), - fmha_traits.is_group_mode, - static_cast(fmha_traits.mask_type), - static_cast(fmha_traits.bias_type), - fmha_traits.has_dbias, - fmha_traits.has_dropout, - fmha_traits.is_store_randval, - fmha_traits.is_deterministic); - fflush(stdout); fmha_bwd(fmha_traits, fmha_args, 
stream_config_v); dq_buf.FromDevice(dq_host.data()); diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh index b5e6778aa5..e7babd2744 100755 --- a/example/ck_tile/01_fmha/script/run_full_test.sh +++ b/example/ck_tile/01_fmha/script/run_full_test.sh @@ -9,6 +9,8 @@ # host name : $hostname # gpu architecture: e.g., gfx90a, or gfx942, etc. +set -euo pipefail + #get the command line arguments: export env_type=$1 echo 'Environment type: ' $env_type diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh index 5ba3425e26..d123f842a2 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh @@ -1,5 +1,7 @@ -#!/bin/sh +#!/bin/bash # TODO: run this script from CK root or build directory +set -euo pipefail + EXE="$(find . -name tile_example_fmha_bwd -type f | head -n 1)" KNAME=1 @@ -17,12 +19,12 @@ for dbias in 0 ; do for p_drop in 0.0 0.2 ; do for deterministic in 0 ; do -$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -$EXE -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -$EXE -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -v=1 -deterministic=$deterministic -mode=$mode -kname=$KNAME $COMMON_ARGS -$EXE -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 
-deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +$EXE -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +$EXE -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +$EXE -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +$EXE -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +$EXE -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +$EXE -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS done done diff --git a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh index dc2be933bd..3913a0d5c2 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh @@ -1,5 +1,7 @@ #!/bin/bash # TODO: run this script from CK root or build directory +set -euo pipefail + EXE="$(find . 
-name tile_example_fmha_fwd -type f | head -n 1)" KNAME=1 @@ -51,19 +53,18 @@ run_fp16_bf16_tests() { for cache_batch_idx in $CACHE_BATCH_IDX ; do # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -s=99 -s_k=32 
-bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS - $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16 -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size 
-cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -s=99 -s_k=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS + $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS done ; done ; done ; done ; done done ; done ; done ; done ; done - done ; } run_fp8_tests() { diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 817f62dae7..da74e2e3c1 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -42,7 +42,7 @@ return hidden_states, per_token_scale ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... 
make tile_example_layernorm2d_fwd -j ``` This will result in an executable `build/bin/tile_example_layernorm2d_fwd` diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index d77582630a..c4366f6662 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -235,7 +235,7 @@ float layernorm2d_fwd_(const S& s, A a) using Kernel = ck_tile::Layernorm2dFwd; const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto kargs = Kernel::MakeKargs(a); @@ -243,7 +243,7 @@ float layernorm2d_fwd_(const S& s, A a) std::cout << ", " << Kernel::GetName() << std::flush; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{{}}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{{}}, grids, blocks, 0, kargs)); }} """ diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md index c9e392dbd5..6358b76fd9 100644 --- a/example/ck_tile/03_gemm/README.md +++ b/example/ck_tile/03_gemm/README.md @@ -7,7 +7,7 @@ This folder contains example for GEMM using ck_tile tile-programming implementat # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ # The basic pipeline method on the gemm calculation make tile_example_gemm_basic -j # The memory bound pipeline on the gemm calculation diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp index 25781a4ae8..99c943a7f1 100644 --- a/example/ck_tile/03_gemm/gemm_basic.cpp +++ b/example/ck_tile/03_gemm/gemm_basic.cpp @@ -26,6 +26,15 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t N_Tile = 256; constexpr ck_tile::index_t K_Tile = 64; +#if 
CK_TILE_USE_WMMA + constexpr ck_tile::index_t M_Warp = 4; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 16; + constexpr ck_tile::index_t N_Warp_Tile = 16; + constexpr ck_tile::index_t K_Warp_Tile = 16; +#else constexpr ck_tile::index_t M_Warp = 2; constexpr ck_tile::index_t N_Warp = 2; constexpr ck_tile::index_t K_Warp = 1; @@ -33,6 +42,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) constexpr ck_tile::index_t M_Warp_Tile = 32; constexpr ck_tile::index_t N_Warp_Tile = 32; constexpr ck_tile::index_t K_Warp_Tile = 16; +#endif using CodegenGemmShape = ck_tile::TileGemmShape, @@ -65,7 +75,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) ck_tile::tuple<>, CLayout, ck_tile::element_wise::PassThrough, - CodegenPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -81,8 +90,8 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -100,10 +109,8 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) << std::endl; } - float ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; diff --git a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp index a4a8039288..f42135a0b5 100644 --- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp +++ 
b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. #include @@ -208,7 +208,6 @@ float gemm_stage1(const GemmSplitKHostArgs& args, const ck_tile::stream_config& DsLayout, ELayout, CDEElementWise, - UniversalGemmProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, GemmConfig::M_Warp, @@ -232,7 +231,7 @@ float gemm_stage1(const GemmSplitKHostArgs& args, const ck_tile::stream_config& { grids = Kernel::GridSize(args.M, args.N, args.k_batch); } - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -279,15 +278,13 @@ float gemm_stage1(const GemmSplitKHostArgs& args, const ck_tile::stream_config& ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } else { - ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ave_time = ck_tile::launch_kernel( + s, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } return ave_time; }; @@ -373,7 +370,7 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config float ave_time = ck_tile::launch_kernel(s, - ck_tile::make_kernel( + ck_tile::make_kernel( Kernel{}, kGridSize, kBlockSize, diff --git a/example/ck_tile/03_gemm/gemm_utils.hpp b/example/ck_tile/03_gemm/gemm_utils.hpp old mode 100755 new mode 100644 index eb0a6de8aa..7f2af946e6 --- a/example/ck_tile/03_gemm/gemm_utils.hpp +++ b/example/ck_tile/03_gemm/gemm_utils.hpp @@ -172,6 +172,7 @@ struct GemmConfigComputeV3_2 : public GemmConfigBase static constexpr int kBlockPerCu = 2; }; +#if CK_TILE_USE_WMMA template struct GemmConfigComputeV3_WMMA : public GemmConfigBase { @@ 
-192,6 +193,7 @@ struct GemmConfigComputeV3_WMMA : public GemmConfigBase static constexpr int kBlockPerCu = 2; }; +#endif template struct GemmConfigComputeV4 : public GemmConfigBase @@ -484,7 +486,7 @@ auto create_args(int argc, char* argv[]) .insert("stride_b", "0", "Tensor B stride") .insert("stride_c", "0", "Tensor C stride") .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") - .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8") + .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8/pk_int4_t") .insert("warmup", "50", "number of iterations before benchmark the kernel") .insert("repeat", "100", "number of iterations to benchmark the kernel") .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") diff --git a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp index 2057f1e4f5..0018db2c99 100644 --- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp +++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp @@ -103,7 +103,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) DsLayout, ELayout, CDEElementWise, - UniversalGemmProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, GemmConfig::M_Warp, @@ -126,7 +125,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) { grids = Kernel::GridSize(args.M, args.N, args.k_batch); } - constexpr dim3 blocks = Kernel::BlockSize(); + dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -172,15 +171,13 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } else { - ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ave_time = ck_tile::launch_kernel( + s, + 
ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } return ave_time; }; diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp index 149a8c2f0c..b80d9991d4 100644 --- a/example/ck_tile/03_gemm/universal_gemm.cpp +++ b/example/ck_tile/03_gemm/universal_gemm.cpp @@ -103,7 +103,6 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) DsLayout, ELayout, CDEElementWise, - UniversalGemmProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, GemmConfig::M_Warp, @@ -127,7 +126,7 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) { grids = Kernel::GridSize(args.M, args.N, args.k_batch); } - constexpr dim3 blocks = Kernel::BlockSize(); + dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -173,15 +172,13 @@ float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } else { - ave_time = - ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ave_time = ck_tile::launch_kernel( + s, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } return ave_time; }; @@ -338,7 +335,11 @@ int main(int argc, char* argv[]) try { +#if CK_TILE_USE_WMMA + return !run_gemm_example(arg_parser); +#else return !run_gemm_example(arg_parser); +#endif } catch(const std::runtime_error& e) { diff --git a/example/ck_tile/04_img2col/README.md b/example/ck_tile/04_img2col/README.md index df5c51a9c0..3b1b6f999b 100644 --- a/example/ck_tile/04_img2col/README.md +++ b/example/ck_tile/04_img2col/README.md @@ -7,7 +7,7 @@ This folder contains example for Image to Column using ck_tile tile-programming # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for 
example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ make tile_example_img2col -j ``` This will result in an executable `build/bin/tile_example_img2col` diff --git a/example/ck_tile/04_img2col/image_to_column.cpp b/example/ck_tile/04_img2col/image_to_column.cpp index 299a2f3444..22b5d640d8 100644 --- a/example/ck_tile/04_img2col/image_to_column.cpp +++ b/example/ck_tile/04_img2col/image_to_column.cpp @@ -55,13 +55,12 @@ float image_to_column(const image_to_column_traits& traits, args.N * args.output_spatial_lengths[0] * args.output_spatial_lengths[1], args.filter_spatial_lengths[0] * args.filter_spatial_lengths[1] * args.C, args.G); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 2; float ave_time = ck_tile::launch_kernel( - stream_conf, - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + stream_conf, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; } diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp index cf816caa88..a110c2f98d 100644 --- a/example/ck_tile/05_reduce/reduce.cpp +++ b/example/ck_tile/05_reduce/reduce.cpp @@ -94,18 +94,18 @@ bool run(const ck_tile::ArgParser& arg_parser) throw std::runtime_error("Wrong! 
Arguments not supported!\n"); } - float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, - ck_tile::make_kernel( - Kernel{}, - kGridSize, - kBlockSize, - 0, - static_cast(x_buf.GetDeviceBuffer()), - static_cast(y_buf.GetDeviceBuffer()), - input_shape, - input_strides, - kept_dim, - reduce_dims)); + float ave_time = launch_kernel( + ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, + ck_tile::make_kernel(Kernel{}, + kGridSize, + kBlockSize, + 0, + static_cast(x_buf.GetDeviceBuffer()), + static_cast(y_buf.GetDeviceBuffer()), + input_shape, + input_strides, + kept_dim, + reduce_dims)); std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C; diff --git a/example/ck_tile/06_permute/README.md b/example/ck_tile/06_permute/README.md index 03bd810ff4..5e88e71572 100644 --- a/example/ck_tile/06_permute/README.md +++ b/example/ck_tile/06_permute/README.md @@ -15,7 +15,7 @@ args: ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... make tile_example_permute -j ``` This will result in an executable `build/bin/tile_example_permute` diff --git a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp index 688f4f3d50..d486196fc3 100644 --- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp +++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -115,11 +115,12 @@ struct matrix_core_swizzle_kernel __host__ void operator()(const ck_tile::stream_config& s) const { - ck_tile::kentry<<>>(a); + ck_tile::kentry<1, kernel><<>>(a); } struct kernel { + static constexpr int kBlockSize = BLOCK_SIZE; __device__ static constexpr auto get_src_dist() { using namespace ck_tile; diff --git a/example/ck_tile/06_permute/permute.cpp b/example/ck_tile/06_permute/permute.cpp index 477ae370b9..aafece0f25 100644 --- a/example/ck_tile/06_permute/permute.cpp +++ b/example/ck_tile/06_permute/permute.cpp @@ -53,11 +53,11 @@ float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s) auto kargs = Kernel::MakeKargs(a); - const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(a); + const dim3 blocks = Kernel::BlockSize(); - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + float ave_time = + ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs)); return ave_time; } @@ -69,11 +69,11 @@ float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s) auto kargs = Kernel::MakeKargs(a); - const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(a); + const dim3 blocks = Kernel::BlockSize(); - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + float ave_time = + ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs)); return ave_time; } @@ -85,11 +85,11 @@ float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s) auto kargs = Kernel::MakeKargs(a); - const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(a); + const dim3 blocks = Kernel::BlockSize(); - float ave_time = ck_tile::launch_kernel( - 
s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + float ave_time = + ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs)); return ave_time; } diff --git a/example/ck_tile/09_topk_softmax/README.md b/example/ck_tile/09_topk_softmax/README.md index 1043012900..2e15aeaae5 100644 --- a/example/ck_tile/09_topk_softmax/README.md +++ b/example/ck_tile/09_topk_softmax/README.md @@ -6,7 +6,7 @@ This folder contains example for topk-softmax kernel using ck_tile tile-programm ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... make tile_example_topk_softmax -j ``` This will result in an executable `build/bin/tile_example_topk_softmax` diff --git a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp index 249a307b81..c2bad24cfe 100644 --- a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp +++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp @@ -13,11 +13,11 @@ \ auto kargs = kernel::MakeKargs(a); \ \ - const dim3 grids = kernel::GridSize(a); \ - constexpr dim3 blocks = kernel::BlockSize(); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(); \ \ - float ave_time = ck_tile::launch_kernel( \ - s, ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs)); \ + float ave_time = \ + ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(kernel{}, grids, blocks, 0, kargs)); \ \ return ave_time; diff --git a/example/ck_tile/10_rmsnorm2d/README.md b/example/ck_tile/10_rmsnorm2d/README.md index c067496477..1d27ad153e 100644 --- a/example/ck_tile/10_rmsnorm2d/README.md +++ b/example/ck_tile/10_rmsnorm2d/README.md @@ -6,7 +6,7 @@ This folder contains example for Rmsnorm2D forward using ck_tile tile-programmin ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # 
you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... make tile_rmsnorm2d_fwd -j ``` This will result in an executable `build/bin/tile_rmsnorm2d_fwd` diff --git a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp index e0a71452ea..511efeeaec 100644 --- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp +++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp @@ -138,12 +138,11 @@ bool run(const ck_tile::ArgParser& arg_parser) auto kargs = Kernel::MakeKargs(args); const dim3 grids = Kernel::GridSize(args); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat}; - ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + ck_tile::launch_kernel(s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); bool pass = true; diff --git a/example/ck_tile/10_rmsnorm2d/generate.py b/example/ck_tile/10_rmsnorm2d/generate.py index b0ba400af1..ea8dfdf9ce 100644 --- a/example/ck_tile/10_rmsnorm2d/generate.py +++ b/example/ck_tile/10_rmsnorm2d/generate.py @@ -249,7 +249,7 @@ float rmsnorm2d_fwd_(const S& s, A a) using Kernel = ck_tile::Rmsnorm2dFwd; const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto kargs = Kernel::MakeKargs(a); @@ -257,7 +257,7 @@ float rmsnorm2d_fwd_(const S& s, A a) std::cout << ", " << Kernel::GetName() << std::flush; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{{}}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{{}}, grids, blocks, 0, kargs)); }} """ diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md index 960369b78d..f9ba76c9e3 
100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/README.md @@ -6,7 +6,7 @@ This folder contains example for add + Rmsnorm2D + rowwise dynamic quantization ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... make tile_add_rmsnorm2d_rdquant_fwd -j ``` This will result in an executable `build/bin/tile_add_rmsnorm2d_rdquant_fwd` diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp index 449bc17e04..ace5fe0c4f 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp @@ -136,12 +136,11 @@ bool run(const ck_tile::ArgParser& arg_parser) auto kargs = Kernel::MakeKargs(args); const dim3 grids = Kernel::GridSize(args); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto s = ck_tile::stream_config{nullptr, true, 0, warmup, repeat}; - ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + ck_tile::launch_kernel(s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); bool pass = true; diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp index 25b10e1dc4..d997596414 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_instance_common.hpp @@ -58,7 +58,7 @@ float add_rmsnorm2d_rdquant_fwd_(const S& s, A a) using Kernel = 
ck_tile::AddRmsnorm2dRdquantFwd; const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto kargs = Kernel::MakeKargs(a); @@ -66,5 +66,5 @@ float add_rmsnorm2d_rdquant_fwd_(const S& s, A a) std::cout << ", " << Kernel::GetName() << std::flush; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } diff --git a/example/ck_tile/12_smoothquant/README.md b/example/ck_tile/12_smoothquant/README.md index d6b815f8cf..6b3acd558b 100644 --- a/example/ck_tile/12_smoothquant/README.md +++ b/example/ck_tile/12_smoothquant/README.md @@ -6,7 +6,7 @@ This folder contains example for smoothquant using ck_tile tile-programming impl ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... 
make tile_smoothquant -j ``` This will result in an executable `build/bin/tile_smoothquant` diff --git a/example/ck_tile/12_smoothquant/example_smoothquant.cpp b/example/ck_tile/12_smoothquant/example_smoothquant.cpp index 5fcacacee8..e688947d71 100644 --- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp +++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp @@ -126,12 +126,11 @@ bool run(const ck_tile::ArgParser& arg_parser) auto kargs = Kernel::MakeKargs(args); const dim3 grids = Kernel::GridSize(args); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto s = ck_tile::stream_config{nullptr, true, 1, warmup, repeat}; - ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + ck_tile::launch_kernel(s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); bool pass = true; diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp index 555159566e..873a474afb 100644 --- a/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp +++ b/example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp @@ -50,7 +50,7 @@ float smoothquant_(const S& s, A a) using Kernel = ck_tile::Smoothquant; const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto kargs = Kernel::MakeKargs(a); @@ -58,5 +58,5 @@ float smoothquant_(const S& s, A a) std::cout << ", " << Kernel::GetName() << std::flush; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } diff --git a/example/ck_tile/13_moe_sorting/README.md b/example/ck_tile/13_moe_sorting/README.md index 1822ff3a37..c99f40aa57 100644 --- 
a/example/ck_tile/13_moe_sorting/README.md +++ b/example/ck_tile/13_moe_sorting/README.md @@ -6,7 +6,7 @@ This folder contains example for moe-sorting kernel using ck_tile tile-programmi ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... make tile_example_moe_sorting -j ``` This will result in an executable `build/bin/tile_example_moe_sorting` diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp index a71c5e51a6..d614b8462a 100644 --- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp +++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp @@ -209,7 +209,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi auto kargs = kernel::MakeKargs(a); \ const dim3 grids = kernel::GridSize(a); \ const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() #define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_) \ @@ -227,7 +227,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi auto kargs = kernel::MakeKargs(a); \ const dim3 grids = kernel::GridSize(a); \ const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() #if MOE_SORTING_SUPPORT_LARGE_EXPERT #define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_) \ @@ -283,7 +283,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi const dim3 grids = kernel::GridSize(a); \ const dim3 blocks = kernel::BlockSize(a); \ const auto lds_size = kernel::GetSmemSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, 
kargs); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs); \ }() #define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_) \ @@ -334,15 +334,15 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi } \ } -#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_) \ - [&]() { \ - using problem_ = \ - ck_tile::MoeSortingClearWorkspaceProblem; \ - using kernel = ck_tile::MoeSortingClearWorkspaceKernel; \ - auto kargs = kernel::MakeKargs(a); \ - const dim3 grids = kernel::GridSize(a); \ - const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ +#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_) \ + [&]() { \ + using problem_ = \ + ck_tile::MoeSortingClearWorkspaceProblem; \ + using kernel = ck_tile::MoeSortingClearWorkspaceKernel; \ + auto kargs = kernel::MakeKargs(a); \ + const dim3 grids = kernel::GridSize(a); \ + const dim3 blocks = kernel::BlockSize(a); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s) diff --git a/example/ck_tile/14_moe_smoothquant/README.md b/example/ck_tile/14_moe_smoothquant/README.md index 599b4c3489..c10a922607 100644 --- a/example/ck_tile/14_moe_smoothquant/README.md +++ b/example/ck_tile/14_moe_smoothquant/README.md @@ -9,7 +9,7 @@ Unlike standard smoothquant op, the input scale is from different expert `[exper ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... 
make tile_example_moe_smoothquant -j ``` This will result in an executable `build/bin/tile_example_moe_smoothquant` diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp index 885d9ff7bf..607217ea52 100644 --- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp +++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp @@ -53,7 +53,7 @@ float moe_smoothquant_(const S& s, A a) using Kernel = ck_tile::MoeSmoothquant; const dim3 grids = Kernel::GridSize(a); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 blocks = Kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; auto kargs = Kernel::MakeKargs(a); @@ -61,5 +61,5 @@ float moe_smoothquant_(const S& s, A a) std::cout << ", " << Kernel::GetName() << std::flush; return ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp index 6e54df9fde..9d1675386f 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp @@ -53,7 +53,7 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a) using f_kernel = ck_tile::FusedMoeGemmKernel; const dim3 grids = f_kernel::GridSize(a); - constexpr dim3 blocks = f_kernel::BlockSize(); + const dim3 blocks = f_kernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = 1; static int printed = 0; @@ -66,5 +66,5 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a) } return ck_tile::launch_kernel( - s, ck_tile::make_kernel(f_kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(f_kernel{}, 
grids, blocks, 0, kargs)); } diff --git a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp index 5f87393a0a..441aa84edf 100644 --- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp +++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp @@ -213,7 +213,7 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til auto kargs = kernel::MakeKargs(a); \ const dim3 grids = kernel::GridSize(a); \ const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() #define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_) \ @@ -231,7 +231,7 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til auto kargs = kernel::MakeKargs(a); \ const dim3 grids = kernel::GridSize(a); \ const dim3 blocks = kernel::BlockSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs); \ }() #if MOE_SORTING_SUPPORT_LARGE_EXPERT #define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_) \ @@ -287,7 +287,7 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til const dim3 grids = kernel::GridSize(a); \ const dim3 blocks = kernel::BlockSize(a); \ const auto lds_size = kernel::GetSmemSize(a); \ - return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs); \ + return ck_tile::make_kernel(kernel{}, grids, blocks, lds_size, kargs); \ }() #define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_) \ diff --git a/example/ck_tile/16_batched_gemm/README.md b/example/ck_tile/16_batched_gemm/README.md index 34b56db526..8a64a3912c 100644 --- a/example/ck_tile/16_batched_gemm/README.md +++ b/example/ck_tile/16_batched_gemm/README.md @@ -7,7 
+7,7 @@ This folder contains example for batched GEMM using ck_tile tile-programming imp # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ make tile_example_batched_gemm -j ``` This will result in an executable `build/bin/tile_example_batched_gemm` diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp index 9616abb800..09ba010e00 100644 --- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp +++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp @@ -142,7 +142,6 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre DsLayout, CLayout, CDEElementWise, - GemmPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -156,8 +155,8 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre using Kernel = ck_tile::BatchedGemmKernel; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -176,7 +175,7 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre } ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md index 29642e96c1..8715ee79e1 100644 --- a/example/ck_tile/17_grouped_gemm/README.md +++ b/example/ck_tile/17_grouped_gemm/README.md @@ -148,7 +148,7 @@ All the necessary parameters are set, the tiling is computed, 
the GEMM pipeline # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ # The basic pipeline method on the gemm calculation make tile_example_grouped_gemm -j ``` diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index a821af0649..527ef1e466 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -29,10 +29,6 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s, void* kargs_ptr, bool splitk) { - constexpr bool kPadM = false; - constexpr bool kPadN = false; - constexpr bool kPadK = false; - constexpr ck_tile::index_t TileParitionerGroupNum = 8; constexpr ck_tile::index_t TileParitionerM01 = 4; @@ -44,7 +40,6 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s, using TilePartitioner = ck_tile:: GemmSpatiallyLocalTilePartitioner; - using Traits = ck_tile::TileGemmTraits; using GemmUniversalTraits = ck_tile::PersistentTileGemmUniversalTraits; - using GemmPipelineProblem = - ck_tile::GemmPipelineProblem; float ave_time{0}; @@ -82,7 +75,6 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s, ck_tile::tuple<>, CLayout, ck_tile::element_wise::PassThrough, - GemmPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, GemmConfig::M_Warp, @@ -92,9 +84,9 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s, GemmConfig::K_Warp_Tile, UniversalGemmProblem::TransposeC, memory_operation>>; - using Kernel = ck_tile::GroupedGemmKernel; - constexpr dim3 blocks = Kernel::BlockSize(); - const dim3 grids = Kernel::MaxOccupancyGridSize(s); + using Kernel = ck_tile::GroupedGemmKernel; + const dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::MaxOccupancyGridSize(s); if(s.log_level_ > 0) { @@ -105,7 +97,7 @@ float 
grouped_gemm_tileloop(const ck_tile::stream_config& s, ave_time = ck_tile::launch_kernel(s, - ck_tile::make_kernel( + ck_tile::make_kernel( Kernel{}, grids, blocks, diff --git a/example/ck_tile/18_flatmm/README.md b/example/ck_tile/18_flatmm/README.md index beaac785fc..eeaa7658bd 100644 --- a/example/ck_tile/18_flatmm/README.md +++ b/example/ck_tile/18_flatmm/README.md @@ -7,7 +7,7 @@ This folder contains example for FLATMM using ck_tile tile-programming implement # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ # The basic pipeline method on the flatmm calculation make tile_example_flatmm_basic -j ``` diff --git a/example/ck_tile/18_flatmm/flatmm_basic.cpp b/example/ck_tile/18_flatmm/flatmm_basic.cpp index 50bf791207..93117e5b75 100644 --- a/example/ck_tile/18_flatmm/flatmm_basic.cpp +++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp @@ -101,7 +101,6 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c DsLayout, ELayout, CDEElementWise, - CodegenPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, FlatmmConfig::M_Warp, @@ -119,8 +118,8 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -171,15 +170,13 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_c ave_time = ck_tile::launch_kernel_time_mask( s, run_flush_cache, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } else { - ave_time = - 
ck_tile::launch_kernel(s, - ck_tile::make_kernel( - Kernel{}, grids, blocks, 0, kargs)); + ave_time = ck_tile::launch_kernel( + s, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); } return ave_time; }; diff --git a/example/ck_tile/18_flatmm/run_flatmm_example.inc b/example/ck_tile/18_flatmm/run_flatmm_example.inc index 8f39b07be5..ff1a239cba 100644 --- a/example/ck_tile/18_flatmm/run_flatmm_example.inc +++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc @@ -40,9 +40,11 @@ template auto shuffle_b(const ck_tile::HostTensor& t) { assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - constexpr int divisor = FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4; + int n_ = t.get_lengths()[1]; + int k_ = t.get_lengths()[0]; + + int divisor = ck_tile::is_wave32() ? (FlatmmConfig::N_Warp_Tile == 32 ? 1 : 2) + : (FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4); ck_tile::HostTensor t_view({n_ / FlatmmConfig::N_Warp_Tile, FlatmmConfig::N_Warp_Tile, k_ / FlatmmConfig::K_Warp_Tile, @@ -213,6 +215,16 @@ int run_flatmm_example_with_layouts(int argc, ck_tile::FillUniformDistribution{1.f, 1.f}(a_host); ck_tile::FillUniformDistribution{1.f, 1.f}(b_origin_host); } + else if(init_method == 3) + { + ck_tile::FillUniformDistribution{-.5f, .5f}(a_host); + ck_tile::FillUniformDistribution{1.f, 1.f}(b_origin_host); + } + else if(init_method == 4) + { + ck_tile::FillUniformDistribution{1.f, 1.f}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f}(b_origin_host); + } else { a_host.SetZero(); diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_batched_forward_dispatch.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_batched_forward_dispatch.hpp index c39ed59ca7..3b7fbd304b 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_batched_forward_dispatch.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_batched_forward_dispatch.hpp @@ -135,9 +135,9 @@ struct batched_forward_causal_local_bias_dropout_dispatch constexpr 
dim3 kBlockSize = HstuKernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = HstuKernel::kBlockPerCu; - (void)ck_tile::launch_kernel(ck_tile::stream_config{stream, false}, - ck_tile::make_kernel( - HstuKernel{}, kGridSize, kBlockSize, 0, kargs)); + (void)ck_tile::launch_kernel( + ck_tile::stream_config{stream, false}, + ck_tile::make_kernel(HstuKernel{}, kGridSize, kBlockSize, 0, kargs)); }; }; diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_jagged_forward_dispatch.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_jagged_forward_dispatch.hpp index e3f6e00f79..b51f766da2 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_jagged_forward_dispatch.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_jagged_forward_dispatch.hpp @@ -126,9 +126,9 @@ struct jagged_forward_causal_local_bias_dropout_dispatch constexpr dim3 kBlockSize = HstuKernel::BlockSize(); constexpr ck_tile::index_t kBlockPerCu = HstuKernel::kBlockPerCu; - (void)ck_tile::launch_kernel(ck_tile::stream_config{stream, false}, - ck_tile::make_kernel( - HstuKernel{}, kGridSize, kBlockSize, 0, kargs)); + (void)ck_tile::launch_kernel( + ck_tile::stream_config{stream, false}, + ck_tile::make_kernel(HstuKernel{}, kGridSize, kBlockSize, 0, kargs)); }; }; diff --git a/example/ck_tile/19_gemm_multi_d/README.md b/example/ck_tile/19_gemm_multi_d/README.md index 7e8cd87546..2cf2b1ea03 100644 --- a/example/ck_tile/19_gemm_multi_d/README.md +++ b/example/ck_tile/19_gemm_multi_d/README.md @@ -8,7 +8,7 @@ This folder contains example for Multiple D GEMM using ck_tile tile-programming mkdir build && cd build #you can replace < arch> with the appropriate architecture(for example gfx90a or gfx942) or \ leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ #The basic pipeline method on the gemm calculation make tile_example_gemm_multi_d_fp16 -j ``` diff --git a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp 
b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp index d7bf2b5c42..fc52cb66cc 100644 --- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp +++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp @@ -146,7 +146,6 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config& DsLayout, CLayout, CDEElementWise, - GemmPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -160,8 +159,8 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config& using Kernel = ck_tile::GemmKernelMultiD; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -176,7 +175,7 @@ auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config& } ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt index c05dcac09c..5cb1d2650e 100644 --- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt +++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt @@ -6,3 +6,6 @@ target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMP add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp) target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + +add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp) +target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) diff --git 
a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp new file mode 100644 index 0000000000..52eaab9f94 --- /dev/null +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "grouped_convolution_utils.hpp" + +template , + typename DsLayout = ck_tile::tuple<>, + typename CDEElementWise = ck_tile::element_wise::PassThrough> +float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args, + const ck_tile::stream_config& s) +{ + constexpr int kBlockPerCu = 1; + + constexpr ck_tile::index_t M_Tile = 64; + constexpr ck_tile::index_t N_Tile = 64; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 16; + + constexpr ck_tile::index_t VectorSizeA = 8; + constexpr ck_tile::index_t VectorSizeB = 8; + constexpr ck_tile::index_t VectorSizeC = 8; + + // Implicit GEMM Traits + using CodegenShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + using GroupedConvTraitsType = + ck_tile::GroupedConvTraits; + using CodegenPipelineProblem = + ck_tile::GemmPipelineProblem; + using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1; + + const auto Run = [&](const auto memory_operation_) { + constexpr auto memory_operation = memory_operation_.value; + + using ConvEpilogue = ck_tile::CShuffleEpilogue< + 
ck_tile::CShuffleEpilogueProblem>; + + using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << '\n' + << "Vector size A: " << CodegenPipeline::GetVectorSizeA() + << ", Vector size B: " << CodegenPipeline::GetVectorSizeB() + << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + + if(args.k_batch == 1) + { + return Run(ck_tile::integral_constant{}); + } + else + { + return Run(ck_tile::integral_constant{}); + } +} + +#include "run_grouped_convolution_bwd_data_example.inc" + +template +int run_grouped_conv_bwd_data_example_prec_type( + std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[]) +{ + using NWGC = ck_tile::tensor_layout::convolution::NWGC; + using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; + + using GKXC = ck_tile::tensor_layout::convolution::GKXC; + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; + + using NWGK = ck_tile::tensor_layout::convolution::NWGK; + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NDHWGK = 
ck_tile::tensor_layout::convolution::NDHWGK; + + if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NWGC{}, GKXC{}, NWGK{}); + } + else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NHWGC{}, GKYXC{}, NHWGK{}); + } + else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK") + { + return run_grouped_conv_bwd_data_example_with_layouts{}, + InPrecType, + WeiPrecType, + OutPrecType>( + argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{}); + } + else + { + throw std::runtime_error("Unsupported memory layout!"); + } +} + +int run_grouped_conv_bwd_data_example(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + std::string data_type = arg_parser.get_str("prec"); + std::string in_layout = arg_parser.get_str("in_layout"); + std::string wei_layout = arg_parser.get_str("wei_layout"); + std::string out_layout = arg_parser.get_str("out_layout"); + + if(data_type == "fp16") + { + return run_grouped_conv_bwd_data_example_prec_type( + in_layout, wei_layout, out_layout, argc, argv); + } + else if(data_type == "bf16") + { + return run_grouped_conv_bwd_data_example_prec_type( + in_layout, wei_layout, out_layout, argc, argv); + } + else + { + throw std::runtime_error("Unsupported data type for this operation!"); + } +} + +int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_data_example(argc, argv); } diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp index 67db775e09..debbb6bc0c 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp +++ 
b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp @@ -78,7 +78,6 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args, typename GroupedConvTraitsType::ImplicitGemmDsLayout, ck_tile::tensor_layout::gemm::RowMajor, CDEElementWise, - CodegenPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -98,8 +97,8 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args, ConvEpilogue>; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(kargs); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -123,7 +122,7 @@ float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args, float ave_time = ck_tile::launch_kernel_time_mask( s, Kernel::Preprocess(kargs, s), - ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp index ce19c77bc1..6700970583 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp @@ -77,7 +77,6 @@ float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_til typename GroupedConvTraitsType::ImplicitGemmDsLayout, ck_tile::tensor_layout::gemm::RowMajor, CDEElementWise, - CodegenPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -97,8 +96,8 @@ float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_til ConvEpilogue>; auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(kargs); - constexpr dim3 blocks = 
Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); if(!Kernel::IsSupportedArgument(kargs)) { @@ -120,7 +119,7 @@ float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_til } float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc new file mode 100644 index 0000000000..d1cf4fade7 --- /dev/null +++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +template +float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args, + int n_warmup, + int n_repeat) +{ + float ave_time = grouped_conv_bwd_data( + args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat}); + + std::size_t flop = args.GetFlops(); + std::size_t num_byte = args.GetByte(); + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << std::endl; + + return ave_time; +} + +template +int run_grouped_conv_bwd_data_example_with_layouts( + int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return -1; + + using AccDataType = float; + + std::vector filter_spatial_lengths; + std::vector image_spatial_lengths; + std::vector strides; + std::vector dilations; + std::vector lpads; + std::vector rpads; + + const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths, + 
image_spatial_lengths, + strides, + dilations, + lpads, + rpads, + arg_parser); + + ck_tile::conv::ConvParam conv_param{num_dim_sp, + arg_parser.get_int("g"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("c"), + filter_spatial_lengths, + image_spatial_lengths, + strides, + dilations, + lpads, + rpads}; + + ck_tile::index_t kbatch = arg_parser.get_int("split_k"); + int n_warmup = arg_parser.get_int("warmup"); + int n_repeat = arg_parser.get_int("repeat"); + ck_tile::index_t init_method = arg_parser.get_int("init"); + + const auto in_g_n_c_wis_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + const auto wei_g_k_c_xs_desc = + ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + const auto out_g_n_k_wos_desc = + ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + ck_tile::HostTensor input(in_g_n_c_wis_desc); + ck_tile::HostTensor weight(wei_g_k_c_xs_desc); + ck_tile::HostTensor output(out_g_n_k_wos_desc); + + if(init_method == 0) + { + ck_tile::FillUniformDistribution{-1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{-1.f, 1.f}(output); + } + else if(init_method == 1) + { + ck_tile::FillMonotonicSeq{}(weight); + ck_tile::FillMonotonicSeq{}(output); + } + else if(init_method == 2) + { + ck_tile::FillUniformDistribution{1.f, 1.f}(weight); + ck_tile::FillUniformDistribution{1.f, 1.f}(output); + } + else + { + weight.SetZero(); + output.SetZero(); + } + + ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes()); + ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes()); + ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes()); + + input_dev_buf.SetZero(); + weight_dev_buf.ToDevice(weight.data()); + output_dev_buf.ToDevice(output.data()); + + ck_tile::GroupedConvBwdDataHostArgs args(conv_param, + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + 
output_dev_buf.GetDeviceBuffer(), + kbatch); + + std::cout << "Run Grouped Conv Bwd Data kernel" << std::endl; + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + invoke_grouped_conv_bwd_data(args, n_warmup, n_repeat); + + input_dev_buf.FromDevice(input.data()); + bool pass = true; + + if(arg_parser.get_int("v") == 1) + { + ck_tile::HostTensor input_host_ref(in_g_n_c_wis_desc); + input_host_ref.SetZero(); + + ck_tile::reference_grouped_conv_bwd_data( + input_host_ref, + weight, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = + *std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end()); + const auto rtol_atol = + calculate_rtol_atol( + GemmK, kbatch, max_accumulated_value); + pass = ck_tile::check_err(input, + input_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl; + } + else if(arg_parser.get_int("v") == 2) + { + throw std::runtime_error("Unsupported gpu verification !!!"); + } + + return pass; +} diff --git a/example/ck_tile/21_elementwise/elementwise_example.cpp b/example/ck_tile/21_elementwise/elementwise_example.cpp index 469345b46c..2cc539e117 100644 --- a/example/ck_tile/21_elementwise/elementwise_example.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example.cpp @@ -167,17 +167,17 @@ bool run(const ck_tile::ArgParser& arg_parser) } // 4. 
Run the kernel - float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, - ck_tile::make_kernel( - Kernel{}, - kGridSize, - kBlockSize, - 0, - input_size, - ck_tile::make_tuple(N, 1), // Input Stride - ck_tile::make_tuple(N, 1), // Output Stride - input_tensors, - static_cast(y_buf.GetDeviceBuffer()))); + float ave_time = launch_kernel( + ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, + ck_tile::make_kernel(Kernel{}, + kGridSize, + kBlockSize, + 0, + input_size, + ck_tile::make_tuple(N, 1), // Input Stride + ck_tile::make_tuple(N, 1), // Output Stride + input_tensors, + static_cast(y_buf.GetDeviceBuffer()))); std::cout << "Average time: " << ave_time << " ms" << std::endl; diff --git a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp index 4a031265c9..7087d092a2 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp @@ -113,7 +113,7 @@ bool run(const ck_tile::ArgParser& arg_parser) // Run the kernel float ave_time = launch_kernel( ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, - ck_tile::make_kernel( + ck_tile::make_kernel( Kernel{}, kGridSize, kBlockSize, diff --git a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp index aff74ae250..28cdaf27b9 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp @@ -112,17 +112,17 @@ bool run(const ck_tile::ArgParser& arg_parser) } // 4. 
Run the kernel - float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, - ck_tile::make_kernel( - Kernel{}, - kGridSize, - kBlockSize, - 0, // Shared memory - op_lengths, // Logical dimensions for the operation (M, N) - input_strides, // Strides for input tensor(s) - output_strides, // Strides for output tensor (N, M) - input_tensors, - static_cast(y_buf.GetDeviceBuffer()))); + float ave_time = launch_kernel( + ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, + ck_tile::make_kernel(Kernel{}, + kGridSize, + kBlockSize, + 0, // Shared memory + op_lengths, // Logical dimensions for the operation (M, N) + input_strides, // Strides for input tensor(s) + output_strides, // Strides for output tensor (N, M) + input_tensors, + static_cast(y_buf.GetDeviceBuffer()))); std::cout << "Average time: " << ave_time << " ms" << std::endl; diff --git a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp index d83592a033..782d3da24d 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp @@ -99,17 +99,17 @@ bool run(const ck_tile::ArgParser& arg_parser) } // 4. 
Run the kernel - float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, - ck_tile::make_kernel( - Kernel{}, - kGridSize, - kBlockSize, - 0, - input_size, - ck_tile::make_tuple(N, 1), // Input Stride - ck_tile::make_tuple(N, 1), // Output Stride - input_tensors, - static_cast(y_buf.GetDeviceBuffer()))); + float ave_time = launch_kernel( + ck_tile::stream_config{nullptr, true, 0, warmup, repeat}, + ck_tile::make_kernel(Kernel{}, + kGridSize, + kBlockSize, + 0, + input_size, + ck_tile::make_tuple(N, 1), // Input Stride + ck_tile::make_tuple(N, 1), // Output Stride + input_tensors, + static_cast(y_buf.GetDeviceBuffer()))); std::cout << "Average time: " << ave_time << " ms" << std::endl; diff --git a/example/ck_tile/35_batched_transpose/README.md b/example/ck_tile/35_batched_transpose/README.md index 38bb2b32e4..56e9610b35 100644 --- a/example/ck_tile/35_batched_transpose/README.md +++ b/example/ck_tile/35_batched_transpose/README.md @@ -6,7 +6,7 @@ This folder contains example for batched Transpose using ck_tile tile-programmin # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ # Make the transpose executable make tile_example_batched_transpose -j ``` diff --git a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp index 1f0f0b9bc1..931a9dfa3c 100644 --- a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp +++ b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp @@ -74,8 +74,8 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con auto kargs = kernel::MakeKargs(a); - const dim3 grids = kernel::GridSize(a); - constexpr dim3 blocks = kernel::BlockSize(); + const dim3 grids = kernel::GridSize(a); + const dim3 blocks = kernel::BlockSize(); 
printf("Pipeline: %d\n", Config::kPipelineId); printf("Grid: x=%u y=%u z=%u\n", grids.x, grids.y, grids.z); @@ -96,8 +96,8 @@ float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_con printf("Launching Kernel...\n"); - float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs)); + float ave_time = + ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(kernel{}, grids, blocks, 0, kargs)); printf("Kernel finished...\n"); diff --git a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt index 914fdac0e4..12cf874c73 100644 --- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt +++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt @@ -8,9 +8,8 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95") add_executable(tile_example_gemm_aquant_basic EXCLUDE_FROM_ALL gemm_aquant_basic.cpp) target_compile_options(tile_example_gemm_aquant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) - - add_executable(tile_example_gemm_aquant_preshuffle EXCLUDE_FROM_ALL gemm_aquant_preshuffle.cpp) - target_compile_options(tile_example_gemm_aquant_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) + add_executable(tile_example_gemm_bquant_basic EXCLUDE_FROM_ALL gemm_bquant_basic.cpp) + target_compile_options(tile_example_gemm_bquant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) else() message(DEBUG "Skipping ck_tile quant gemm tests for current target") endif() diff --git a/example/ck_tile/38_block_scale_gemm/README.md b/example/ck_tile/38_block_scale_gemm/README.md index 742a88dee7..6d6aec28c8 100644 --- a/example/ck_tile/38_block_scale_gemm/README.md +++ b/example/ck_tile/38_block_scale_gemm/README.md @@ -7,9 +7,10 @@ This folder contains example for Block Scale GEMM using ck_tile tile-programming # in the root of ck_tile mkdir build && cd build # you can replace with the appropriate 
architecture (for example gfx90a or gfx942) or leave it blank -sh ../script/cmake-ck-dev.sh ../ +../script/cmake-ck-dev.sh ../ # The aquant pipeline method on the gemm calculation make tile_example_gemm_aquant_basic -j +make tile_example_gemm_bquant_basic -j ``` This will result in an executable `build/bin/tile_example_gemm_aquant_basic` diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp index 2ac08c7343..d5a38fe754 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp @@ -8,11 +8,10 @@ #include #include -#include "ck_tile/core/config.hpp" -#include "ck_tile/host.hpp" #include "gemm_utils.hpp" -template + uint32_t QuantGroupSize> float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s) { constexpr bool kPadM = false; constexpr bool kPadN = false; constexpr bool kPadK = false; - constexpr int kBlockPerCu = 1; - static_assert(std::is_same_v); - constexpr ck_tile::index_t M_Tile = 16; - constexpr ck_tile::index_t N_Tile = 64; - constexpr ck_tile::index_t K_Tile = 256; + constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile; + constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile; + constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile; - constexpr ck_tile::index_t M_Warp = 1; - constexpr ck_tile::index_t N_Warp = 4; - constexpr ck_tile::index_t K_Warp = 1; + constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp; + constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp; + constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp; - constexpr ck_tile::index_t M_Warp_Tile = 16; - constexpr ck_tile::index_t N_Warp_Tile = 16; - constexpr ck_tile::index_t K_Warp_Tile = 32; + constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile; 
using CodegenGemmShape = ck_tile::TileGemmShape, @@ -52,8 +48,13 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s using TilePartitioner = ck_tile::GemmTile1DPartitioner; - using CodegenGemmTraits = - ck_tile::TileGemmAQuantTraits; + using CodegenGemmTraits = ck_tile::TileGemmAQuantTraits; using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase, CLayout, ck_tile::element_wise::PassThrough, - CodegenPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -111,8 +112,8 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + const dim3 blocks = Kernel::BlockSize(); if(args.k_batch != 1) { @@ -136,7 +137,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s } float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; @@ -187,13 +188,14 @@ int run_gemm_example(int argc, char* argv[]) if(data_type == "fp8") { using TypeConfig = - decltype(GemmQuantTypeConfig{}); + decltype(GemmQuantTypeConfig{}); return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); } else if(data_type == "bf8") { - using TypeConfig = decltype(GemmQuantTypeConfig{}); + using TypeConfig = + decltype(GemmQuantTypeConfig{}); return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); } @@ -201,32 +203,18 @@ int run_gemm_example(int argc, char* argv[]) { using TypeConfig = decltype(GemmQuantTypeConfig{}); - return run_gemm_example_prec_type, TypeConfig, 128>( + return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); } else 
if(data_type == "i4bf8") { using TypeConfig = decltype(GemmQuantTypeConfig{}); - return run_gemm_example_prec_type, TypeConfig, 128>( - a_layout, b_layout, argc, argv); - } - else if(data_type == "i4f32fp8") - { - using TypeConfig = - decltype(GemmQuantTypeConfig{}); - return run_gemm_example_prec_type, TypeConfig, 128>( - a_layout, b_layout, argc, argv); - } - else if(data_type == "i4f32bf8") - { - using TypeConfig = - decltype(GemmQuantTypeConfig{}); - return run_gemm_example_prec_type, TypeConfig, 128>( + return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); } else @@ -235,4 +223,4 @@ int run_gemm_example(int argc, char* argv[]) } } -int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp b/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp index f4f1aa98d3..13c416110a 100644 --- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp +++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp @@ -8,11 +8,10 @@ #include #include -#include "ck_tile/core/config.hpp" -#include "ck_tile/host.hpp" #include "gemm_utils.hpp" -template + uint32_t QuantGroupSize> float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s) { constexpr bool kPadM = false; @@ -33,17 +31,17 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s static_assert(std::is_same_v); - constexpr ck_tile::index_t M_Tile = 16; - constexpr ck_tile::index_t N_Tile = 64; - constexpr ck_tile::index_t K_Tile = 256; + constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile; + constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile; + constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile; - constexpr ck_tile::index_t M_Warp = 1; - constexpr ck_tile::index_t N_Warp = 4; - constexpr ck_tile::index_t K_Warp = 1; + 
constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp; + constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp; + constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp; - constexpr ck_tile::index_t M_Warp_Tile = 16; - constexpr ck_tile::index_t N_Warp_Tile = 16; - constexpr ck_tile::index_t K_Warp_Tile = 32; + constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile; using CodegenGemmShape = ck_tile::TileGemmShape, @@ -52,8 +50,13 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s using TilePartitioner = ck_tile::GemmTile1DPartitioner; - using CodegenGemmTraits = - ck_tile::TileGemmAQuantTraits; + using CodegenGemmTraits = ck_tile::TileGemmAQuantTraits; using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase, CLayout, ck_tile::element_wise::PassThrough, - CodegenPipelineProblem::kBlockSize, TilePartitioner::MPerBlock, TilePartitioner::NPerBlock, M_Warp, @@ -111,8 +114,8 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); - constexpr dim3 blocks = Kernel::BlockSize(); + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + const dim3 blocks = Kernel::BlockSize(); if(args.k_batch != 1) { @@ -136,7 +139,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s } float ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); return ave_time; }; @@ -187,13 +190,14 @@ int run_gemm_example(int argc, char* argv[]) if(data_type == "fp8") { using TypeConfig = - decltype(GemmQuantTypeConfig{}); + decltype(GemmQuantTypeConfig{}); return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, 
argv); } else if(data_type == "bf8") { - using TypeConfig = decltype(GemmQuantTypeConfig{}); + using TypeConfig = + decltype(GemmQuantTypeConfig{}); return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); } @@ -201,7 +205,7 @@ int run_gemm_example(int argc, char* argv[]) { using TypeConfig = decltype(GemmQuantTypeConfig{}); return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); @@ -210,29 +214,18 @@ int run_gemm_example(int argc, char* argv[]) { using TypeConfig = decltype(GemmQuantTypeConfig{}); return run_gemm_example_prec_type, TypeConfig, 128>( a_layout, b_layout, argc, argv); } - else if(data_type == "i4f32fp8") - { - using TypeConfig = - decltype(GemmQuantTypeConfig{}); - return run_gemm_example_prec_type, TypeConfig, 128>( - a_layout, b_layout, argc, argv); - } - else if(data_type == "i4f32bf8") - { - using TypeConfig = - decltype(GemmQuantTypeConfig{}); - return run_gemm_example_prec_type, TypeConfig, 128>( - a_layout, b_layout, argc, argv); - } else { throw std::runtime_error("Unsupported data type for this operation !!!"); } } -int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } +int main(int argc, char* argv[]) +{ + return !run_gemm_example(argc, argv); +} diff --git a/example/ck_tile/38_block_scale_gemm/gemm_bquant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_bquant_basic.cpp new file mode 100644 index 0000000000..49e60bf86d --- /dev/null +++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_basic.cpp @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core/config.hpp" +#include "ck_tile/host.hpp" +#include "gemm_utils.hpp" + +template +float gemm_calc_bquant(const ck_tile::BQuantGemmHostArgs& args, const ck_tile::stream_config& s) +{ + constexpr bool kPadM = false; + constexpr bool kPadN = false; + constexpr bool kPadK = false; + + static_assert(std::is_same_v); + + constexpr ck_tile::index_t M_Tile = GemmConfig::M_Tile; + constexpr ck_tile::index_t N_Tile = GemmConfig::N_Tile; + constexpr ck_tile::index_t K_Tile = GemmConfig::K_Tile; + + constexpr ck_tile::index_t M_Warp = GemmConfig::M_Warp; + constexpr ck_tile::index_t N_Warp = GemmConfig::N_Warp; + constexpr ck_tile::index_t K_Warp = GemmConfig::K_Warp; + + constexpr ck_tile::index_t M_Warp_Tile = GemmConfig::M_Warp_Tile; + constexpr ck_tile::index_t N_Warp_Tile = GemmConfig::N_Warp_Tile; + constexpr ck_tile::index_t K_Warp_Tile = GemmConfig::K_Warp_Tile; + + using CodegenGemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::GemmTile1DPartitioner; + + using CodegenGemmTraits = ck_tile::TileGemmBQuantTraits; + + using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase; + + using BaseGemmPipeline = ck_tile::BaseBQuantGemmPipelineAgBgCrCompV3; + + const ck_tile::index_t K_split = (args.K + K_Tile - 1) / K_Tile * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + constexpr bool transposed_warp_gemm = false; + + const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + + using CodegenPipelineProblem = + ck_tile::GemmBQuantPipelineProblem; + using CodegenGemmPipeline = 
ck_tile::BQuantGemmPipelineAgBgCrCompV3; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem, + AccDataType, + CDataType, + ck_tile::tuple<>, + CLayout, + ck_tile::element_wise::PassThrough, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, + N_Warp_Tile, + K_Warp_Tile, + transposed_warp_gemm, + ck_tile::memory_operation_enum::set>>; + using Kernel = + ck_tile::BQuantGemmKernel; + + auto kargs = Kernel::MakeKernelArgs(args); + + const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch); + const dim3 blocks = Kernel::BlockSize(); + + if(args.k_batch != 1) + { + throw std::runtime_error("split-k is not supported yet!"); + } + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << CodegenGemmShape::GetName() << '\n' + << "problem: " << CodegenPipelineProblem::GetName() << '\n' + << "pipeline: " << CodegenGemmPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + + float ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + }; + return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num); + ; +} + +#include "run_gemm_bquant_example.inc" + +template +int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[]) +{ + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + if(a_layout == "R" && b_layout == "C") + { + return run_gemm_example_with_layouts( + argc, argv, Row{}, Col{}, Col{}, Row{}); + } + else + { + 
throw std::runtime_error("Unsupported memory layout for the input matrices!"); + } + } + else + { + throw std::runtime_error("Unsupported data type for B."); + } + + return 0; +} + +template