Merge branch 'develop' into hstu_attention_n0loop_fused_unroll

This commit is contained in:
Qianfeng Zhang
2025-08-18 13:47:44 +00:00
1876 changed files with 192746 additions and 21091 deletions

12
.github/CODEOWNERS vendored
View File

@@ -1,8 +1,8 @@
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @shumway @vidyasagar-amd
# Documentation files
docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz
*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz
*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz
.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz
docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd @ddembeckAMD
# Header directory for Doxygen documentation
library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz
library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @shumway @vidyasagar-amd

128
.github/workflows/therock-ci-linux.yml vendored Normal file
View File

@@ -0,0 +1,128 @@
name: TheRock CI Linux
on:
workflow_call:
inputs:
cmake_options:
type: string
amdgpu_families:
type: string
test_runs_on:
type: string
permissions:
contents: read
jobs:
therock-build-linux:
name: Build Linux Packages
runs-on: azure-linux-scale-rocm
permissions:
id-token: write
container:
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:044b113562629f4bd2ec5d2e64b32eee11562d48fb1a75d7493daec9dd8d8292
env:
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
TEATIME_FORCE_INTERACTIVE: 0
steps:
- name: Checkout composable_kernel repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Checkout TheRock repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: ec1c2ef4f2636bce7733fd8c95e1dbb6692c8a57
path: "TheRock"
- name: Runner Health Settings
run: |
df -h
cmake --version
echo "Installed Python versions:"
ls -d /opt/python
echo "python: $(which python), python3: $(which python3)"
echo "Git version: $(git --version)"
git config --global --add safe.directory $PWD
git config fetch.parallel 10
- name: Fetch sources
run: |
./TheRock/build_tools/fetch_sources.py --jobs 12
- name: Install python deps
run: |
pip install -r TheRock/requirements.txt
pip freeze
- name: Configure Projects
env:
amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
package_version: ADHOCBUILD
extra_cmake_options: ${{ inputs.cmake_options }}
BUILD_DIR: build
run: |
python3 TheRock/build_tools/github_actions/build_configure.py
- name: Build TheRock
run: cmake --build TheRock/build
- name: Build therock-archives
run: cmake --build TheRock/build --target therock-archives
- name: Report
if: ${{ !cancelled() }}
run: |
echo "Full SDK du:"
echo "------------"
du -h -d 1 TheRock/build/dist/rocm
echo "Artifact Archives:"
echo "------------------"
ls -lh TheRock/build/artifacts/*.tar.xz
echo "Artifacts:"
echo "----------"
du -h -d 1 TheRock/build/artifacts
- name: Configure AWS Credentials
if: always()
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Create Logs index Files and upload logs
if: always()
run: |
python3 TheRock/build_tools/github_actions/create_log_index.py \
--build-dir=TheRock/build \
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
python3 TheRock/build_tools/github_actions/upload_build_logs_to_s3.py \
--build-dir=TheRock/build \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
- name: Upload artifacts
run: |
python TheRock/build_tools/github_actions/upload_build_artifacts.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir TheRock/build
- name: Add Links to Job Summary
if: always()
run: |
python TheRock/build_tools/github_actions/upload_build_summary.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir TheRock/build
therock-test-linux:
name: "Test"
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages.yml
with:
project_to_test: "miopen"
amdgpu_families: ${{ inputs.amdgpu_families }}
test_runs_on: ${{ inputs.test_runs_on }}
platform: "linux"

50
.github/workflows/therock-ci.yml vendored Normal file
View File

@@ -0,0 +1,50 @@
name: TheRock CI for composable_kernel
on:
push:
branches:
- develop
workflow_dispatch:
permissions:
contents: read
concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
# queued and in-progress runs for the same PR (presubmit) or commit
# (postsubmit). The workflow name is prepended to avoid conflicts between
# different workflows.
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true
jobs:
therock-ci-linux:
name: TheRock CI Linux
permissions:
contents: read
id-token: write
uses: ./.github/workflows/therock-ci-linux.yml
secrets: inherit
with:
cmake_options: "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON -DTHEROCK_ENABLE_MIOPEN=ON -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_USE_EXTERNAL_CK=ON -DTHEROCK_CK_SOURCE_DIR=../"
amdgpu_families: "gfx94X-dcgpu"
test_runs_on: "linux-mi325-1gpu-ossci-rocm"
therock_ci_summary:
name: TheRock CI Summary
if: always()
needs:
- therock-ci-linux
runs-on: ubuntu-24.04
steps:
- name: Output failed jobs
run: |
echo '${{ toJson(needs) }}'
FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
| jq --raw-output \
'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
)"
if [[ "${FAILED_JOBS}" != "" ]]; then
echo "The following jobs failed: ${FAILED_JOBS}"
exit 1
fi

View File

@@ -0,0 +1,76 @@
name: TheRock Test Packages
on:
workflow_call:
inputs:
project_to_test:
type: string
amdgpu_families:
type: string
test_runs_on:
type: string
platform:
type: string
permissions:
contents: read
jobs:
configure_test_matrix:
name: "Configure test matrix"
runs-on: ubuntu-24.04
if: ${{ inputs.test_runs_on != '' }}
outputs:
components: ${{ steps.configure.outputs.components }}
steps:
- name: "Checking out repository"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
- name: "Configuring CI options"
env:
PLATFORM: ${{ inputs.platform }}
project_to_test: ${{ inputs.project_to_test }}
id: configure
run: python ./build_tools/github_actions/fetch_test_configurations.py
test_components:
name: 'Test ${{ matrix.components.job_name }}'
runs-on: ${{ inputs.test_runs_on }}
needs: configure_test_matrix
# skip tests if no test matrix to run
if: ${{ needs.configure_test_matrix.outputs.components != '[]' }}
strategy:
fail-fast: false
matrix:
components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }}
defaults:
run:
shell: bash
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ github.run_id }}"
OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
THEROCK_BIN_DIR: "./build/bin"
steps:
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }}
PLATFORM: ${{ inputs.platform }}
- name: Test
timeout-minutes: ${{ matrix.components.timeout_minutes }}
run: |
if [ "${{ inputs.PLATFORM }}" == "linux" ]; then source ${VENV_DIR}/bin/activate ; else . ${VENV_DIR}/Scripts/activate ; fi
${{ matrix.components.test_script }}

5
.gitignore vendored
View File

@@ -55,6 +55,8 @@ _static/
_templates/
_toc.yml
_doxygen/
docs/doxygen/html
docs/doxygen/xml
# JetBrains IDE
.idea/
@@ -66,3 +68,6 @@ build*/
# Python cache
__pycache__/
.cache/

26
.pre-commit-config.yaml Executable file → Normal file
View File

@@ -3,7 +3,7 @@ repos:
hooks:
- id: clang-format
name: clang-format
entry: clang-format-12 -i --style=file
entry: clang-format-18 -i --style=file
language: system
types_or: [c++, inc]
- id: copyright-year-checker
@@ -12,3 +12,27 @@ repos:
verbose: false
language: script
types: [c++]
- id: remove-exec-bit
name: Remove executable bit from non-executable files
entry: script/remove_exec_bit.sh
language: script
types_or: [c++, text]
verbose: true
- id: ruff-check
name: Ruff Linter
entry: ruff check --fix
language: python
types: [python]
additional_dependencies: [ruff]
- id: ruff-format
name: Ruff Formatter
entry: ruff format
language: python
types: [python]
additional_dependencies: [ruff]
- id: run-remod-if-ck-tile-changed
name: Run remod.py if ck_tile files changed
entry: script/remod_for_ck_tile.sh
language: script
always_run: true
pass_filenames: false

View File

@@ -2,16 +2,39 @@
Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
## Composable Kernel 1.1.0 for ROCm 6.5.0
## Composable Kernel 1.1.0 for ROCm 7.0.0
### Added
* Added a basic copy kernel example and supporting documentation for new CK Tile developers.
* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
* Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels.
* Added support GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW, number of instances in instance factory for NGCHW/GKYXC/NGKHW has been reduced).
* Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW).
* Added support for GKCYX layout for grouped convolution backward weight (NGCHW/GKCYX/NGKHW).
* Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
* Added support for Stream-K version of mixed fp8/bf16 GEMM
* Added support for Multiple D GEMM
* Added GEMM pipeline for microscaling (MX) FP8/FP6/FP4 data types
* Added support for FP16 2:4 structured sparsity to universal GEMM.
* Added support for Split K for grouped convolution backward data.
* Added logit soft-capping support for fMHA forward kernels.
* Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv)
* Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv/bwd)
* Added benchmarking support for tile engine GEMM.
* Added Ping-pong scheduler support for GEMM operation along the K dimension.
* Added rotating buffer feature for CK_Tile GEMM.
* Added int8 support for CK_TILE GEMM.
* Added support for elementwise kernel.
* Added benchmarking support for tile engine GEMM Multi D.
### Optimized
None
* Optimize the gemm multiply multiply preshuffle & lds bypass with Pack of KGroup and better instruction layout. (#2166)
* Added Vectorize Transpose optimization for CK Tile (#2131)
* Added the asynchronous copy for gfx950 (#2425)
### Fixes
@@ -22,11 +45,18 @@ None
* Removed support for gfx940 and gfx941 targets (#1944)
* Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876)
* DL and DPP kernels are now enabled by default.
* Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced.
* Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced.
* Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced.
### Known issues
None
### Upcoming changes
* Non-grouped convolutions are deprecated. All of their functionality is supported by grouped convolution.
## Composable Kernel 1.1.0 for ROCm 6.1.0
### Additions

View File

@@ -26,17 +26,21 @@ set(version 1.1.0)
project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
include(CTest)
option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
# CK Codegen requires dataclass which is added in Python 3.7
# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
if(NOT CK_USE_ALTERNATIVE_PYTHON)
find_package(Python3 3.8 COMPONENTS Interpreter REQUIRED)
else()
message("Using alternative python version")
message(STATUS "Using alternative python version")
set(EXTRA_PYTHON_PATH)
# this is overly restrictive, we may need to be more flexible on the following
string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}")
message("alternative python path is: ${EXTRA_PYTHON_PATH}")
message(STATUS "alternative python path is: ${EXTRA_PYTHON_PATH}")
find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
add_definitions(-DPython3_EXECUTABLE="${CK_USE_ALTERNATIVE_PYTHON}")
set(Python3_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}")
@@ -76,7 +80,7 @@ if (DTYPES)
add_definitions(-DCK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
message(STATUS "DTYPES macro set to ${DTYPES}")
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8)
set(CK_ENABLE_INT8 "ON")
@@ -94,6 +98,15 @@ add_compile_options(-Wno-pass-failed)
add_compile_options(-Wno-switch-default)
add_compile_options(-Wno-unique-object-duplication)
# add -Og -gdwarf64 for debug builds
add_compile_options(
"$<$<CONFIG:Debug>:-Og>"
"$<$<CONFIG:Debug>:-gdwarf64>"
)
# Recent change in compiler makes this warning ON by default, which led to compile errors.
add_compile_options(-Wno-nrvo)
if(NOT DISABLE_DL_KERNELS)
add_definitions(-DDL_KERNELS)
set(DL_KERNELS "ON")
@@ -139,8 +152,8 @@ rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip "$ENV{ROCM_PATH}" "$ENV{HIP_PATH}")
message("GPU_TARGETS= ${GPU_TARGETS}")
message("GPU_ARCHS= ${GPU_ARCHS}")
message(STATUS "GPU_TARGETS= ${GPU_TARGETS}")
message(STATUS "GPU_ARCHS= ${GPU_ARCHS}")
if(GPU_ARCHS)
#disable GPU_TARGETS to avoid conflicts, this needs to happen before we call hip package
unset(GPU_TARGETS CACHE)
@@ -155,9 +168,9 @@ find_package(hip REQUIRED)
# No assumption that HIP kernels are launched with uniform block size for backward compatibility
# SWDEV-413293 and https://reviews.llvm.org/D155213
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
message("hip_version_flat=${hip_VERSION_FLAT}")
message(STATUS "hip_version_flat=${hip_VERSION_FLAT}")
message("checking which targets are supported")
message(STATUS "checking which targets are supported")
#In order to build just the CK library (without tests and examples) for all supported GPU targets
#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
#the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
@@ -167,8 +180,12 @@ if(NOT ENABLE_ASAN_PACKAGING)
if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
# WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600300000 AND ${hip_VERSION_FLAT} LESS 600400000)
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600400000 AND ${hip_VERSION_FLAT} LESS 600443483)
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950")
elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600443483)
set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx950;gfx10-3-generic;gfx11-generic;gfx12-generic")
endif()
else()
#build CK only for xnack-supported targets when using ASAN
@@ -192,23 +209,28 @@ endif()
rocm_check_target_ids(SUPPORTED_GPU_TARGETS
TARGETS ${CK_GPU_TARGETS})
message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
message(STATUS "Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
message("Enabling XDL instances")
message(STATUS "Enabling XDL instances")
add_definitions(-DCK_USE_XDL)
set(CK_USE_XDL "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95")
message("Enabling FP8 gemms on native architectures")
message(STATUS "Enabling XDL FP8 gemms on native architectures")
add_definitions(-DCK_USE_GFX94)
set(CK_USE_GFX94 "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
message("Enabling WMMA instances")
message(STATUS "Enabling WMMA instances")
add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
message(STATUS "Enabling WMMA FP8 gemms on native architectures")
add_definitions(-DCK_USE_WMMA_FP8)
set(CK_USE_WMMA_FP8 "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx950")
add_definitions(-DCK_USE_OCP_FP8)
set(CK_USE_OCP_FP8 "ON")
@@ -220,6 +242,8 @@ endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx950")
add_definitions(-DCK_USE_NATIVE_MX_SUPPORT)
set(CK_USE_NATIVE_MX_SUPPORT "ON")
add_definitions(-DCK_GFX950_SUPPORT)
set(CK_GFX950_SUPPORT "ON")
endif()
option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
@@ -234,32 +258,32 @@ configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/con
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
check_cxx_compiler_flag("-fno-offload-uniform-block" HAS_NO_OFFLOAD_UNIFORM_BLOCK)
if(HAS_NO_OFFLOAD_UNIFORM_BLOCK)
message("Adding the fno-offload-uniform-block compiler flag")
message(STATUS "Adding the fno-offload-uniform-block compiler flag")
add_compile_options(-fno-offload-uniform-block)
endif()
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
if(HAS_LSR_DROP_SOLUTION)
message("Adding the lsr-drop-solution=1 compiler flag")
message(STATUS "Adding the lsr-drop-solution=1 compiler flag")
add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
endif()
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
if(HAS_ENABLE_POST_MISCHED)
message("Adding the enable-post-misched=0 compiler flag")
message(STATUS "Adding the enable-post-misched=0 compiler flag")
add_compile_options("SHELL: -mllvm -enable-post-misched=0")
endif()
endif()
set(check-coerce)
check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132)
message("Adding the amdgpu-coerce-illegal-types=1")
message(STATUS "Adding the amdgpu-coerce-illegal-types=1")
add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
message("Adding -amdgpu-early-inline-all=true and -amdgpu-function-calls=false")
message(STATUS "Adding -amdgpu-early-inline-all=true and -amdgpu-function-calls=false")
add_compile_options("SHELL: -mllvm -amdgpu-early-inline-all=true")
add_compile_options("SHELL: -mllvm -amdgpu-function-calls=false")
endif()
@@ -292,17 +316,31 @@ endif()
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
option(ENABLE_ASM_DUMP "Whether to enable assembly dump for kernels." OFF)
if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
add_compile_options(-Wno-bit-int-extension)
message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
message(STATUS "CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
endif()
if(USE_OPT_GFX11)
add_compile_options(-mcumode)
add_compile_options(-mno-wavefrontsize64)
message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}")
add_compile_definitions(CK_TILE_WAVE32_ENABLED)
message(STATUS "CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}")
endif()
if(ENABLE_ASM_DUMP)
add_compile_options(--save-temps)
add_compile_options(-Wno-gnu-line-marker)
message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}")
endif()
if(USE_OPT_GFX12 AND (SUPPORTED_GPU_TARGETS MATCHES "gfx12"))
add_compile_options(-mno-wavefrontsize64)
add_compile_definitions(CK_TILE_WAVE32_ENABLED)
message(STATUS "CK compiled with USE_OPT_GFX12 set to ${USE_OPT_GFX12}")
endif()
## Threads
@@ -311,10 +349,10 @@ find_package(Threads REQUIRED)
link_libraries(Threads::Threads)
## C++
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
# https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_macros.html
# _GLIBCXX_ASSERTIONS
@@ -330,7 +368,7 @@ endif()
set(CMAKE_HIP_PLATFORM amd)
set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER})
set(CMAKE_HIP_EXTENSIONS ON)
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
message(STATUS "CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -345,10 +383,10 @@ else()
find_package(OpenMP REQUIRED)
endif()
message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
message(STATUS "OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
message(STATUS "OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message(STATUS "OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message(STATUS "OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
@@ -380,146 +418,152 @@ else()
add_compile_definitions(__HIP_PLATFORM_HCC__=1)
endif()
## tidy
include(EnableCompilerWarnings)
## tidy
set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
# Enable tidy on hip
elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
set(CK_TIDY_ERRORS ALL)
set(CK_TIDY_ERRORS ALL)
endif()
include(ClangTidy)
enable_clang_tidy(
CHECKS
*
-abseil-*
-android-cloexec-fopen
# Yea we shouldn't be using rand()
-cert-msc30-c
-bugprone-exception-escape
-bugprone-macro-parentheses
-cert-env33-c
-cert-msc32-c
-cert-msc50-cpp
-cert-msc51-cpp
-cert-dcl37-c
-cert-dcl51-cpp
-clang-analyzer-alpha.core.CastToStruct
-clang-analyzer-optin.performance.Padding
-clang-diagnostic-deprecated-declarations
-clang-diagnostic-extern-c-compat
-clang-diagnostic-unused-command-line-argument
-cppcoreguidelines-avoid-c-arrays
-cppcoreguidelines-avoid-magic-numbers
-cppcoreguidelines-explicit-virtual-functions
-cppcoreguidelines-init-variables
-cppcoreguidelines-macro-usage
-cppcoreguidelines-non-private-member-variables-in-classes
-cppcoreguidelines-pro-bounds-array-to-pointer-decay
-cppcoreguidelines-pro-bounds-constant-array-index
-cppcoreguidelines-pro-bounds-pointer-arithmetic
-cppcoreguidelines-pro-type-member-init
-cppcoreguidelines-pro-type-reinterpret-cast
-cppcoreguidelines-pro-type-union-access
-cppcoreguidelines-pro-type-vararg
-cppcoreguidelines-special-member-functions
-fuchsia-*
-google-explicit-constructor
-google-readability-braces-around-statements
-google-readability-todo
-google-runtime-int
-google-runtime-references
-hicpp-vararg
-hicpp-braces-around-statements
-hicpp-explicit-conversions
-hicpp-named-parameter
-hicpp-no-array-decay
# We really shouldn't use bitwise operators with signed integers, but
# opencl leaves us no choice
-hicpp-avoid-c-arrays
-hicpp-signed-bitwise
-hicpp-special-member-functions
-hicpp-uppercase-literal-suffix
-hicpp-use-auto
-hicpp-use-equals-default
-hicpp-use-override
-llvm-header-guard
-llvm-include-order
#-llvmlibc-*
-llvmlibc-restrict-system-libc-headers
-llvmlibc-callee-namespace
-llvmlibc-implementation-in-namespace
-llvm-else-after-return
-llvm-qualified-auto
-misc-misplaced-const
-misc-non-private-member-variables-in-classes
-misc-no-recursion
-modernize-avoid-bind
-modernize-avoid-c-arrays
-modernize-pass-by-value
-modernize-use-auto
-modernize-use-default-member-init
-modernize-use-equals-default
-modernize-use-trailing-return-type
-modernize-use-transparent-functors
-performance-unnecessary-value-param
-readability-braces-around-statements
-readability-else-after-return
# we are not ready to use it, but very useful
-readability-function-cognitive-complexity
-readability-isolate-declaration
-readability-magic-numbers
-readability-named-parameter
-readability-uppercase-literal-suffix
-readability-convert-member-functions-to-static
-readability-qualified-auto
-readability-redundant-string-init
# too many narrowing conversions in our code
-bugprone-narrowing-conversions
-cppcoreguidelines-narrowing-conversions
-altera-struct-pack-align
-cppcoreguidelines-prefer-member-initializer
${CK_TIDY_CHECKS}
${CK_TIDY_ERRORS}
HEADER_FILTER
"\.hpp$"
EXTRA_ARGS
-DCK_USE_CLANG_TIDY
)
if(ENABLE_CLANG_CPP_CHECKS)
include(ClangTidy)
enable_clang_tidy(
CHECKS
*
-abseil-*
-android-cloexec-fopen
# Yea we shouldn't be using rand()
-cert-msc30-c
-bugprone-exception-escape
-bugprone-macro-parentheses
-cert-env33-c
-cert-msc32-c
-cert-msc50-cpp
-cert-msc51-cpp
-cert-dcl37-c
-cert-dcl51-cpp
-clang-analyzer-alpha.core.CastToStruct
-clang-analyzer-optin.performance.Padding
-clang-diagnostic-deprecated-declarations
-clang-diagnostic-extern-c-compat
-clang-diagnostic-unused-command-line-argument
-cppcoreguidelines-avoid-c-arrays
-cppcoreguidelines-avoid-magic-numbers
-cppcoreguidelines-explicit-virtual-functions
-cppcoreguidelines-init-variables
-cppcoreguidelines-macro-usage
-cppcoreguidelines-non-private-member-variables-in-classes
-cppcoreguidelines-pro-bounds-array-to-pointer-decay
-cppcoreguidelines-pro-bounds-constant-array-index
-cppcoreguidelines-pro-bounds-pointer-arithmetic
-cppcoreguidelines-pro-type-member-init
-cppcoreguidelines-pro-type-reinterpret-cast
-cppcoreguidelines-pro-type-union-access
-cppcoreguidelines-pro-type-vararg
-cppcoreguidelines-special-member-functions
-fuchsia-*
-google-explicit-constructor
-google-readability-braces-around-statements
-google-readability-todo
-google-runtime-int
-google-runtime-references
-hicpp-vararg
-hicpp-braces-around-statements
-hicpp-explicit-conversions
-hicpp-named-parameter
-hicpp-no-array-decay
# We really shouldn't use bitwise operators with signed integers, but
# opencl leaves us no choice
-hicpp-avoid-c-arrays
-hicpp-signed-bitwise
-hicpp-special-member-functions
-hicpp-uppercase-literal-suffix
-hicpp-use-auto
-hicpp-use-equals-default
-hicpp-use-override
-llvm-header-guard
-llvm-include-order
#-llvmlibc-*
-llvmlibc-restrict-system-libc-headers
-llvmlibc-callee-namespace
-llvmlibc-implementation-in-namespace
-llvm-else-after-return
-llvm-qualified-auto
-misc-misplaced-const
-misc-non-private-member-variables-in-classes
-misc-no-recursion
-modernize-avoid-bind
-modernize-avoid-c-arrays
-modernize-pass-by-value
-modernize-use-auto
-modernize-use-default-member-init
-modernize-use-equals-default
-modernize-use-trailing-return-type
-modernize-use-transparent-functors
-performance-unnecessary-value-param
-readability-braces-around-statements
-readability-else-after-return
# we are not ready to use it, but very useful
-readability-function-cognitive-complexity
-readability-isolate-declaration
-readability-magic-numbers
-readability-named-parameter
-readability-uppercase-literal-suffix
-readability-convert-member-functions-to-static
-readability-qualified-auto
-readability-redundant-string-init
# too many narrowing conversions in our code
-bugprone-narrowing-conversions
-cppcoreguidelines-narrowing-conversions
-altera-struct-pack-align
-cppcoreguidelines-prefer-member-initializer
${CK_TIDY_CHECKS}
${CK_TIDY_ERRORS}
HEADER_FILTER
"\.hpp$"
EXTRA_ARGS
-DCK_USE_CLANG_TIDY
)
include(CppCheck)
enable_cppcheck(
CHECKS
warning
style
performance
portability
SUPPRESS
ConfigurationNotChecked
constStatement
duplicateCondition
noExplicitConstructor
passedByValue
preprocessorErrorDirective
shadowVariable
unusedFunction
unusedPrivateFunction
unusedStructMember
unmatchedSuppression
FORCE
SOURCES
library/src
INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/library/include
DEFINE
CPPCHECK=1
__linux__=1
)
include(CppCheck)
enable_cppcheck(
CHECKS
warning
style
performance
portability
SUPPRESS
ConfigurationNotChecked
constStatement
duplicateCondition
noExplicitConstructor
passedByValue
preprocessorErrorDirective
shadowVariable
unusedFunction
unusedPrivateFunction
unusedStructMember
unmatchedSuppression
FORCE
SOURCES
library/src
INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/library/include
DEFINE
CPPCHECK=1
__linux__=1
)
else()
function(clang_tidy_check TARGET)
# stub out empty function if clang tidy is not enabled
endfunction()
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
@@ -538,7 +582,7 @@ if(BUILD_DEV)
add_compile_options(-Werror)
add_compile_options(-Weverything)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
add_compile_options(-fcolor-diagnostics)
@@ -547,12 +591,15 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERS
add_compile_options(-fdiagnostics-color=always)
endif()
# make check runs the entire set of examples and tests
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
# make smoke runs the tests and examples that runs within 30 seconds on gfx90a
add_custom_target(smoke COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR} -L "SMOKE_TEST")
# make regression runs the tests and examples that runs for more 30 seconds on gfx90a
add_custom_target(regression COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR} -L "REGRESSION_TEST")
if(NOT MIOPEN_REQ_LIBS_ONLY)
# make check runs the entire set of examples and tests
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
# make smoke runs the tests and examples that runs within 30 seconds on gfx90a
add_custom_target(smoke COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR} -L "SMOKE_TEST")
# make regression runs the tests and examples that runs for more 30 seconds on gfx90a
add_custom_target(regression COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR} -L "REGRESSION_TEST")
endif()
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
@@ -595,9 +642,14 @@ ENDIF()
ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
option(DISABLE_OFFLOAD_COMPRESS "Disable offload compress compiler flag when building instances" OFF)
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
add_subdirectory(library)
if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
@@ -608,16 +660,19 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
PACKAGE_NAME examples
)
add_subdirectory(example)
add_subdirectory(tile_engine)
if(BUILD_TESTING)
add_subdirectory(test)
endif()
endif()
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler
)
add_subdirectory(profiler)
if (NOT MIOPEN_REQ_LIBS_ONLY)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckprofiler
)
add_subdirectory(profiler)
endif()
if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
add_subdirectory(codegen)

View File

@@ -20,10 +20,11 @@ Tejash Shah, 2019-2020
Xiaoyan Zhou, 2020
[Jianfeng Yan](https://github.com/j4yan), 2021-2022
[Jun Liu](https://github.com/junliume), 2021-2024
## Product Manager
[Jun Liu](https://github.com/junliume)
[John Afaganis](https://github.com/afagaj)
## Contributors

View File

@@ -1,6 +1,6 @@
FROM ubuntu:22.04
FROM ubuntu:24.04
ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.3
ARG ROCMVERSION=6.4.1
ARG compiler_version=""
ARG compiler_commit=""
ARG CK_SCCACHE=""
@@ -9,19 +9,18 @@ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
# Add rocm repository
RUN set -xe && \
useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins && \
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN if [ "$ROCMVERSION" != "6.4" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.3.60300-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.3.60300-1_all.deb && \
RUN if [ "$ROCMVERSION" != "6.5" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" && \
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
amdgpu-install -y --usecase=rocm --no-dkms
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
@@ -44,17 +43,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
iputils-ping \
jq \
libelf-dev \
libncurses5-dev \
libnuma-dev \
libpthread-stubs0-dev \
llvm-amdgpu \
mpich \
net-tools \
pkg-config \
python \
python3 \
python3-dev \
python3-pip \
python3-full \
redis \
rocm-llvm-dev \
sshpass \
@@ -67,6 +62,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libzstd-dev \
openssh-server \
clang-format-12 \
clang-format-18 \
kmod && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
@@ -74,17 +70,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
# Remove unnecessary rocm components that take a lot of space
apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
# Update the cmake to version 3.27.5
RUN pip install --upgrade cmake==3.27.5 && \
#Install latest ccache
git clone https://github.com/ccache/ccache.git && \
RUN git clone https://github.com/ccache/ccache.git && \
cd ccache && mkdir build && cd build && cmake .. && make install && \
#Install ninja build tracing tools
cd / && \
wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
gunzip /usr/local/bin/ninja.gz && \
chmod a+x /usr/local/bin/ninja && \
git clone https://github.com/nico/ninjatracing.git && \
#Install ClangBuildAnalyzer
git clone https://github.com/aras-p/ClangBuildAnalyzer.git && \
cd ClangBuildAnalyzer/ && \
@@ -98,8 +91,7 @@ RUN pip install --upgrade cmake==3.27.5 && \
wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
# Install packages for processing the performance results
pip3 install --upgrade pip && \
pip3 install --upgrade pytest sqlalchemy==2.0.36 pymysql pandas==2.2.3 setuptools-rust setuptools>=75 sshtunnel==0.4.0 && \
pip3 install --break-system-packages --upgrade pytest pymysql pandas==2.2.3 sqlalchemy==2.0.3 setuptools-rust setuptools sshtunnel==0.4.0 && \
# Add render group
groupadd -f render && \
# Install the new rocm-cmake version

21
Dockerfile.aiter Normal file
View File

@@ -0,0 +1,21 @@
ARG BASE_DOCKER="rocm/pytorch:latest"
FROM $BASE_DOCKER
ARG AITER_BRANCH="main"
ARG CK_AITER_BRANCH="develop"
RUN groupadd -g 109 render && \
usermod -u 1001 jenkins && \
groupmod -g 1001 jenkins && \
pip install pandas zmq einops && \
pip install numpy==1.26.2 && \
sudo mkdir /home/jenkins && \
sudo mkdir /home/jenkins/workspace && \
cd /home/jenkins/workspace && \
rm -rf aiter && \
git clone -b "$AITER_BRANCH" --recursive https://github.com/ROCm/aiter.git && \
cd aiter && \
rm -rf 3rdparty/composable_kernel/ && \
git clone -b "$CK_AITER_BRANCH" https://github.com/ROCm/composable_kernel.git 3rdparty/composable_kernel/ && \
python3 setup.py develop && \
chown -R jenkins:jenkins /home/jenkins/workspace && \
chmod -R a+rwx /home/jenkins/workspace && \
sudo usermod -aG irc jenkins

View File

@@ -1,4 +1,4 @@
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub22.04_rocm6.3"
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
FROM $BASE_DOCKER
ARG compiler_version=""
ARG compiler_commit=""

677
Jenkinsfile vendored

File diff suppressed because it is too large Load Diff

View File

@@ -96,7 +96,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
4. Build the entire CK library:
```bash
make -j
make -j"$(nproc)"
```
5. Install CK:
@@ -104,6 +104,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
```bash
make -j install
```
**[See Note on -j](#notes)**
## Optional post-install steps
@@ -146,7 +147,8 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
Note the `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
### Notes
The `-j` option for building with multiple threads in parallel, which speeds up the build significantly.
However, `-j` launches unlimited number of threads, which can cause the build to run out of memory and
crash. On average, you should expect each thread to use ~2Gb of RAM.
Depending on the number of CPU cores and the amount of RAM on your system, you may want to

View File

@@ -1,2 +1,348 @@
[Back to the main page](./README.md)
# Composable Kernel terminology
# Composable Kernel Terminology
This document provides a technical reference for terminology used in the Composable Kernel library, organized by conceptual progression from hardware to machine learning operations.
---
## Glossary Index (Alphabetical)
- [Add+Multiply](#addmultiply)
- [Bank Conflict](#bank-conflict)
- [Batched GEMM](#batched-gemm)
- [Benchmark](#benchmark)
- [Block Size](#block-size)
- [Block Tile](#block-tile)
- [Compute Unit (CU)](#compute-unit-cu)
- [Coordinate Transformation Primitives](#coordinate-transformation-primitives)
- [CUDA](#cuda)
- [Dense Tensor](#dense-tensor)
- [Descriptor](#descriptor)
- [Device](#device)
- [Elementwise](#elementwise)
- [Epilogue](#epilogue)
- [Fast Changing Dimension](#fast-changing-dimension)
- [GEMM](#gemm-general-matrix-multiply)
- [GEMV](#gemv)
- [Grouped GEMM](#grouped-gemm)
- [Global Memory](#global-memory)
- [Grid](#grid)
- [Host](#host)
- [HIP](#hip)
- [Inner Dimension](#inner-dimension)
- [Inner Product](#inner-product)
- [Input/Problem Shape](#inputproblem-shape)
- [Kernel](#kernel)
- [Launch Parameters](#launch-parameters)
- [Load Tile](#load-tile)
- [LDS Banks](#lds-banks)
- [Matrix Core](#matrix-core)
- [MFMA (Matrix Fused Multiply-Add)](#mfma-matrix-fused-multiply-add)
- [Occupancy](#occupancy)
- [Outer Dimension](#outer-dimension)
- [Outer Product](#outer-product)
- [Pinned Memory](#pinned-memory)
- [Pipeline](#pipeline)
- [Policy](#policy)
- [Problem](#problem)
- [Processing Units](#processing-units)
- [Reference Kernel](#reference-kernel)
- [Regression Test](#regression-test)
- [ROCm](#rocm)
- [Scalar General Purpose Register (SGPR)](#scalar-general-purpose-register-sgpr)
- [Shared Memory / LDS (Local Data Share)](#shared-memory--lds-local-data-share)
- [SIMT / SIMD](#simt--simd)
- [Smoke Test](#smoke-test)
- [Sparse Tensor](#sparse-tensor)
- [Split-K GEMM](#split-k-gemm)
- [Store Tile](#store-tile)
- [Thread / Work-item](#thread--work-item)
- [Thread Block / Work Group](#thread-block--work-group)
- [Vanilla GEMM](#vanilla-gemm)
- [Tile](#tile)
- [Tile Distribution](#tile-distribution)
- [Tile Partitioner](#tile-partitioner)
- [Tile Programming API](#tile-programming-api)
- [Tile Window](#tile-window)
- [User Customized Tile Pipeline](#user-customized-tile-pipeline)
- [User Customized Tile Pipeline Optimization](#user-customized-tile-pipeline-optimization)
- [Vector](#vector)
- [Vector General Purpose Register (VGPR)](#vector-general-purpose-register-vgpr)
- [Warp / Wavefront](#warp--wavefront)
- [Wave Tile](#wave-tile)
- [XDL Instructions](#xdl-instructions)
---
## 1. Hardware and Memory
### Processing Units
The GPU is composed of multiple hardware units ([compute units (CUs)](#compute-unit-cu) on AMD, [streaming multiprocessors (SMs)](#compute-unit-cu) on NVIDIA), each containing many cores that run threads in parallel. These units manage shared resources and coordinate execution at scale.
### Matrix Core
Specialized GPU units that accelerate matrix operations for AI and deep learning tasks. Modern GPUs contain multiple matrix cores.
### Compute Unit (CU)
AMD's parallel vector processor in a GPU with multiple ALUs. Each compute unit will run all the waves in a workgroup. _This is equivalent to NVIDIA's streaming multiprocessor (SM)_.
### Matrix Fused Multiply-Add (MFMA)
AMD's matrix core instruction for efficient GEMM operations. CK optimizes kernel designs to maximize MFMA utilization and performance.
### Registers
The fastest memory tier, registers are private to each thread/work-item and used for storing temporary variables during computation. AMD distinguishes between [vector (VGPR)](#vector-general-purpose-register-vgpr) and [scalar (SGPR)](#scalar-general-purpose-register-sgpr) registers, while NVIDIA uses a unified register file.
### Vector General Purpose Register (VGPR)
Per-thread registers that store individual thread data within a wave. Each thread has its own set of VGPRs for private variables and calculations.
### Scalar General Purpose Register (SGPR)
Wave-level registers shared by all threads in a wave. Used for constants, addresses, and control flow common across the entire wave.
### Shared Memory / Local Data Share (LDS)
AMD's high-bandwidth, low-latency on-chip memory accessible to all threads within a work group. This is equivalent to NVIDIA's shared memory. It enables fast data sharing and synchronization, but is limited in capacity and must be managed to avoid [bank conflicts](#bank-conflict).
### LDS Banks
Memory organization where consecutive addresses are distributed across multiple memory banks for parallel access. Prevents memory access conflicts ([bank conflicts](#bank-conflict)) and improves bandwidth.
### Global Memory
The main device memory accessible by all threads, offering high capacity but higher latency than shared memory.
### Pinned Memory
Host memory that is page-locked to accelerate transfers between CPU and GPU, reducing overhead for large data movements.
### Dense Tensor
A tensor in which most elements are nonzero, typically stored in a contiguous block of memory.
### Sparse Tensor
A tensor in which most elements are zero, allowing for memory and computation optimizations by storing only nonzero values and their indices.
### Host
CPU and main memory system that manages GPU execution. Launches kernels, transfers data, and coordinates overall computation.
### Device
GPU hardware that executes parallel kernels. Contains compute units, memory hierarchy, and specialized accelerators.
---
## 2. GPU Programming Model
### Thread / Work-item
AMD's work-item is the smallest unit of parallel execution, each running an independent instruction stream on a single data element. This is equivalent to NVIDIA's thread. Work-items/threads are grouped into [wavefronts (AMD)](#warp--wavefront) and [warps (NVIDIA)](#warp--wavefront) for efficient scheduling and resource sharing.
### Warp / Wavefront
AMD's wavefront is a group of threads that run instructions in lockstep, forming the SIMD group. This is equivalent to NVIDIA's warp.
### Thread Block / Work Group
AMD's work group is a collection of threads/work-items that can synchronize and share memory. This is equivalent to NVIDIA's thread block. Work groups/thread blocks are scheduled independently and mapped to hardware units for execution.
### Grid
The complete collection of all work groups (thread blocks) that execute a kernel. A grid spans the entire computational domain and is organized in 1D, 2D, or 3D dimensions. Each work group within the grid operates independently and can be scheduled on different compute units, enabling massive parallel execution across the entire GPU.
### Block Size
Number of work-items/threads in a compute unit (CU). Determines work group size and memory usage.
### Single-Instruction, Multi-Thread (SIMT) / Single-Instruction, Multi-Data (SIMD)
SIMT (Single-Instruction, Multi-Thread) allows threads in a warp to diverge, while SIMD (Single-Instruction, Multi-Data) enforces strict lockstep execution within wavefronts. These models define how parallelism is expressed and managed on different architectures.
### Occupancy
The ratio of active warps/wavefronts to the maximum number of warps/wavefronts supported by a hardware unit. Affects the ability to hide memory latency and maximize throughput.
---
## 3. Kernel Structure
### Kernel
A function executed on the GPU, typically written in [HIP](#hip) or [CUDA](#cuda), that performs parallel computations over input data. Kernels are launched with specific grid and block dimensions to map computation to hardware. In CK, kernels are composed from pipelines and require a pipeline, tile partitioner, and epilogue component.
### Pipeline
A CK Pipeline orchestrates the sequence of operations for a kernel, including data loading, computation, and storage phases. It consists of two core components: a [Problem](#problem) component that defines what to compute, and a [Policy](#policy) component that specifies how to move data around.
### Tile Partitioner
Defines the mapping between problem dimensions (M, N, K) and GPU hierarchy. It specifies workgroup-level tile sizes (kM, kN, kK) and determines grid dimensions by dividing the problem size by tile sizes.
### Problem
Defines what to compute - input/output shapes, data types, and mathematical operations (e.g., GEMM, convolution).
### Policy
Defines memory access patterns and hardware-specific optimizations.
### User Customized Tile Pipeline
User-defined pipeline that combines custom problem and policy components for specialized computations. CK also provides prebuilt pipelines and policies for common operations that can be used as starting points.
### User Customized Tile Pipeline Optimization
Process of tuning tile sizes, memory access patterns, and hardware utilization for specific workloads. CK also provides prebuilt pipelines and policies for common operations that can be used as starting points.
### Tile Programming API
CK's high-level interface for defining tile-based computations with predefined hardware mapping for data load/store.
### Coordinate Transformation Primitives
CK utilities for converting between different coordinate systems (logical, physical, memory layouts).
### Reference Kernel
A baseline kernel implementation used to verify correctness and performance. CK has two reference kernel implementations: one for CPU and one for GPU.
### Launch Parameters
Configuration values (e.g., grid size, block size) that determine how a kernel is mapped to hardware resources. Proper tuning of these parameters is essential for optimal performance.
---
## 4. Memory Access and Data Layout
### Memory Coalescing
An optimization where consecutive threads access consecutive memory addresses, allowing a single memory transaction to serve multiple threads. Proper coalescing is vital for achieving peak memory bandwidth.
### Alignment
A memory management startegy for efficient memory access where data structures are stored at addresses that are multiples of a specific value.
### Bank Conflict
Occurs when multiple threads in a warp/wavefront access different addresses mapping to the same shared memory bank, causing serialization and reduced bandwidth.
### Padding
The addition of extra elements (often zeros) to tensor edges. This is used to control output size in convolution and pooling, or to align data for efficient memory access.
### Permute/Transpose
Operations that rearrange the order of tensor axes, often required to match kernel input formats or optimize memory access patterns.
### Host-Device Transfer
The process of moving data between CPU (host) and GPU (device) memory. Host-device transfers can be a performance bottleneck and are optimized using pinned memory and asynchronous operations.
### Stride
The step size to move from one element to the next in a particular dimension of a tensor or matrix. In convolution and pooling, stride determines how far the kernel moves at each step.
### Dilation
The spacing between kernel elements in convolution operations, allowing the receptive field to grow without increasing kernel size.
### Im2Col/Col2Im
Data transformation techniques that convert image data to column format (im2col) for efficient convolution and back (col2im) to reconstruct the original layout.
### Fast Changing Dimension
Innermost dimension that changes fastest in memory layout.
### Outer Dimension
Slower-changing dimension in memory layout.
### Inner Dimension
Faster-changing dimension in memory layout.
---
## 5. Tile-Based Computing and Data Structures
### Tile
A sub-region of a tensor or matrix processed by a block or thread. Tiles are used to improve memory locality and enable blocking strategies in kernels. Rectangular data blocks are the unit of computation and memory transfer in CK and the basis for tiled algorithms.
### Block Tile
Memory tile processed by a work group (thread block).
### Wave Tile
Sub-tile processed by a single wave within a work group. Represents the granularity of SIMD execution.
### Tile Distribution
Hierarchical data mapping from work-items to data in memory.
### Tile Window
Viewport into a larger tensor that defines the current tile's position and boundaries for computation.
### Load Tile
Operation that transfers data from global memory/LDS to per-thread registers using optimized memory access patterns.
### Store Tile
Operation that transfers data from per-thread registers to LDS/global memory using optimized memory access patterns.
### Descriptor
Metadata structure that defines tile properties, memory layouts, and coordinate transformations for CK operations.
### Input/Problem Shape
Dimensions and data types of input tensors that define the computational problem (e.g., M×K, K×N for GEMM).
### Vector
Smallest data unit processed by individual threads. Typically 4-16 elements depending on data type and hardware.
---
## 6. Kernel Operations and Optimization
### Elementwise
Operations applied independently to each tensor element, such as addition or multiplication. These are highly parallelizable and benefit from efficient memory access.
### Epilogue
The final stage of a kernel or operation, often applying activation functions, bias, or other post-processing steps. Epilogues are critical for integrating kernel outputs into larger computation graphs.
### Add+Multiply
A common fused operation in ML and linear algebra, where an elementwise addition is immediately followed by multiplication, often used for bias and scaling in neural network layers.
---
## 7. Linear Algebra and ML Operations
### General Matrix Multiply (GEMM)
Core matrix operation in linear algebra and deep learning. A GEMM is defined as C = αAB + βC for matrices A, B, and C.
### "Vanilla" GEMM (Naive GEMM) Kernel
The **vanilla GEMM** is the simplest form of GEMM in CK. It:
- Takes input matrices **A** and **B**
- Multiplies them to produce output matrix **C**
This is the **baseline** or **building block** GEMM that all other complex versions expand upon.
### Grouped GEMM (GGEMMs)
A kernel which calls multiple VGEMMs. Each call can have a different input shape. Each input shape problem first finds its corresponding kernel and then data is mapped to the work-group (blocks) of that kernel.
### Batched GEMM
A kernel which calls VGEMMs with different "batches" of data. All batches have the same input shape.
### Split-K GEMM
A parallelization strategy that partitions the reduction dimension (K) across multiple compute units, increasing parallelism for large matrix multiplications.
### GEMV
The operation of multiplying a matrix by a vector, producing another vector. GEMV (General Matrix Vector Multiplication) is a core linear algebra primitive, widely used in neural networks and scientific computing.
### Inner Product
Also known as the dot product, it computes the sum of elementwise products of two vectors, yielding a scalar.
### Outer Product
The result of multiplying a column vector by a row vector, producing a matrix. Outer products are used in rank-1 updates and some ML algorithms.
### Norm
A function that measures the magnitude of a vector or matrix, such as L2 (Euclidean) or L1 norm. Norms are used in regularization, normalization, and optimization.
---
## 8. Testing, Build, and Infrastructure
### Regression Test
Tests that are part of CK's ctest suite and explicitly take more than 30s to finish on gfx942.
### Smoke Test
Tests that are part of CK's ctest suite and take less than or equal to 30 seconds to finish on gfx942.
---
## 9. Low-Level Instructions and Optimizations
### eXtensible Data Language (XDL) Instructions
eXtensible Data Language (XDL) instructions are a set of specialized, low-level instructions used to optimize data movement, memory access, and layout in high-performance computing, GPU programming, and deep learning tasks.
---
## 10. Miscellaneous
### HIP
AMD's Heterogeneous-Computing Interface for Portability, a C++ runtime API and programming language that enables developers to create portable applications for AMD and NVIDIA GPUs. HIP provides a familiar CUDA-like programming model while maintaining compatibility across different GPU architectures.
### CUDA
NVIDIA's Compute Unified Device Architecture, a parallel computing platform and programming model for NVIDIA GPUs. CUDA provides a C++ extension for writing GPU kernels and managing GPU resources.
### ROCm
AMD's Radeon Open Compute platform, an open-source software stack for GPU computing that includes [HIP](#hip), libraries, and tools for high-performance computing and machine learning workloads on AMD GPUs.
---
## Scientific Context and References
This terminology is grounded in parallel computing theory, numerical linear algebra, and computer architecture. For further reading, see:
- [Building Efficient GEMM Kernels with CK Tile](https://rocm.blogs.amd.com/software-tools-optimization/building-efficient-gemm-kernels-with-ck-tile-vendo/README.html)
- [CK Tile Flash](https://rocm.blogs.amd.com/software-tools-optimization/ck-tile-flash/README.html)
This document assumes familiarity with parallel computing, linear algebra, and computer architecture principles.

View File

@@ -30,14 +30,14 @@ List of the device operations for grouped convolution forward in CK:
Table of supported cases by instance factory with XDL instruction:
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|-------|---|---|---|
|bf16 |2D, 3D|2D|1D, 2D, 3D|
|fp16 |2D, 3D|2D|1D, 2D, 3D|
|fp32 |2D, 3D|2D|1D, 2D, 3D|
|int8 |2D, 3D|2D|1D, 3D|
|fp8 |3D|&cross;|&cross;|
|bf8 |3D|&cross;|&cross;|
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK|
|-------|---|---|---|---|
|bf16 |2D, 3D|2D|2D|1D, 2D, 3D|
|fp16 |2D, 3D|2D|2D|1D, 2D, 3D|
|fp32 |2D, 3D|2D|2D|1D, 2D, 3D|
|int8 |2D, 3D|2D|2D|1D, 3D|
|fp8 |3D|&cross;|&cross;|&cross;|
|bf8 |3D|&cross;|&cross;|&cross;|
Table of supported cases by instance factory with WMMA instruction:

View File

@@ -107,14 +107,14 @@ int execute_conv_fwd()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
out_lengths,
out_strides,
filter_strides,

View File

@@ -130,14 +130,14 @@ int main()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
in_lengths,
in_strides,
filter_strides,

View File

@@ -105,14 +105,14 @@ int main()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
in_lengths,
in_strides,
filter_strides,

View File

@@ -109,14 +109,14 @@ int main()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
in_lengths,
in_strides,
filter_strides,

View File

@@ -111,14 +111,14 @@ int main()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
in_lengths,
in_strides,
filter_strides,

View File

@@ -34,12 +34,12 @@ List of the device operations for grouped convolution backward weight in CK:
Table of supported cases by instance factory with XDL instruction:
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|GNHWC/GKYXC/GNHWK|
|-------|---|---|---|
|bf16|2D, 3D|2D, 3D|&cross;|
|bf16(fp32 for weight)|2D, 3D|&cross;|1D, 2D, 3D|
|fp16 |2D, 3D|2D, 3D|1D, 2D, 3D|
|fp32 |2D, 3D|2D, 3D|1D, 2D, 3D|
| |NHWGC/GKYXC/NHWGK|NGCHW/GKYXC/NGKHW|NGCHW/GKCYX/NGKHW|GNHWC/GKYXC/GNHWK|
|-------|---|---|---|---|
|bf16|2D, 3D|2D, 3D|2D, 3D|&cross;|
|bf16(fp32 for weight)|2D, 3D|&cross;|&cross;|1D, 2D, 3D|
|fp16 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D|
|fp32 |2D, 3D|2D, 3D|2D, 3D|1D, 2D, 3D|
Table of supported cases by instance factory with WMMA instruction:

View File

@@ -59,7 +59,7 @@ int main()
SimpleDeviceMem y_dev_buf(sizeof(YDataType) * mn_size);
std::array<const void*, 2> ab_input = {a_dev_buf.GetDeviceBuffer(),
b_dev_buf.GetDeviceBuffer()};
b_dev_buf.GetDeviceBuffer()};
std::vector<ck::index_t> abStride = {Stride, 1};
std::array<std::vector<ck::index_t>, 2> abStrides = {abStride, abStride};

View File

@@ -68,15 +68,15 @@ int main(int argc, char* argv[])
SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements);
using DeviceOp = ck::tensor_operation::device::DeviceReduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceAdd,
PassThrough,
UnaryDivide,
PropagateNan,
OutputIndex>;
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceAdd,
PassThrough,
UnaryDivide,
PropagateNan,
OutputIndex>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();

View File

@@ -117,14 +117,14 @@ int execute_conv_bwd_data_bilinear()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{in.GetDeviceBuffer()},
{in.GetDeviceBuffer()},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{in_lengths},
{in_strides},
{in_lengths},
{in_strides},
in_lengths,
in_strides,
filter_strides,

View File

@@ -116,14 +116,14 @@ int execute_conv_bwd_data_scale()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
in_lengths,
in_strides,
filter_strides,

View File

@@ -121,14 +121,14 @@ int execute_conv_fwd_bilinear()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{out.GetDeviceBuffer()},
{out.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths},
{out_strides},
{out_lengths},
{out_strides},
out_lengths,
out_strides,
filter_strides,

View File

@@ -222,13 +222,13 @@ bool run_grouped_conv_fwd_convscale_reduce(
ck::tensor_operation::element_wise::Scale{scale_wei},
{}};
auto conv_ok = ConvolutionScale<InDataType,
WeiDataType,
ConvOutDataType,
ConvElementOp,
InLayout,
WeiLayout,
OutLayout,
NumDimSpatial>(in,
WeiDataType,
ConvOutDataType,
ConvElementOp,
InLayout,
WeiLayout,
OutLayout,
NumDimSpatial>(in,
wei,
conv_out,
elementwise_op,
@@ -717,15 +717,15 @@ bool TensorFullReduction(SimpleDeviceMem& tensor,
{
std::cout << "\nReduction of spatial dimensions:" << std::endl;
using DeviceOp = ck::tensor_operation::device::DeviceReduce<OutDataType,
OutDataType,
OutDataType,
NumDimSpatial,
NumDimSpatial,
ReduceOperation,
PassThrough,
AccElementwiseOperation,
true, // PropagateNan
false>; // OutputIndex
OutDataType,
OutDataType,
NumDimSpatial,
NumDimSpatial,
ReduceOperation,
PassThrough,
AccElementwiseOperation,
true, // PropagateNan
false>; // OutputIndex
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();

View File

@@ -120,14 +120,14 @@ int execute_conv_fwd_scale()
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{},
{},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
out_lengths,
out_strides,
filter_strides,

View File

@@ -129,8 +129,8 @@ int execute_conv_fwd_scaleadd_ab()
in_strides,
wei_lengths,
wei_strides,
{},
{},
{},
{},
out_lengths,
out_strides,
filter_strides,

View File

@@ -132,9 +132,9 @@ void PerformImageToColumnPad0(const ck::index_t G,
ck::wrapper::size<0>(tile_shape));
const auto kernel = DeviceImageToColumnPad0<decltype(input_tensor_global),
decltype(output_tensor_global),
decltype(tile_shape),
decltype(thread_layout)>;
decltype(output_tensor_global),
decltype(tile_shape),
decltype(thread_layout)>;
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
kernel,
dim3(grid_size_x, grid_size_y, 1),

View File

@@ -0,0 +1,4 @@
if(GPU_TARGETS MATCHES "gfx950")
add_executable(client_gemm_mx_fp8 gemm_mx_fp8.cpp)
target_link_libraries(client_gemm_mx_fp8 PRIVATE composable_kernel::device_gemm_operations)
endif()

View File

@@ -0,0 +1,330 @@
// SPDX-License-Identifier: MIT
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_mx.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_mx.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
using ADataType = ck::f8_t;
using BDataType = ck::f8_t;
using CDataType = ck::half_t;
using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t;
template <typename X, typename Y>
inline constexpr bool is_same_v = ck::is_same<X, Y>::value;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AScaleLayout = Row;
using BScaleLayout = Col;
template <bool KLast>
void preShuffleScaleBuffer(ck::e8m0_bexp_t* src, ck::e8m0_bexp_t* dst, int MN, int K)
{
int MNXdlPack = 2;
int KXdlPack = 2;
int XdlMNThread = 16;
int XdlKThread = 64 / XdlMNThread;
int K0 = K / KXdlPack / XdlKThread; // KRepeat
// The 4 16x128 building blocks will be packed into 1 32x256 for F4
// The 8 16x16x128 mfma will be packed into 1 32x32x256 for F4
// unfold the MN32xK(256/32) scale buffer
// 4 16 2 2
// To XdlKThread-> XdlMNThread -> KXdlPack -> MNXdlPack
// Then, MNRepeat->KRepeat
for(int n = 0; n < MN; ++n)
{
for(int k = 0; k < K; ++k)
{
int n0 = n / (XdlMNThread * MNXdlPack); // i MNRepeat
int tempn = n % (XdlMNThread * MNXdlPack);
int n1 = tempn % XdlMNThread; // i XdlMNThread
int n2 = tempn / XdlMNThread; // i MNXdlPack
int k0 = k / (XdlKThread * KXdlPack); // i KRepeat
int tempk = k % (XdlKThread * KXdlPack);
int k1 = tempk % XdlKThread; // i XdlKThread
int k2 = tempk / XdlKThread; // i KXdlPack
int outputIndex = n0 * MNXdlPack * KXdlPack * XdlMNThread * XdlKThread * K0 +
k0 * MNXdlPack * KXdlPack * XdlMNThread * XdlKThread +
k1 * MNXdlPack * KXdlPack * XdlMNThread + n1 * MNXdlPack * KXdlPack +
k2 * MNXdlPack + n2;
// src[n * K + k] = ck::type_convert<ck::e8m0_bexp_t>(static_cast<float>(powf(2.0f, n2 +
// k2 * MNXdlPack)));
if constexpr(KLast)
dst[outputIndex] = src[n * K + k];
else
dst[outputIndex] = src[k * MN + n];
}
}
}
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
mem_size_ = mem_size;
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
std::size_t mem_size_;
};
int main(int argc, char* argv[])
{
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t KBatch = 1;
/* Require by mx type*/
constexpr ck::index_t ScaleBlockSize = 32; // scaling block size
if(argc == 1)
{
// use default case
}
else if(argc == 7)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
K = std::stoi(argv[3]);
StrideA = std::stoi(argv[4]);
StrideB = std::stoi(argv[5]);
StrideC = std::stoi(argv[6]);
}
else
{
printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
exit(0);
}
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, Row>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
/* Scale stride Calculation */
auto f_get_default_stride =
[](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) {
if(stride == -1)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
return static_cast<ck::index_t>(col);
else
return static_cast<ck::index_t>(row);
}
else
return static_cast<ck::index_t>(stride);
};
if(K % ScaleBlockSize != 0)
{
throw std::runtime_error("wrong! K must be multiple of ScaleBlockSize.");
};
auto Scale_Padded_M = (M + ScaleBlockSize - 1) / ScaleBlockSize * ScaleBlockSize;
auto Scale_Stride_AM =
f_get_default_stride(Scale_Padded_M, K / ScaleBlockSize, -1, AScaleLayout{});
auto Scale_Stride_BN = f_get_default_stride(K / ScaleBlockSize, N, -1, BScaleLayout{});
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
SimpleDeviceMem a_scale_device_buf(
sizeof(XDataType) *
f_matrix_space_size(Scale_Padded_M, K / ScaleBlockSize, Scale_Stride_AM, AScaleLayout{}));
SimpleDeviceMem b_scale_device_buf(
sizeof(XDataType) *
f_matrix_space_size(K / ScaleBlockSize, N, Scale_Stride_BN, BScaleLayout{}));
using DeviceOp =
ck::tensor_operation::device::DeviceGemmMX<ALayout,
BLayout,
CLayout,
ADataType,
XPackedDataType,
BDataType,
XPackedDataType,
CDataType,
ScaleBlockSize,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<XPackedDataType*>(a_scale_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<XPackedDataType*>(b_scale_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
Scale_Stride_AM,
StrideB,
Scale_Stride_BN,
StrideC,
KBatch,
a_element_op,
b_element_op,
c_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop =
std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / ScaleBlockSize;
std::size_t num_btype = sizeof(ADataType) * M * K / ck::packed_size_v<ADataType> +
sizeof(BDataType) * K * N / ck::packed_size_v<BDataType> +
sizeof(CDataType) * M * N +
sizeof(XDataType) * M * K / ScaleBlockSize +
sizeof(XDataType) * N * K / ScaleBlockSize;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<XPackedDataType*>(a_scale_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<XPackedDataType*>(b_scale_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
Scale_Stride_AM,
StrideB,
Scale_Stride_BN,
StrideC,
KBatch,
a_element_op,
b_element_op,
c_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}

View File

@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++17)
add_compile_options(-std=c++20)
if (DTYPES)
add_definitions(-DDTYPES)
@@ -32,7 +32,7 @@ if (DTYPES)
add_definitions(-DCK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
message(DEBUG "DTYPES macro set to ${DTYPES}")
else()
add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_INT8 "ON")

View File

@@ -14,8 +14,10 @@ cd client_example/build
cmake \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}" \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s).
### Build client example
```bash

View File

@@ -66,7 +66,8 @@ else()
-Wunreachable-code
-Wunused
-Wno-reserved-identifier
-Werror
# Werror set outside by BUILD_DEV
# -Werror
-Wno-option-ignored
-Wsign-compare
-Wno-extra-semi-stmt
@@ -108,7 +109,7 @@ else()
endif()
list(APPEND CMAKE_COMPILER_WARNINGS
-Wno-missing-field-initializers
-Wno-deprecated-declarations
-Wno-error=deprecated-declarations
)
endif()
add_definitions(${CMAKE_COMPILER_WARNINGS})

View File

@@ -0,0 +1,116 @@
# Function to generate templated instantiation functions and caller function.
# In order to reduce build times, we split the instantiation of template functions into multiple files.
# Developers can use ck::util::generate_sharded_instantiations to generate the instantiation functions,
# which can be placed the TEMPLATE_FILE (typically a .in file).
# This CMake function generates the instantiation functions and a caller function that calls all the instantiation
# functions. The ck::util::generate_sharded_instantiations function allows us to generate an arbitrary number of
# shards (NUM_SHARDS). This function loops over the shards, generates an instantiation function for each shard,
# and generates a caller function that calls all the instantiation functions.
# The explicit instatiation pattern requires the use of `extern template` to avoid implicit instantiation
# of the template functions in the caller function, and that code is automatically generated by this function.
# In addition to the user-supplied template, this CMake function uses two generic templates:
#
# 1. `instantiate_shard.in`: This is the template for the instantiation functions.
# 2. `call_shard.in`: This is the template for the caller function that calls all the instantiation functions.
# This function takes the following arguments:
#
# - INSTANCES_NAME: The name of the instances (the calling function will be named `add_${INSTANCE_NAMES}`).
# - TEMPLATE_FILE: The path to the template file that contains the templated instantiation function definitions.
# - NUM_SHARDS: The number of shards to generate.
# - OUTPUT_DIR: The build directory where the generated source files will be placed.
# - SRC_LIST: The list of source files to which the generated source files will be added.
function(generate_sharded_instantiations)
cmake_parse_arguments(
GEN_SHARDED
# No boolean arguments
""
# Single-value arguments
"INSTANCES_NAME;TEMPLATE_FILE;NUM_SHARDS;OUTPUT_DIR;SRC_LIST"
# No multi-value arguments.
""
${ARGN}
)
if (NOT GEN_SHARDED_INSTANCES_NAME)
message(FATAL_ERROR "INSTANCES_NAME is required for generate_sharded_instantiations")
endif()
if (NOT GEN_SHARDED_TEMPLATE_FILE)
message(FATAL_ERROR "TEMPLATE_FILE is required for generate_sharded_instantiations")
endif()
if (NOT GEN_SHARDED_NUM_SHARDS)
message(FATAL_ERROR "NUM_SHARDS is required for generate_sharded_instantiations")
endif()
if(NOT GEN_SHARDED_OUTPUT_DIR)
message(FATAL_ERROR "OUTPUT_DIR is required for generate_sharded_instantiations")
endif()
if (NOT GEN_SHARDED_SRC_LIST)
message(FATAL_ERROR "SRC_LIST is required for generate_sharded_instantiations")
endif()
file(MAKE_DIRECTORY ${GEN_SHARDED_OUTPUT_DIR})
set(GENERATED_SOURCE_FILES "")
set(EXTERN_TEMPLATE_STATEMENTS "")
set(CALL_STATEMENTS "")
message(STATUS "Generating sharded instantiations for target: ${GEN_SHARDED_INSTANCES_NAME}")
set(INSTANCES "${GEN_SHARDED_INSTANCES_NAME}")
# Generate the inc file with the template function defintions.
# This include file will hold the template function definitions and a using alias for all the shard
# instantiation functions.
configure_file(
"${GEN_SHARDED_TEMPLATE_FILE}"
"${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}.inc"
@ONLY
)
# Generate the sharded instantiation functions.
# This is where the build parallelization happens.
# Each of these source files will contain a single instantiation function for a shard,
# which will be called sequentially by the caller function.
set(INC_DIR "${GEN_SHARDED_INC_DIR}")
math(EXPR LAST_SHARD_ID "${GEN_SHARDED_NUM_SHARDS} - 1")
foreach(SHARD_ID RANGE 0 ${LAST_SHARD_ID})
set(NUM_SHARDS "${GEN_SHARDED_NUM_SHARDS}")
set(SHARD_FUNCTION_PATH "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}_shard_${SHARD_ID}.cpp")
set(SHARD_FUNCTION_TEMPLATE "${PROJECT_SOURCE_DIR}/cmake/instantiate_shard.in")
configure_file(
"${SHARD_FUNCTION_TEMPLATE}"
"${SHARD_FUNCTION_PATH}"
@ONLY
)
list(APPEND GENERATED_SOURCE_FILES "${SHARD_FUNCTION_PATH}")
set(SHARDED_FUNCTION_NAME "add_${INSTANCES}_shard<${NUM_SHARDS}, ${SHARD_ID}>")
list(APPEND EXTERN_TEMPLATE_STATEMENTS "extern template void\n${SHARDED_FUNCTION_NAME}(\n ${INSTANCES}& instances)")
list(APPEND CALL_STATEMENTS " ${SHARDED_FUNCTION_NAME}(instances)")
endforeach()
# Join the include statements, the extern template declarations, and the call statements each
# into a single string for variable substitution in the caller function.
string(REPLACE ";" ";\n" INCLUDE_STATEMENTS "${INCLUDE_STATEMENTS}")
string(REPLACE ";" ";\n" CALL_STATEMENTS "${CALL_STATEMENTS}")
string(REPLACE ";" ";\n" EXTERN_TEMPLATE_STATEMENTS "${EXTERN_TEMPLATE_STATEMENTS}")
# Generate the caller function.
set(CALLER_FUNCTION_PATH "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}.cpp")
set(FUNCTION_TEMPLATE "${PROJECT_SOURCE_DIR}/cmake/call_shard.in")
configure_file(
"${FUNCTION_TEMPLATE}"
"${CALLER_FUNCTION_PATH}"
@ONLY
)
list(APPEND GENERATED_SOURCE_FILES "${CALLER_FUNCTION_PATH}")
# Add the generated source files to the list of source files.
# This allows the generated source files to be included in the build.
list(APPEND ${GEN_SHARDED_SRC_LIST} ${GENERATED_SOURCE_FILES})
set(${GEN_SHARDED_SRC_LIST} "${${GEN_SHARDED_SRC_LIST}}" PARENT_SCOPE)
endfunction()

15
cmake/call_shard.in Normal file
View File

@@ -0,0 +1,15 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "@INSTANCES@.inc"
namespace ck::tensor_operation::device::instance {
@EXTERN_TEMPLATE_STATEMENTS@;
void add_@INSTANCES@(
@INSTANCES@& instances) {
@CALL_STATEMENTS@;
}
} // namespace ck::tensor_operation::device::instance

View File

@@ -68,3 +68,6 @@ endif()
target_compile_options(gtest PRIVATE ${GTEST_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CXX_FLAGS})
target_compile_definitions(gtest PRIVATE GTEST_HAS_SEH=0)
target_compile_definitions(gtest_main PRIVATE GTEST_HAS_SEH=0)

View File

@@ -0,0 +1,9 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "@INSTANCES@.inc"
namespace ck::tensor_operation::device::instance {
template void add_@INSTANCES@_shard<@NUM_SHARDS@, @SHARD_ID@>(
@INSTANCES@& instances);
} // namespace ck::tensor_operation::device::instance

View File

@@ -19,12 +19,10 @@ list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp)
# printouts fot debug purposes
# message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
# message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
add_compile_options(-std=c++17)
add_compile_options(-std=c++20)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
@@ -48,6 +46,7 @@ rocm_install_targets(
INCLUDE include
)
rocm_export_targets(
TARGETS ck_host ck_headers
EXPORT ck_host_targets
NAMESPACE composable_kernel::
)

View File

@@ -91,8 +91,9 @@ inline auto Transform(const Range& r, F f) -> std::vector<decltype(f(*r.begin())
}
template <class Range1, class Range2, class F>
inline auto Transform(const Range1& r1, const Range2& r2, F f)
-> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
inline auto Transform(const Range1& r1,
const Range2& r2,
F f) -> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
{
std::vector<decltype(f(*r1.begin(), *r2.begin()))> result;
assert(std::distance(r1.begin(), r1.end()) == std::distance(r2.begin(), r2.end()));

View File

@@ -142,12 +142,11 @@ std::vector<Operation_Conv_Fwd_Xdl_Cshuffle> Operation_Conv_Fwd_Xdl_Cshuffle::Cr
x.A = TensorDesc{prob.ADataType, prob.ALayout};
x.B = TensorDesc{prob.BDataType, prob.BLayout};
x.E = TensorDesc{prob.EDataType, prob.ELayout};
x.Ds = Transform(prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) {
return TensorDesc{dt, lo};
});
x.a_elem_op = prob.AElementOp;
x.b_elem_op = prob.BElementOp;
x.cde_elem_op = prob.CDEElementOp;
x.Ds = Transform(
prob.DsLayout, prob.DsDataType, [](auto lo, auto dt) { return TensorDesc{dt, lo}; });
x.a_elem_op = prob.AElementOp;
x.b_elem_op = prob.BElementOp;
x.cde_elem_op = prob.CDEElementOp;
x.update_prologue(prologue);
x.update_epilogue(epilogue);
result.push_back(x);

View File

@@ -55,12 +55,12 @@ TEST_CASE(test_problem_kernel)
std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
auto&& solution = solutions[i];
auto src = ck::host::InterpolateString(gemm_compile_check,
{{"include", prob.GetIncludeHeader()},
{"template", solution.ToTemplateString()},
{"m", std::to_string(prob.M)},
{"n", std::to_string(prob.N)},
{"k", std::to_string(prob.K)},
{"o", std::to_string(prob.O)}});
{{"include", prob.GetIncludeHeader()},
{"template", solution.ToTemplateString()},
{"m", std::to_string(prob.M)},
{"n", std::to_string(prob.N)},
{"k", std::to_string(prob.K)},
{"o", std::to_string(prob.O)}});
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;

View File

@@ -60,11 +60,11 @@ TEST_CASE(test_problem_kernel)
std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
auto&& solution = solutions[i];
auto src = ck::host::InterpolateString(gemm_compile_check,
{{"include", prob.GetIncludeHeader()},
{"template", solution.ToTemplateString()},
{"m", std::to_string(prob.M)},
{"n", std::to_string(prob.N)},
{"k", std::to_string(prob.K)}});
{{"include", prob.GetIncludeHeader()},
{"template", solution.ToTemplateString()},
{"m", std::to_string(prob.M)},
{"n", std::to_string(prob.N)},
{"k", std::to_string(prob.K)}});
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;

View File

@@ -8,5 +8,5 @@ target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
option(USE_HIPRTC_FOR_CODEGEN_TESTS "Whether to enable hipRTC for codegen tests." ON)
if(USE_HIPRTC_FOR_CODEGEN_TESTS)
target_compile_definitions(ck_rtc PUBLIC HIPRTC_FOR_CODEGEN_TESTS)
message("CK compiled with USE_HIPRTC_FOR_CODEGEN_TESTS set to ${USE_HIPRTC_FOR_CODEGEN_TESTS}")
message(STATUS "CK compiled with USE_HIPRTC_FOR_CODEGEN_TESTS set to ${USE_HIPRTC_FOR_CODEGEN_TESTS}")
endif()

View File

@@ -16,7 +16,7 @@ struct tmp_dir
void execute(const std::string& cmd) const;
tmp_dir(tmp_dir const&) = delete;
tmp_dir(tmp_dir const&) = delete;
tmp_dir& operator=(tmp_dir const&) = delete;
~tmp_dir();

View File

@@ -94,7 +94,7 @@ kernel clang_compile_kernel(const std::vector<src_file>& srcs, compile_options o
assert(not srcs.empty());
tmp_dir td{"compile"};
options.flags += " -I. -O3";
options.flags += " -std=c++17";
options.flags += " -std=c++20";
options.flags += " --offload-arch=" + get_device_name();
std::string out;
@@ -278,7 +278,7 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(const std::vector<src
static kernel hiprtc_compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
options.flags += " -I. -O3";
options.flags += " -std=c++17";
options.flags += " -std=c++20";
options.flags += " -DCK_CODE_GEN_RTC";
options.flags += " --offload-arch=" + get_device_name();
auto cos = compile_hip_src_with_hiprtc(srcs, options);

View File

@@ -1,18 +1,15 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
:description: Composable Kernel mathematical basis
:keywords: composable kernel, CK, ROCm, API, mathematics, algorithm
.. _supported-primitives:
********************************************************************
Supported Primitives Guide
Composable Kernel mathematical basis
********************************************************************
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
This is an introduction to the math which underpins the algorithms implemented in Composable Kernel.
------------
Softmax
------------
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` you can decompose the
softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,

View File

@@ -0,0 +1,29 @@
.. meta::
:description: Composable Kernel structure
:keywords: composable kernel, CK, ROCm, API, structure
.. _what-is-ck:
********************************************************************
Composable Kernel structure
********************************************************************
The Composable Kernel library uses a tile-based programming model and tensor coordinate transformation to achieve performance portability and code maintainability. Tensor coordinate transformation is a complexity reduction technique for complex machine learning operators.
.. image:: ../data/ck_component.png
:alt: CK Components
The Composable Kernel library consists of four layers:
* a templated tile operator layer
* a templated kernel and invoker layer
* an instantiated kernel and invoker layer
* a client API layer.
A wrapper component is included to simplify tensor transform operations.
.. image:: ../data/ck_layer.png
:alt: CK Layers

View File

@@ -1,41 +0,0 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _what-is-ck:
********************************************************************
What is the Composable Kernel library
********************************************************************
Methodology
===========
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators using an innovative technique called
"Tensor Coordinate Transformation".
.. image:: ../data/ck_component.png
:alt: CK Components
Code Structure
==============
The CK library is structured into 4 layers:
* "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Client API" layer
It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
.. image:: ../data/ck_layer.png
:alt: CK Layers

View File

@@ -28,6 +28,7 @@ external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.enable_api_reference()
docs_core.setup()
external_projects_current_project = "composable_kernel"

File diff suppressed because it is too large Load Diff

View File

@@ -8,31 +8,38 @@
Composable Kernel User Guide
********************************************************************
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++. This document contains instructions for installing, using, and contributing to the Composable Kernel project. To learn more see :ref:`what-is-ck`.
The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_.
The CK documentation is structured as follows:
The Composable Kernel repository is located at `https://github.com/ROCm/composable_kernel <https://github.com/ROCm/composable_kernel>`_.
.. grid:: 2
:gutter: 3
.. grid-item-card:: Installation
.. grid-item-card:: Install
* :ref:`docker-hub`
* :doc:`Composable Kernel prerequisites <./install/Composable-Kernel-prerequisites>`
* :doc:`Build and install Composable Kernel <./install/Composable-Kernel-install>`
* :doc:`Build and install Composable Kernel on a Docker image <./install/Composable-Kernel-Docker>`
.. grid-item-card:: Conceptual
* :ref:`what-is-ck`
* :doc:`Composable Kernel structure <./conceptual/Composable-Kernel-structure>`
* :doc:`Composable Kernel mathematical basis <./conceptual/Composable-Kernel-math>`
.. grid-item-card:: API reference
.. grid-item-card:: Tutorials
* :ref:`supported-primitives`
* :ref:`api-reference`
* :ref:`wrapper`
* :doc:`Composable Kernel examples and tests <./tutorial/Composable-Kernel-examples>`
.. grid-item-card:: Tutorial
* :ref:`hello-world`
.. grid-item-card:: Reference
* :doc:`Composable Kernel supported scalar types <./reference/Composable_Kernel_supported_scalar_types>`
* :doc:`Composable Kernel custom types <./reference/Composable_Kernel_custom_types>`
* :doc:`Composable Kernel vector utilities <./reference/Composable_Kernel_vector_utilities>`
* :ref:`wrapper`
* :doc:`Composable Kernel API reference <./doxygen/html/namespace_c_k>`
* :doc:`CK Tile API reference <./doxygen/html/namespaceck__tile>`
* :doc:`Composable Kernel complete API class list <./doxygen/html/annotated>`
To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
You can find licensing information on the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.

View File

@@ -0,0 +1,16 @@
.. meta::
:description: Composable Kernel docker files
:keywords: composable kernel, CK, ROCm, API, docker
.. _docker-hub:
********************************************************************
Composable Kernel Docker containers
********************************************************************
Docker images that include all the required prerequisites for building Composable Kernel are available on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_.
The images also contain `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_, `CMake <https://cmake.org/getting-started/>`_, and the `ROCm LLVM compiler infrastructure <https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html>`_.
Composable Kernel Docker images are named according to their operating system and ROCm version. For example, a Docker image named ``ck_ub22.04_rocm6.3`` would correspond to an Ubuntu 22.04 image with ROCm 6.3.

View File

@@ -0,0 +1,72 @@
.. meta::
:description: Composable Kernel build and install
:keywords: composable kernel, CK, ROCm, API, documentation, install
******************************************************
Building and installing Composable Kernel with CMake
******************************************************
Before you begin, clone the `Composable Kernel GitHub repository <https://github.com/ROCm/composable_kernel.git>`_ and create a ``build`` directory in its root:
.. code:: shell
git clone https://github.com/ROCm/composable_kernel.git
cd composable_kernel
mkdir build
Change directory to the ``build`` directory and generate the makefile using the ``cmake`` command. Two build options are required:
* ``CMAKE_PREFIX_PATH``: The ROCm installation path. ROCm is installed in ``/opt/rocm`` by default.
* ``CMAKE_CXX_COMPILER``: The path to the Clang compiler. Clang is found at ``/opt/rocm/llvm/bin/clang++`` by default.
.. code:: shell
cd build
cmake ../. -D CMAKE_PREFIX_PATH="/opt/rocm" -D CMAKE_CXX_COMPILER="/opt/rocm/llvm/bin/clang++" [-D<OPTION1=VALUE1> [-D<OPTION2=VALUE2>] ...]
Other build options are:
* ``DISABLE_DL_KERNELS``: Set this to "ON" to not build deep learning (DL) and data parallel primitive (DPP) instances.
.. note::
DL and DPP instances are useful on architectures that don't support XDL or WMMA.
* ``CK_USE_FP8_ON_UNSUPPORTED_ARCH``: Set to ``ON`` to build FP8 data type instances on gfx90a without native FP8 support.
* ``GPU_TARGETS``: Target architectures. Target architectures in this list must all be different versions of the same architectures. Enclose the list of targets in quotation marks. Separate multiple targets with semicolons (``;``). For example, ``cmake -D GPU_TARGETS="gfx908;gfx90a"``. This option is required to build tests and examples.
* ``GPU_ARCHS``: Target architectures. Target architectures in this list are not limited to different versions of the same architectures. Enclose the list of targets in quotation marks. Separate multiple targets with semicolons (``;``). For example, ``cmake -D GPU_TARGETS="gfx908;gfx1100"``.
* ``CMAKE_BUILD_TYPE``: The build type. Can be ``None``, ``Release``, ``Debug``, ``RelWithDebInfo``, or ``MinSizeRel``. CMake will use ``Release`` by default.
.. Note::
If neither ``GPU_TARGETS`` nor ``GPU_ARCHS`` is specified, Composable Kernel will be built for all targets supported by the compiler.
Build Composable Kernel using the generated makefile. This will build the library, the examples, and the tests, and save them to ``bin``.
.. code:: shell
make -j20
The ``-j`` option speeds up the build by using multiple threads in parallel. For example, ``-j20`` uses twenty threads in parallel. On average, each thread will use 2GB of memory. Make sure that the number of threads you use doesn't exceed the available memory in your system.
Using ``-j`` alone will launch an unlimited number of threads and is not recommended.
Install the Composable Kernel library:
.. code:: shell
make install
After running ``make install``, the Composable Kernel files will be saved to the following locations:
* Library files: ``/opt/rocm/lib/``
* Header files: ``/opt/rocm/include/ck/`` and ``/opt/rocm/include/ck_tile/``
* Examples, tests, and ckProfiler: ``/opt/rocm/bin/``
For information about ckProfiler, see `the ckProfiler readme file <https://github.com/ROCm/composable_kernel/blob/develop/profiler/README.md>`_.
For information about running the examples and tests, see :doc:`Composable Kernel examples and tests <../tutorial/Composable-Kernel-examples>`.

View File

@@ -0,0 +1,32 @@
.. meta::
:description: Composable Kernel prerequisites
:keywords: composable kernel, CK, ROCm, API, documentation, prerequisites
******************************************************
Composable Kernel prerequisites
******************************************************
Docker images that include all the required prerequisites for building Composable Kernel are available on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_.
The following prerequisites are required to build and install Composable Kernel:
* cmake
* hip-rocclr
* iputils-ping
* jq
* libelf-dev
* libncurses5-dev
* libnuma-dev
* libpthread-stubs0-dev
* llvm-amdgpu
* mpich
* net-tools
* python3
* python3-dev
* python3-pip
* redis
* rocm-llvm-dev
* zlib1g-dev
* libzstd-dev
* openssh-server
* clang-format-18

View File

@@ -1,101 +0,0 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _docker-hub:
********************************************************************
CK Docker Hub
********************************************************************
Why do I need this?
===================
To make things simpler, and bring Composable Kernel and its dependencies together,
docker images can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_. Docker images provide a complete image of the OS, the Composable Kernel library, and its dependencies in a single downloadable file.
Refer to `Docker Overview <https://docs.docker.com/get-started/overview/>`_ for more information on Docker images and containers.
Which image is right for me?
============================
The image naming includes information related to the docker image.
For example ``ck_ub20.04_rocm6.0`` indicates the following:
* ``ck`` - made for running Composable Kernel;
* ``ub20.04`` - based on Ubuntu 20.04;
* ``rocm6.0`` - ROCm platform version 6.0.
Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. Use the ``docker pull`` command to download the file::
docker pull rocm/composable_kernel:ck_ub20.04_rocm6.0
What is inside the image?
-------------------------
The docker images have everything you need for running CK including:
* `ROCm <https://rocm.docs.amd.com/en/latest/index.html>`_
* `CMake <https://cmake.org/getting-started/>`_
* `Compiler <https://github.com/ROCm/llvm-project>`_
* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
Running the docker container
============================
After downloading the docker image, you can start the container using one of a number of commands. Start with the ``docker run`` command as shown below::
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm6.0 \
/bin/bash
After starting the bash shell, the docker container current folder is `~/workspace`. The library path is ``~/workspace/composable_kernel``. Navigate to the library to begin the tutorial as explained in :ref:`hello-world`:
.. note::
If your current folder is different from `${HOME}`, adjust the line ``-v ${HOME}:/root/workspace`` in the ``docker run`` command to fit your folder structure.
Stop and restart the docker image
=================================
After finishing the tutorial, or just when you have completed your work session, you can close the docker container, or stop the docker container to restart it at another time. Closing the docker container means that it is still in the active state, and can be resumed from where you left it. Stopping the container closes it, and returns the image to its initial state.
Use the ``Ctrl-D`` option to exit the container, while leaving it active, so you can return to the container in its current state to resume the tutorial, or pickup your project where you left off.
To restart the active container use the ``docker exec`` command to specify the container name and options as follows::
docker exec -it <container_name> bash
Where:
* `exec` is the docker command
* `-it` is the interactive option for `exec`
* `<container_name>` specifies an active container on the system
* `bash` specifies the command to run in the interactive shell
.. note::
You can use the ``docker container ls`` command to list the active containers on the system.
To start a container from the image, use the ``docker start`` command::
docker start <container_name>
Then use the docker exec command as shown above to start the bash shell.
Use the ``docker stop`` command to stop the container and restore the image to its initial state::
docker stop <container_name>
Editing the docker image
=======================
If you want to customize the docker image, edit the
`Dockerfile <https://github.com/ROCm/composable_kernel/blob/develop/Dockerfile>`_
from the GitHub repository to suit your needs.

View File

@@ -1,48 +0,0 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _api-reference:
********************************************************************
API reference guide
********************************************************************
This document contains details of the APIs for the Composable Kernel (CK) library and introduces
some of the key design principles that are used to write new classes that extend CK functionality.
=================
CK Datatypes
=================
-----------------
DeviceMem
-----------------
.. doxygenstruct:: DeviceMem
---------------------------
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes**
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
**Blockwise classes**
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
.. doxygenstruct:: ck::BlockwiseSoftmax
**Threadwise classes**
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
.. bibliography::

View File

@@ -1,20 +1,15 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
:description: Composable Kernel wrapper
:keywords: composable kernel, CK, ROCm, API, wrapper
.. _wrapper:
********************************************************************
Wrapper
Composable Kernel wrapper
********************************************************************
-------------------------------------
Description
-------------------------------------
The CK library provides a lightweight wrapper for more complex operations implemented in
the library.
The Composable Kernel library provides a lightweight wrapper to simplify the more complex operations.
Example:

View File

@@ -0,0 +1,39 @@
.. meta::
:description: Composable Kernel supported custom types
:keywords: composable kernel, custom, data types, support, CK, ROCm
******************************************************
Composable Kernel custom data types
******************************************************
Composable Kernel supports the use of custom types that provide a way to implement specialized numerical formats.
To use custom types, a C++ type that implements the necessary operations for tensor computations needs to be created. These should include:
* Constructors and initialization methods
* Arithmetic operators if the type will be used in computational operations
* Any conversion functions needed to interface with other parts of an application
For example, to create a complex half-precision type:
.. code:: cpp
struct complex_half_t
{
half_t real;
half_t img;
};
struct complex_half_t
{
using type = half_t;
type real;
type img;
complex_half_t() : real{type{}}, img{type{}} {}
complex_half_t(type real_init, type img_init) : real{real_init}, img{img_init} {}
};
Custom types can be particularly useful for specialized applications such as complex number arithmetic,
custom quantization schemes, or domain-specific number representations.

View File

@@ -0,0 +1,69 @@
.. meta::
:description: Composable Kernel supported scalar types
:keywords: composable kernel, scalar, data types, support, CK, ROCm
***************************************************
Composable Kernel supported scalar data types
***************************************************
The Composable Kernel library provides support for the following scalar data types:
.. list-table::
:header-rows: 1
:widths: 25 15 60
* - Type
- Bit Width
- Description
* - ``double``
- 64-bit
- Standard IEEE 754 double precision floating point
* - ``float``
- 32-bit
- Standard IEEE 754 single precision floating point
* - ``int32_t``
- 32-bit
- Standard signed 32-bit integer
* - ``int8_t``
- 8-bit
- Standard signed 8-bit integer
* - ``uint8_t``
- 8-bit
- Standard unsigned 8-bit integer
* - ``bool``
- 1-bit
- Boolean type
* - ``ck::half_t``
- 16-bit
- IEEE 754 half precision floating point with 5 exponent bits, 10 mantissa bits, and 1 sign bit
* - ``ck::bhalf_t``
- 16-bit
- Brain floating point with 8 exponent bits, 7 mantissa bits, and 1 sign bit
* - ``ck::f8_t``
- 8-bit
- 8-bit floating point (E4M3 format) with 4 exponent bits, 3 mantissa bits, and 1 sign bit
* - ``ck::bf8_t``
- 8-bit
- 8-bit brain floating point (E5M2 format) with 5 exponent bits, 2 mantissa bits, and 1 sign bit
* - ``ck::f4_t``
- 4-bit
- 4-bit floating point format (E2M1 format) with 2 exponent bits, 1 mantissa bit, and 1 sign bit
* - ``ck::f6_t``
- 6-bit
- 6-bit floating point format (E2M3 format) with 2 exponent bits, 3 mantissa bits, and 1 sign bit
* - ``ck::bf6_t``
- 6-bit
- 6-bit brain floating point format (E3M2 format) with 3 exponent bits, 2 mantissa bits, and 1 sign bit

View File

@@ -0,0 +1,16 @@
.. meta::
:description: Composable Kernel supported precision types and custom type support
:keywords: composable kernel, precision, data types, ROCm
******************************************************
Composable Kernel vector template utilities
******************************************************
Composable Kernel includes template utilities for creating vector types with customizable widths. These template utilities also flatten nested vector types into a single, wider vector, preventing the creation of vectors of vectors.
Vectors composed of supported scalar and custom types can be created with the ``ck::vector_type`` template.
For example, ``ck::vector_type<float, 4>`` creates a vector composed of four floats and ``ck::vector_type<ck::half_t, 8>`` creates a vector composed of eight half-precision scalars.
For vector operations to be valid, the underlying types must be either a :doc:`supported scalar type <Composable_Kernel_supported_scalar_types>` or :doc:`a custom type <Composable_Kernel_custom_types>` that implements the required operations.

View File

@@ -3,34 +3,43 @@ defaults:
root: index
subtrees:
- caption: Conceptual
entries:
- file: conceptual/what-is-ck.rst
title: What is Composable Kernel?
- caption: Install
entries:
- file: install/dockerhub.rst
title: Docker Hub
- file: install/Composable-Kernel-prerequisites.rst
title: Composable Kernel prerequisites
- file: install/Composable-Kernel-install.rst
title: Build and install Composable Kernel
- file: install/Composable-Kernel-Docker.rst
title: Composable Kernel Docker images
- caption: CK API Reference
- caption: Conceptual
entries:
- file: reference/Supported_Primitives_Guide.rst
title: Supported Primitives
- file: reference/API_Reference_Guide.rst
title: API Reference
- file: reference/wrapper.rst
title: Wrapper
- file: conceptual/Composable-Kernel-structure.rst
title: Composable Kernel structure
- file: conceptual/Composable-Kernel-math.rst
title: Composable Kernel mathematical basis
- caption: Tutorial
entries:
- file: tutorial/tutorial_hello_world.rst
title: Hello World Tutorial
- file: tutorial/Composable-Kernel-examples.rst
title: Composable Kernel examples
- caption: Reference
entries:
- file: reference/Composable_Kernel_supported_scalar_types.rst
title: Composable Kernel scalar types
- file: reference/Composable_Kernel_custom_types.rst
title: Composable Kernel custom types
- file: reference/Composable_Kernel_vector_utilities.rst
title: Composable Kernel vector utilities
- file: reference/Composable-Kernel-wrapper.rst
title: Composable Kernel wrapper
- file: doxygen/html/annotated.rst
title: Composable Kernel class list
- caption: About
entries:
- file: Contributors_Guide.rst
title: Contributing to CK
title: Contributing to Composable Kernel
- file: license.rst
title: License

View File

@@ -1,2 +1,2 @@
rocm-docs-core==1.18.1
sphinxcontrib-bibtex==2.6.3
rocm-docs-core[api_reference]==1.20.1
sphinxcontrib-bibtex==2.6.5

View File

@@ -6,68 +6,79 @@
#
accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==0.7.16
alabaster==1.0.0
# via sphinx
asttokens==3.0.0
# via stack-data
attrs==24.3.0
attrs==25.3.0
# via
# jsonschema
# jupyter-cache
# referencing
babel==2.15.0
babel==2.17.0
# via
# pydata-sphinx-theme
# sphinx
beautifulsoup4==4.12.3
beautifulsoup4==4.13.4
# via pydata-sphinx-theme
breathe==4.35.0
breathe==4.36.0
# via rocm-docs-core
certifi==2024.7.4
certifi==2025.1.31
# via requests
cffi==1.16.0
cffi==1.17.1
# via
# cryptography
# pynacl
charset-normalizer==3.3.2
charset-normalizer==3.4.1
# via requests
click==8.1.7
click==8.1.8
# via
# click-log
# doxysphinx
# jupyter-cache
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
comm==0.2.2
# via ipykernel
cryptography==43.0.0
contourpy==1.3.2
# via matplotlib
cryptography==44.0.2
# via pyjwt
debugpy==1.8.12
cycler==0.12.1
# via matplotlib
debugpy==1.8.14
# via ipykernel
decorator==5.1.1
decorator==5.2.1
# via ipython
deprecated==1.2.14
deprecated==1.2.18
# via pygithub
docutils==0.21.2
# via
# breathe
# myst-parser
# pybtex-docutils
# pydata-sphinx-theme
# sphinx
# sphinxcontrib-bibtex
doxysphinx==3.3.12
# via rocm-docs-core
exceptiongroup==1.2.2
# via ipython
executing==2.1.0
executing==2.2.0
# via stack-data
fastjsonschema==2.20.0
fastjsonschema==2.21.1
# via
# nbformat
# rocm-docs-core
gitdb==4.0.11
fonttools==4.57.0
# via matplotlib
gitdb==4.0.12
# via gitpython
gitpython==3.1.43
gitpython==3.1.44
# via rocm-docs-core
greenlet==3.1.1
greenlet==3.2.1
# via sqlalchemy
idna==3.7
idna==3.10
# via requests
imagesize==1.4.1
# via sphinx
@@ -77,13 +88,13 @@ importlib-metadata==8.6.1
# myst-nb
ipykernel==6.29.5
# via myst-nb
ipython==8.31.0
ipython==8.35.0
# via
# ipykernel
# myst-nb
jedi==0.19.2
# via ipython
jinja2==3.1.4
jinja2==3.1.6
# via
# myst-parser
# sphinx
@@ -103,25 +114,35 @@ jupyter-core==5.7.2
# jupyter-client
# nbclient
# nbformat
kiwisolver==1.4.8
# via matplotlib
latexcodec==3.0.0
# via pybtex
libsass==0.22.0
# via doxysphinx
lxml==5.2.1
# via doxysphinx
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.5
markupsafe==3.0.2
# via jinja2
matplotlib==3.10.1
# via doxysphinx
matplotlib-inline==0.1.7
# via
# ipykernel
# ipython
mdit-py-plugins==0.4.1
mdit-py-plugins==0.4.2
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==1.1.2
mpire==2.10.2
# via doxysphinx
myst-nb==1.2.0
# via rocm-docs-core
myst-parser==3.0.1
myst-parser==4.0.1
# via myst-nb
nbclient==0.10.2
# via
@@ -134,26 +155,34 @@ nbformat==5.10.4
# nbclient
nest-asyncio==1.6.0
# via ipykernel
packaging==24.1
numpy==1.26.4
# via
# contourpy
# doxysphinx
# matplotlib
packaging==25.0
# via
# ipykernel
# matplotlib
# pydata-sphinx-theme
# sphinx
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
platformdirs==4.3.6
pillow==11.2.1
# via matplotlib
platformdirs==4.3.7
# via jupyter-core
prompt-toolkit==3.0.50
prompt-toolkit==3.0.51
# via ipython
psutil==6.1.1
psutil==7.0.0
# via ipykernel
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pybtex==0.24.0
pybtex==0.25.1
# via
# pybtex-docutils
# sphinxcontrib-bibtex
@@ -165,21 +194,30 @@ pydata-sphinx-theme==0.15.4
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.3.0
pygithub==2.6.1
# via rocm-docs-core
pygments==2.18.0
pygments==2.19.1
# via
# accessible-pygments
# ipython
# mpire
# pydata-sphinx-theme
# sphinx
pyjwt[crypto]==2.8.0
pyjson5==1.6.8
# via doxysphinx
pyjwt[crypto]==2.10.1
# via pygithub
pynacl==1.5.0
# via pygithub
pyparsing==3.2.3
# via
# doxysphinx
# matplotlib
python-dateutil==2.9.0.post0
# via jupyter-client
pyyaml==6.0.1
# via
# jupyter-client
# matplotlib
pyyaml==6.0.2
# via
# jupyter-cache
# myst-nb
@@ -187,11 +225,11 @@ pyyaml==6.0.1
# pybtex
# rocm-docs-core
# sphinx-external-toc
pyzmq==26.2.0
pyzmq==26.4.0
# via
# ipykernel
# jupyter-client
referencing==0.36.1
referencing==0.36.2
# via
# jsonschema
# jsonschema-specifications
@@ -199,23 +237,21 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.18.1
rocm-docs-core[api-reference]==1.20.1
# via -r requirements.in
rpds-py==0.22.3
rpds-py==0.24.0
# via
# jsonschema
# referencing
six==1.16.0
# via
# pybtex
# python-dateutil
smmap==5.0.1
six==1.17.0
# via python-dateutil
smmap==5.0.2
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.5
soupsieve==2.7
# via beautifulsoup4
sphinx==7.4.7
sphinx==8.1.3
# via
# breathe
# myst-nb
@@ -228,19 +264,19 @@ sphinx==7.4.7
# sphinx-external-toc
# sphinx-notfound-page
# sphinxcontrib-bibtex
sphinx-book-theme==1.1.3
sphinx-book-theme==1.1.4
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
sphinx-design==0.6.0
sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-notfound-page==1.0.3
sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-bibtex==2.6.3
sphinxcontrib-bibtex==2.6.5
# via -r requirements.in
sphinxcontrib-devhelp==2.0.0
# via sphinx
@@ -252,18 +288,20 @@ sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
sqlalchemy==2.0.37
sqlalchemy==2.0.40
# via jupyter-cache
stack-data==0.6.3
# via ipython
tabulate==0.9.0
# via jupyter-cache
tomli==2.0.1
tomli==2.2.1
# via sphinx
tornado==6.4.2
# via
# ipykernel
# jupyter-client
tqdm==4.67.1
# via mpire
traitlets==5.14.3
# via
# comm
@@ -274,21 +312,22 @@ traitlets==5.14.3
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.12.2
typing-extensions==4.13.2
# via
# beautifulsoup4
# ipython
# myst-nb
# pydata-sphinx-theme
# pygithub
# referencing
# sqlalchemy
urllib3==2.2.2
urllib3==2.4.0
# via
# pygithub
# requests
wcwidth==0.2.13
# via prompt-toolkit
wrapt==1.16.0
wrapt==1.17.2
# via deprecated
zipp==3.21.0
# via importlib-metadata

View File

@@ -0,0 +1,40 @@
.. meta::
:description: Composable Kernel examples and tests
:keywords: composable kernel, CK, ROCm, API, examples, tests
********************************************************************
Composable Kernel examples and tests
********************************************************************
After :doc:`building and installing Composable Kernel <../install/Composable-Kernel-install>`, the examples and tests will be moved to ``/opt/rocm/bin/``.
All tests have the prefix ``test`` and all examples have the prefix ``example``.
Use ``ctest`` with no arguments to run all examples and tests, or use ``ctest -R`` to run a single test. For example:
.. code:: shell
ctest -R test_gemm_fp16
Examples can be run individually as well. For example:
.. code:: shell
./bin/example_gemm_xdl_fp16 1 1 1
For instructions on how to run individual examples and tests, see their README files in the |example|_ and |test|_ GitHub folders.
To run smoke tests, use ``make smoke``.
To run regression tests, use ``make regression``.
In general, tests that run for under thirty seconds are included in the smoke tests and tests that run for over thirty seconds are included in the regression tests.
.. |example| replace:: ``example``
.. _example: https://github.com/ROCm/composable_kernel/tree/develop/example
.. |client_example| replace:: ``client_example``
.. _client_example: https://github.com/ROCm/composable_kernel/tree/develop/client_example
.. |test| replace:: ``test``
.. _test: https://github.com/ROCm/composable_kernel/tree/develop/test

View File

@@ -1,165 +0,0 @@
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _hello-world:
********************************************************************
Hello World Tutorial
********************************************************************
This tutorial is for engineers dealing with artificial intelligence and machine learning who
would like to optimize pipelines and improve performance using the Composable
Kernel (CK) library. This tutorial provides an introduction to the CK library. You will build the library and run some examples using a "Hello World" example.
Description
===========
Modern AI technology solves more and more problems in a variety of fields, but crafting fast and
efficient workflows is still challenging. CK can make the AI workflow fast
and efficient. CK is a collection of optimized AI operator kernels with tools to create
new kernels. The library has components required for modern neural network architectures
including matrix multiplication, convolution, contraction, reduction, attention modules, a variety of activation functions, and fused operators.
CK library acceleration features are based on:
* Layered structure
* Tile-based computation model
* Tensor coordinate transformation
* Hardware acceleration use
* Support of low precision data types including fp16, bf16, int8 and int4
If you need more technical details and benchmarking results read the following
`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
To download the library visit the `composable_kernel repository <https://github.com/ROCm/composable_kernel>`_.
Hardware targets
================
CK library fully supports `gfx908` and `gfx90a` GPU architectures, while only some operators are
supported for `gfx1030` devices. Check your hardware to determine the target GPU architecture.
========== =========
GPU Target AMD GPU
========== =========
gfx908 Radeon Instinct MI100
gfx90a Radeon Instinct MI210, MI250, MI250X
gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
========== =========
There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
you don't have an AMD GPU at hand.
Build the library
=================
This tutorial is based on the use of docker images as explained in :ref:`docker-hub`. Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point.
.. note::
You can also `install ROCm <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/>`_ on your system, clone the `Composable Kernel repository <https://github.com/ROCm/composable_kernel.git>`_ on GitHub, and use that to build and run the examples using the commands described below.
Both the docker container and GitHub repository include the Composable Kernel library. Navigate to the library::
cd composable_kernel/
Create and change to a ``build`` directory::
mkdir build && cd build
The previous section discussed supported GPU architecture. Once you decide which hardware targets are needed, run CMake using the ``GPU_TARGETS`` flag::
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=OFF \
-D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
If everything goes well the CMake command will return::
-- Configuring done
-- Generating done
-- Build files have been written to: "/root/workspace/composable_kernel/build"
Finally, you can build examples and tests::
make -j examples tests
When complete you should see::
Scanning dependencies of target tests
[100%] Built target tests
Run examples and tests
======================
Examples are listed as test cases as well, so you can run all examples and tests with::
ctest
You can check the list of all tests by running::
ctest -N
You can also run examples separately as shown in the following example execution::
./bin/example_gemm_xdl_fp16 1 1 1
The arguments ``1 1 1`` mean that you want to run this example in the mode: verify results with CPU, initialize matrices with integers, and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
If you have a device based on `gfx908` or `gfx90a` architecture, and if the example runs as expected, you should see something like::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
Perf: 1.08153 ms, 119.136 TFlops, 89.1972 GB/s, DeviceGemm_Xdl_CShuffle<Default, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, 8, 4, 1, 2> LoopScheduler: Interwave, PipelineVersion: v1
However, running it on a `gfx1030` device should result in the following::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
Don't worry, some operators are supported on `gfx1030` architecture, so you can run a
separate example like::
./bin/example_gemm_dl_fp16 1 1 1
and it should return something like::
a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
arg.c_grid_desc_m_n_{ 3840, 4096}
launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
.. note::
A new CMake flag ``DL_KERNELS`` has been added to the latest versions of CK. If you do not see the above results when running ``example_gemm_dl_fp16``, you might need to add ``-D DL_KERNELS=ON`` to your CMake command to build the operators supported on the `gfx1030` architecture.
You can also run a separate test::
ctest -R test_gemm_fp16
If everything goes well you should see something like::
Start 121: test_gemm_fp16
1/1 Test #121: test_gemm_fp16 ................... Passed 51.81 sec
100% tests passed, 0 tests failed out of 1
Summary
=======
In this tutorial you took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. In the next tutorial you will run kernels with different configurations to find out the best one for your hardware and task.
P.S.: If you are running on a cloud instance, don't forget to switch off the cloud instance.

27
example/01_gemm/CMakeLists.txt Executable file → Normal file
View File

@@ -28,11 +28,23 @@ add_example_executable(example_gemm_xdl_fp16_v3 gemm_xdl_fp16_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3)
add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_v3)
add_example_executable(example_gemm_xdl_fp16_fp8_v3 gemm_xdl_fp16_fp8_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
add_example_executable(example_gemm_xdl_fp16_fp8_streamk_v3 gemm_xdl_fp16_fp8_streamk_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_streamk_v3)
add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3)
set(GEMM_OPTIONS)
list(APPEND GEMM_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm --slp-threshold=-16")
example_compile_options(example_gemm_xdl_fp8_v3 PRIVATE ${GEMM_OPTIONS})
example_compile_options(example_gemm_xdl_bf16_v3 PRIVATE ${GEMM_OPTIONS})
list(APPEND gpu_list gfx942 gfx950)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
@@ -103,3 +115,18 @@ add_example_executable(example_gemm_wmma_bf16 gemm_wmma_bf16.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16)
add_example_executable(example_gemm_wmma_int8 gemm_wmma_int8.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_int8)
add_example_executable(example_gemm_wmma_bf16_v3 gemm_wmma_bf16_v3.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16_v3)
add_example_executable(example_gemm_wmma_bf16_pk_i4_v3 gemm_wmma_bf16_pk_i4_v3.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_bf16_pk_i4_v3)
add_example_executable(example_gemm_wmma_fp8_v3 gemm_wmma_fp8_v3.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp8_v3)
add_example_executable(example_gemm_wmma_fp16_v3 gemm_wmma_fp16_v3.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_v3)
add_example_executable(example_gemm_wmma_fp16_pk_i4_v3 gemm_wmma_fp16_pk_i4_v3.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_pk_i4_v3)
add_example_executable(example_gemm_wmma_fp16_fp8_v3 gemm_wmma_fp16_fp8_v3.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_fp8_v3)
add_example_executable(example_gemm_wmma_fp16_pk_i4_v3_b_scale gemm_wmma_fp16_pk_i4_v3_b_scale.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_pk_i4_v3_b_scale)

View File

@@ -15,6 +15,8 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
@@ -57,8 +59,9 @@ struct ProblemSizeStreamK_universal final
ck::index_t StrideB = -1;
ck::index_t StrideC = -1;
ck::index_t Grid_size = -1; // defaults to max occupancy
ck::index_t Streamk_sel = 1; // defaults to 1-tile SK
ck::index_t Grid_size = -1; // defaults to max occupancy
ck::index_t Streamk_sel = 1; // defaults to 1-tile SK
ck::StreamKReductionStrategy reduction_strategy = ck::StreamKReductionStrategy::Atomic;
};
struct ProblemSizeSplitK final
@@ -128,11 +131,12 @@ bool parse_cmd_args<ProblemSize>(int argc,
}
else
{
std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl;
std::cerr
<< "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC (default: -1 or 0)"
<< std::endl;
return false;
}
@@ -172,7 +176,19 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
if(argc >= 11)
{
problem_size.Streamk_sel = std::stoi(argv[10]);
problem_size.Grid_size = std::stoi(argv[11]);
if(argc >= 12)
{
problem_size.Grid_size = std::stoi(argv[11]);
if(argc >= 13)
{
int reduction_strategy = std::stoi(argv[12]);
problem_size.reduction_strategy = reduction_strategy == 0
? ck::StreamKReductionStrategy::Atomic
: ck::StreamKReductionStrategy::Reduction;
}
}
}
}
else
@@ -181,9 +197,12 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
<< "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC (default: -1 or 0)"
<< std::endl
<< "arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
<< "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
<< std::endl
<< "arg11: Grid_size(-1 for max occupancy)" << std::endl
<< "arg12: Reduction strategy (0: Atomic, 1: Reduction)" << std::endl;
return false;
}
@@ -227,13 +246,14 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
}
else
{
std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: stream-k select (0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
<< "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
std::cerr
<< "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC (default: -1 or 0)"
<< std::endl
<< "arg10: stream-k select (0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
<< "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
return false;
}
@@ -277,12 +297,13 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
}
else
{
std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: KBatch" << std::endl;
std::cerr
<< "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC (default: -1 or 0)"
<< std::endl
<< "arg10: KBatch" << std::endl;
return false;
}

View File

@@ -0,0 +1,253 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
using ADataType = ck::bhalf_t;
using BDataType = ck::pk_i4_t;
using AccDataType = float;
using CShuffleDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr bool PermuteA = false;
static constexpr bool PermuteB = true;
static constexpr ck::index_t KPerBlock = 32;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CElementOp, GemmDefault,
256,
128, 128, KPerBlock,
8, 8,
16, 16,
4, 2,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 1,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 1,
1, 1, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1,
ADataType, ADataType, PermuteA, PermuteB>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
template <typename ProblemType>
bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto M = problem_size.M;
auto N = problem_size.N;
auto K = problem_size.K;
auto StrideA = problem_size.StrideA;
auto StrideB = problem_size.StrideB;
auto StrideC = problem_size.StrideC;
auto KBatch = problem_size.KBatch;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return static_cast<std::size_t>(col);
}
else
{
return static_cast<std::size_t>(row);
}
}
else
return static_cast<std::size_t>(stride);
};
StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 3:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// weight permute
if constexpr(PermuteB)
{
int K1 = KPerBlock;
int K0 = K / KPerBlock;
// int K0, N, K1
for(int j = 0; j < K0; j++)
{
for(int i = 0; i < N; i++)
{
for(int jj = 0; jj < K1; jj++)
{
b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
}
}
}
}
else
{
for(int i = 0; i < N; i++)
{
for(int j = 0; j < K; j++)
{
b_k_n_permute(i * K + j) = b_k_n(i * K + j);
}
}
}
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
DeviceMem workspace;
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmV2Instance{};
auto invoker = gemm.MakeInvoker();
float ave_time = 0;
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
KBatch,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
}
if(config.time_kernel)
{
ave_time =
invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K +
sizeof(BDataType) * K * N /
(ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
return pass;
}
bool run_gemm_splitk_example(int argc, char* argv[])
{
ProblemSizeSplitK problem_size;
ExecutionConfig config;
return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
using ADataType = ck::bhalf_t;
using BDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
256,
128, 128, 32,
8, 8,
16, 16,
4, 2,
S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
1, 1, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
#include "run_gemm_example_v2.inc"
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

View File

@@ -0,0 +1,52 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
using ADataType = ck::half_t;
using BDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using CDataType = ck::half_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CElementOp, GemmDefault,
256,
128, 128, 32,
8, 8,
16, 16,
4, 2,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 1,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 1,
1, 1, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
#include "run_gemm_example_v2.inc"
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

View File

@@ -0,0 +1,302 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
using ADataType = ck::half_t;
using BDataType = ck::pk_i4_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using CDataType = ck::half_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr bool PermuteA = false;
static constexpr bool PermuteB = true;
static constexpr ck::index_t KPerBlock = 32;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CElementOp, GemmDefault,
256,
128, 128, KPerBlock,
8, 8,
16, 16,
4, 2,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 1,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 1,
1, 1, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1,
ADataType, ADataType, PermuteA, PermuteB>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
template <typename ProblemType>
bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto M = problem_size.M;
auto N = problem_size.N;
auto K = problem_size.K;
auto StrideA = problem_size.StrideA;
auto StrideB = problem_size.StrideB;
auto StrideC = problem_size.StrideC;
auto KBatch = problem_size.KBatch;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return static_cast<std::size_t>(col);
}
else
{
return static_cast<std::size_t>(row);
}
}
else
return static_cast<std::size_t>(stride);
};
StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 3:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// weight permute
if constexpr(PermuteB)
{
int K1 = KPerBlock;
int K0 = K / KPerBlock;
// int K0, N, K1
for(int j = 0; j < K0; j++)
{
for(int i = 0; i < N; i++)
{
for(int jj = 0; jj < K1; jj++)
{
b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
}
}
}
}
else
{
for(int i = 0; i < N; i++)
{
for(int j = 0; j < K; j++)
{
b_k_n_permute(i * K + j) = b_k_n(i * K + j);
}
}
}
// vector pk_i4x4 permute
for(int i = 0; i < N; i++)
{
for(int j = 0; j < K; j += 8)
{
int input[8];
for(int k = 0; k < 4; k++)
{
int i4x2 = b_k_n_permute(j + k * 2, i).data;
input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
}
// permute 01234567->20643175
{
int hi = input[2];
int lo = input[0];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 0, i) = i4x2;
}
{
int hi = input[6];
int lo = input[4];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 2, i) = i4x2;
}
{
int hi = input[3];
int lo = input[1];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 4, i) = i4x2;
}
{
int hi = input[7];
int lo = input[5];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 6, i) = i4x2;
}
}
}
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
DeviceMem workspace;
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmV2Instance{};
auto invoker = gemm.MakeInvoker();
float ave_time = 0;
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
KBatch,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
}
if(config.time_kernel)
{
ave_time =
invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K +
sizeof(BDataType) * K * N /
(ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
return pass;
}
bool run_gemm_splitk_example(int argc, char* argv[])
{
ProblemSizeSplitK problem_size;
ExecutionConfig config;
return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

View File

@@ -0,0 +1,367 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_scale.hpp"
using ADataType = ck::half_t;
using BDataType = ck::pk_i4_t;
using BScaleDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using CDataType = ck::half_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr bool PermuteA = false;
static constexpr bool PermuteB = true;
static constexpr ck::index_t Scale_Block_N = 1;
static constexpr ck::index_t Scale_Block_K = 128;
static constexpr ck::index_t KPerBlock = 64;
// clang-format off
using DeviceGemmV2Instance =
ck::tensor_operation::device::DeviceGemm_BScale_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, BScaleDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CElementOp, GemmDefault,
256, Scale_Block_N, Scale_Block_K,
128, 128,
KPerBlock, 8, 8,
16, 16,
4, 2,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
1, 1, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3,
CDataType, CDataType, PermuteA, PermuteB>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
AccDataType,
CDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
template <typename ProblemType>
bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto M = problem_size.M;
auto N = problem_size.N;
auto K = problem_size.K;
auto StrideA = problem_size.StrideA;
auto StrideB = problem_size.StrideB;
auto StrideC = problem_size.StrideC;
auto KBatch = problem_size.KBatch;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return static_cast<std::size_t>(col);
}
else
{
return static_cast<std::size_t>(row);
}
}
else
return static_cast<std::size_t>(stride);
};
ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K;
StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<BScaleDataType> b1_k_n(f_host_tensor_descriptor((K + Scale_Block_K - 1) / Scale_Block_K,
(N + Scale_Block_N - 1) / Scale_Block_N,
Scale_Stride_BN,
BLayout{}));
switch(config.init_method)
{
case 0:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
break;
case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
break;
case 3:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
break;
case 4:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
break;
case 5:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<BScaleDataType>{1});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.5, 0.5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<BScaleDataType>{0, 1.0});
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// weight permute
if constexpr(PermuteB)
{
int K1 = KPerBlock;
int K0 = K / KPerBlock;
// int K0, N, K1
for(int j = 0; j < K0; j++)
{
for(int i = 0; i < N; i++)
{
for(int jj = 0; jj < K1; jj++)
{
b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
}
}
}
}
else
{
for(int i = 0; i < N; i++)
{
for(int j = 0; j < K; j++)
{
b_k_n_permute(i * K + j) = b_k_n(i * K + j);
}
}
}
// vector pk_i4x4 permute
for(int i = 0; i < N; i++)
{
for(int j = 0; j < K; j += 8)
{
int input[8];
for(int k = 0; k < 4; k++)
{
int i4x2 = b_k_n_permute(j + k * 2, i).data;
input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
}
// permute 01234567->20643175
{
int hi = input[2];
int lo = input[0];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 0, i) = i4x2;
}
{
int hi = input[6];
int lo = input[4];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 2, i) = i4x2;
}
{
int hi = input[3];
int lo = input[1];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 4, i) = i4x2;
}
{
int hi = input[7];
int lo = input[5];
int i4x2 = (hi << 4) | lo;
b_k_n_permute(j + 6, i) = i4x2;
}
}
}
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data());
b1_scale_device_buf.ToDevice(b1_k_n.mData.data());
DeviceMem workspace;
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmV2Instance{};
auto invoker = gemm.MakeInvoker();
float ave_time = 0;
auto argument =
gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
Scale_Stride_BN,
static_cast<BScaleDataType*>(b1_scale_device_buf.GetDeviceBuffer()),
KBatch,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
std::string device_name = ck::get_device_name();
if(!(device_name.find("gfx11") != std::string::npos ||
device_name.find("gfx12") != std::string::npos))
{
std::cout << "This kernel support gfx1100 and gfx1200 only" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{
Tensor<float> b_k_n_dequant({K, N});
float v_b = 0;
for(int n = 0; n < N; n++)
{
for(int k = 0; k < K; k++)
{
ck::pk_i4_t i4x2 = b_k_n(k, n).data;
int8_t i4 = 0;
if(k % 2 == 1)
i4 = (i4x2.data >> 0) & 0xf;
else
i4 = (i4x2.data >> 4) & 0xf;
i4 = i4 - 8;
v_b = ck::type_convert<float>(i4);
b_k_n_dequant(k, n) =
ck::type_convert<float>(v_b) *
ck::type_convert<float>(b1_k_n(k / Scale_Block_K, n / Scale_Block_N));
}
}
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n_dequant, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0});
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
}
if(config.time_kernel)
{
ave_time =
invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 20, 50, true, 50});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K +
sizeof(BDataType) * K * N /
(ck::is_same_v<ck::remove_cvref_t<BDataType>, ck::pk_i4_t> ? 2 : 1) +
sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
return pass;
}
bool run_gemm_splitk_example(int argc, char* argv[])
{
ProblemSizeSplitK problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using CDataType = ck::half_t;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
128,
128, 64,
64, 8, 8,
16, 16,
4, 2,
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 1, 8, 1,
1, 1, S<1, 32, 1, 4>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
#include "run_gemm_example_v2.inc"
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

View File

@@ -0,0 +1,67 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3.hpp"
using ADataType = ck::f8_t;
using BDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using ComputeTypeA = ck::f8_t;
using ComputeTypeB = ck::f8_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
128,
128, 64, 64,
8, 8,
16, 16,
4, 2,
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
1, 1, S<1, 32, 1, 4>, 8,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1,
ComputeTypeA, ComputeTypeB>;
// clang-format on
using ReferenceComputeType = ck::f8_t;
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp,
ReferenceComputeType,
ReferenceComputeType>;
#include "run_gemm_example_v2.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return !run_gemm_splitk_example(argc, argv);
}

0
example/01_gemm/gemm_xdl_bf16.cpp Executable file → Normal file
View File

View File

@@ -133,7 +133,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// weight permute
@@ -192,14 +192,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" ||
ck::get_device_name() != "gfx950")
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{

0
example/01_gemm/gemm_xdl_bf16_streamk_v3.cpp Executable file → Normal file
View File

View File

@@ -0,0 +1,64 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
using ADataType = ck::half_t;
using BDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using CDataType = ck::half_t;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CElementOp, GemmDefault,
64,
16, 16,
256, 8, 16,
16, 16,
1, 1,
S<32, 2, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<16, 4, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 16, 16, 0,
1, 1, S<1, 16, 1, 4>, 4,
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1>;
// clang-format on
using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
PassThrough,
PassThrough,
PassThrough>;
#include "run_gemm_example_streamk_v2.inc"
int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }

View File

@@ -134,7 +134,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// weight permute
@@ -242,14 +242,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" ||
ck::get_device_name() != "gfx950")
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{

View File

@@ -161,7 +161,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem b1_scale_device_buf(sizeof(BScaleDataType) * b1_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
@@ -274,14 +274,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" ||
ck::get_device_name() != "gfx950")
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{

View File

@@ -31,15 +31,10 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
#else
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
#endif
// clang-format on
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
BLayout,

View File

@@ -32,6 +32,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// this instance has been tested working on gfx950
// < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 128, 32, 32, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8, LoopSched, PipelineVer, ComputeTypeA, ComputeTypeB>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::

View File

@@ -152,7 +152,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_preshuffled.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_preshuffled.mDesc.GetElementSpaceSize() /
2);
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// do GEMM
@@ -261,14 +262,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" ||
ck::get_device_name() != "gfx950")
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{

View File

@@ -132,7 +132,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize() / 2);
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
// weight permute
@@ -240,14 +240,20 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument) || ck::get_device_name() != "gfx942" ||
ck::get_device_name() != "gfx950")
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
if(!(ck::get_device_name() == "gfx942" || ck::get_device_name() == "gfx950"))
{
std::cout << "This kernel support gfx942 and gfx950 only" << std::endl;
return true;
}
bool pass = true;
if(config.do_verification)
{

0
example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp Executable file → Normal file
View File

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
@@ -38,7 +38,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraM| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| | | PerVector| | Lengths_K0_N_K1| | | PerVector| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 1, 1, 1, S<1, 8, 1, 8>, 4>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 0, S<4, 16, 4>, S<1, 0, 2>, 2, 2, 0, 1, 1, S<1, 8, 1, 8>, 4>;
// clang-format on
#else
// clang-format off

View File

@@ -56,10 +56,10 @@ using CDataType = float;
using AccDataType = float;
#endif
// clang-format on
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, float, AElementOp, BElementOp, CElementOp>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, float, AElementOp, BElementOp, CElementOp>;
template <typename DataType>
std::ostream& show_2d_matrix(std::ostream& os, Tensor<DataType>& matrix)

View File

@@ -33,7 +33,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
if(stride == -1 || stride == 0)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)

View File

@@ -36,7 +36,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
if(stride == -1 || stride == 0)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)

View File

@@ -21,6 +21,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto Grid_size = problem_size.Grid_size;
auto Streamk_sel = problem_size.Streamk_sel;
auto reduction_strategy = problem_size.reduction_strategy;
if(reduction_strategy == ck::StreamKReductionStrategy::Atomic)
{
std::cout << "Using Atomic reduction strategy" << std::endl;
}
else
{
std::cout << "Using Parallel reduction strategy" << std::endl;
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
@@ -35,7 +45,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
if(stride == -1 || stride == 0)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
@@ -152,7 +162,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
Grid_size,
a_element_op,
b_element_op,
c_element_op);
c_element_op,
reduction_strategy);
if(!gemm.IsSupportedArgument(argument))
{
@@ -242,7 +253,10 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
<< " GB/s, " << gemm.GetTypeString()
<< (reduction_strategy == ck::StreamKReductionStrategy::Atomic ? " (Atomic)"
: " (Reduction)")
<< std::endl;
}
return pass;
}

View File

@@ -34,7 +34,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
if(stride == -1 || stride == 0)
{
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -34,7 +34,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraM| ThreadCluster| SrcAccessOrder| SrcVectorDim| Scalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| | | PerVector| | Lengths_K0_N_K1| | | PerVector| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 64, 64, 64, 64, 8, 8, 32, 32, 2, 2, S<1, 8, 8>, S<1, 0, 2>, 2, 1, 1, S<1, 8, 8>, S<1, 0, 2>, 2, 1, 1, 1, 1, S<1, 8, 1, 8>, 4>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 64, 64, 64, 64, 8, 8, 32, 32, 2, 2, S<8, 1, 8>, S<1, 0, 2>, 2, 1, 0, S<8, 1, 8>, S<1, 0, 2>, 2, 1, 0, 1, 1, S<1, 8, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -117,7 +117,7 @@ int reduce_blockwise_impl(bool do_verification,
using InOutDataTypeInDevice = typename std::
conditional<std::is_same<InOutDataType, int4_t>::value, int8_t, InOutDataType>::type;
#else
using InOutDataTypeInDevice = InOutDataType;
using InOutDataTypeInDevice = InOutDataType;
#endif
using DeviceReduceInstance =

View File

@@ -141,8 +141,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
a_tensors_device.reserve(group_count);
b_tensors_device.reserve(group_count);
d_tensors_device.reserve(group_count);
c_tensors_device.reserve(group_count);
d_tensors_device.resize(group_count); // reserve and update vector size
std::size_t flop = 0, num_btype = 0;

Some files were not shown because too many files have changed in this diff Show More