Merge branch 'wjx/preshuffle_format' of https://github.com/ROCm/composable_kernel into wjx/preshuffle_format

This commit is contained in:
lalala-sh
2025-09-30 11:07:32 +08:00
440 changed files with 13915 additions and 3223 deletions

2
.github/CODEOWNERS vendored
View File

@@ -1,4 +1,4 @@
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @shumway @vidyasagar-amd
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
# Documentation files
docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD

View File

@@ -42,6 +42,24 @@ def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
file=sys.stderr,
)
return None
GITHUB_WORKFLOWS_CI_PATTERNS = [
"therock*",
]
def is_path_workflow_file_related_to_ci(path: str) -> bool:
return any(
fnmatch.fnmatch(path, ".github/workflows/" + pattern)
for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
) or any(
fnmatch.fnmatch(path, ".github/scripts/" + pattern)
for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
)
def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
if paths is None:
return False
return any(is_path_workflow_file_related_to_ci(p) for p in paths)
# Paths matching any of these patterns are considered to have no influence over
# build or test workflows so any related jobs can be skipped if all paths
@@ -82,12 +100,16 @@ def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
)
other_paths = paths_set - github_workflows_paths
related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
print("should_ci_run_given_modified_paths findings:")
print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")
if contains_other_non_skippable_files:
if related_to_ci:
print("Enabling build jobs since a related workflow file was modified")
return True
elif contains_other_non_skippable_files:
print("Enabling TheRock CI jobs since a non-skippable path was modified")
return True
else:

View File

@@ -27,30 +27,35 @@ jobs:
TEATIME_FORCE_INTERACTIVE: 0
AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
steps:
- name: "Checking out repository for rocm-libraries"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rocm-libraries"
- name: Checkout composable_kernel repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
path: "composable_kernel"
- name: Checkout TheRock repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: ec1c2ef4f2636bce7733fd8c95e1dbb6692c8a57
ref: 409f43ad9d564454bb1b23f8c8aa15d6b9d25200
path: "TheRock"
- name: Runner Health Settings
run: |
df -h
cmake --version
echo "Installed Python versions:"
ls -d /opt/python
echo "python: $(which python), python3: $(which python3)"
echo "Git version: $(git --version)"
git config --global --add safe.directory $PWD
git config fetch.parallel 10
./TheRock/build_tools/health_status.py
- name: Fetch sources
run: |
./TheRock/build_tools/fetch_sources.py --jobs 12
./TheRock/build_tools/fetch_sources.py --jobs 12 --no-include-rocm-libraries --no-include-ml-frameworks
- name: Patch rocm-libraries
run: |
git config --global --add safe.directory '*'
git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch
- name: Install python deps
run: |
@@ -92,32 +97,14 @@ jobs:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Create Logs index Files and upload logs
- name: Post Build Upload
if: always()
run: |
python3 TheRock/build_tools/github_actions/create_log_index.py \
--build-dir=TheRock/build \
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
python3 TheRock/build_tools/github_actions/upload_build_logs_to_s3.py \
--build-dir=TheRock/build \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
- name: Upload artifacts
run: |
python TheRock/build_tools/github_actions/upload_build_artifacts.py \
python3 TheRock/build_tools/github_actions/post_build_upload.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir TheRock/build
- name: Add Links to Job Summary
if: always()
run: |
python TheRock/build_tools/github_actions/upload_build_summary.py \
--run-id ${{ github.run_id }} \
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
--build-dir TheRock/build
--build-dir TheRock/build \
--upload
therock-test-linux:
name: "Test"

View File

@@ -56,7 +56,14 @@ jobs:
uses: ./.github/workflows/therock-ci-linux.yml
secrets: inherit
with:
cmake_options: "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON -DTHEROCK_ENABLE_MIOPEN=ON -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_USE_EXTERNAL_CK=ON -DTHEROCK_CK_SOURCE_DIR=../"
cmake_options: >-
-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON
-DTHEROCK_ENABLE_MIOPEN=ON
-DTHEROCK_ENABLE_ALL=OFF
-DTHEROCK_USE_EXTERNAL_COMPOSABLE_KERNEL=ON
-DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel
-DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON
-DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../
amdgpu_families: "gfx94X-dcgpu"
test_runs_on: "linux-mi325-1gpu-ossci-rocm"

View File

@@ -0,0 +1,71 @@
name: Test component
on:
workflow_call:
inputs:
artifact_run_id:
type: string
default: ""
amdgpu_families:
type: string
test_runs_on:
type: string
platform:
type: string
component:
type: string
permissions:
contents: read
jobs:
test_component:
name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})'
runs-on: ${{ inputs.test_runs_on }}
container:
image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }}
options: --ipc host
--group-add video
--device /dev/kfd
--device /dev/dri
--group-add 992
--env-file /etc/podinfo/gha-gpu-isolation-settings
strategy:
fail-fast: false
matrix:
# The shard array is based on "total_shards" from "fetch_test_configurations.py"
# The test executable will shard based on the array. (ex: [1, 2, 3, 4] = four test shards)
shard: ${{ fromJSON(inputs.component).shard_arr }}
defaults:
run:
shell: bash
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
OUTPUT_ARTIFACTS_DIR: "./build"
THEROCK_BIN_DIR: "./build/bin"
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
steps:
- name: Checkout Repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: "ROCm/TheRock"
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Test
timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }}
env:
SHARD_INDEX: ${{ matrix.shard }}
TOTAL_SHARDS: ${{ fromJSON(inputs.component).total_shards }}
run: |
${{ fromJSON(inputs.component).test_script }}

View File

@@ -37,41 +37,17 @@ jobs:
test_components:
name: 'Test ${{ matrix.components.job_name }}'
runs-on: ${{ inputs.test_runs_on }}
needs: configure_test_matrix
needs: [configure_test_matrix]
# skip tests if no test matrix to run
if: ${{ needs.configure_test_matrix.outputs.components != '[]' }}
strategy:
fail-fast: false
matrix:
components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }}
defaults:
run:
shell: bash
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ github.run_id }}"
OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
THEROCK_BIN_DIR: "./build/bin"
steps:
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }}
PLATFORM: ${{ inputs.platform }}
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Test
timeout-minutes: ${{ matrix.components.timeout_minutes }}
run: |
if [ "${{ inputs.PLATFORM }}" == "linux" ]; then source ${VENV_DIR}/bin/activate ; else . ${VENV_DIR}/Scripts/activate ; fi
${{ matrix.components.test_script }}
uses: './.github/workflows/therock-test-component.yml'
with:
artifact_run_id: ${{ github.run_id }}
amdgpu_families: ${{ inputs.amdgpu_families }}
test_runs_on: ${{ inputs.test_runs_on }}
platform: ${{ inputs.platform }}
component: ${{ toJSON(matrix.components) }}

View File

@@ -6,6 +6,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
### Added
* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
* Added the new api to load different memory sizes to SGPR.
* Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
* Added a basic copy kernel example and supporting documentation for new CK Tile developers.
* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
@@ -31,7 +32,8 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
* Added benchmarking support for tile engine GEMM Multi D.
* Added block scaling support in CK_TILE GEMM, allowing flexible use of quantization matrices from either A or B operands.
* Added the row-wise column-wise quantization for CK_TILE GEMM & CK_TILE Grouped GEMM.
* Added tensor-wise quantization for CK_TILE GEMM
* Added support for f32 to FMHA (fwd/bwd).
* Added tensor-wise quantization for CK_TILE GEMM.
### Optimized

View File

@@ -220,6 +220,9 @@ rocm_check_target_ids(SUPPORTED_GPU_TARGETS
message(STATUS "Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
# Cache SUPPORTED_GPU_TARGETS for debug
set(SUPPORTED_GPU_TARGETS "${SUPPORTED_GPU_TARGETS}" CACHE STRING "List of supported GPU targets")
if (SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx11|gfx12")
message(STATUS "Enabling XDL instances")
add_definitions(-DCK_USE_XDL)
@@ -339,6 +342,7 @@ endif()
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
option(ENABLE_ASM_DUMP "Whether to enable assembly dump for kernels." OFF)
option(ENABLE_JSON_DUMP "Whether to enable json dump for examples." OFF)
if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -352,6 +356,11 @@ if(ENABLE_ASM_DUMP)
message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}")
endif()
if (ENABLE_JSON_DUMP)
add_compile_definitions(CK_ENABLE_JSON_DUMP)
message("CK compiled with ENABLE_JSON_DUMP set to ${ENABLE_JSON_DUMP}")
endif()
## Threads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

View File

@@ -1,27 +1,23 @@
FROM ubuntu:24.04
ARG DEBIAN_FRONTEND=noninteractive
ARG ROCMVERSION=6.4.1
ARG ROCMVERSION=7.0.1
ARG compiler_version=""
ARG compiler_commit=""
ARG CK_SCCACHE=""
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
ENV DEBIAN_FRONTEND=noninteractive
# Add rocm repository
RUN set -xe && \
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
RUN if [ "$ROCMVERSION" != "6.5" ]; then \
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
amdgpu-install -y --usecase=rocm --no-dkms
RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
apt update && \
apt install python3-setuptools python3-wheel -y && \
apt install rocm-dev -y
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
@@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libelf-dev \
libnuma-dev \
libpthread-stubs0-dev \
llvm-amdgpu \
mpich \
net-tools \
pkg-config \
@@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
zip \
libzstd-dev \
openssh-server \
clang-format-12 \
clang-format-18 \
kmod && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
rm -rf amdgpu-install* && \
# Remove unnecessary rocm components that take a lot of space
apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
#Install latest ccache
RUN git clone https://github.com/ccache/ccache.git && \
git clone https://github.com/ccache/ccache.git && \
cd ccache && mkdir build && cd build && cmake .. && make install && \
#Install ninja build tracing tools
cd / && \

View File

@@ -1,4 +1,4 @@
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
FROM $BASE_DOCKER
ARG compiler_version=""
ARG compiler_commit=""

72
Jenkinsfile vendored
View File

@@ -53,7 +53,7 @@ def getBaseDockerImageName(){
}
else{
def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
}
else{
@@ -476,7 +476,7 @@ def buildHipClangJob(Map conf=[:]){
def retimage
(retimage, image) = getDockerImage(conf)
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 20, unit: 'HOURS')
{
@@ -538,7 +538,7 @@ def Build_CK(Map conf=[:]){
def image
def retimage
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
try {
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
@@ -728,7 +728,7 @@ def process_results(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
try
{
echo "Pulling image: ${image}"
@@ -836,7 +836,7 @@ def run_aiter_tests(Map conf=[:]){
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
try
{
echo "Pulling image: ${image}"
@@ -852,13 +852,14 @@ def run_aiter_tests(Map conf=[:]){
}
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 2, unit: 'HOURS'){
timeout(time: 5, unit: 'HOURS'){
try{
sh "rocminfo"
sh "python3 --version"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py"
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py"
@@ -894,7 +895,7 @@ def run_pytorch_tests(Map conf=[:]){
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
try
{
echo "Pulling image: ${image}"
@@ -930,7 +931,8 @@ def run_pytorch_tests(Map conf=[:]){
}
//launch develop branch daily jobs
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true
0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -957,8 +959,8 @@ pipeline {
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string(
name: 'ROCMVERSION',
defaultValue: '6.4.1',
description: 'Specify which ROCM version to use: 6.4.1 (default).')
defaultValue: '7.0.1',
description: 'Specify which ROCM version to use: 7.0.1 (default).')
string(
name: 'COMPILER_VERSION',
defaultValue: '',
@@ -1037,8 +1039,8 @@ pipeline {
description: "Build CK and run tests on gfx942 (default: ON)")
booleanParam(
name: "BUILD_GFX950",
defaultValue: false,
description: "Build CK and run tests on gfx950 (default: OFF)")
defaultValue: true,
description: "Build CK and run tests on gfx950 (default: ON)")
booleanParam(
name: "BUILD_GFX10",
defaultValue: true,
@@ -1125,16 +1127,16 @@ pipeline {
agent{ label rocmnode("nogpu") }
environment{
setup_args = "NO_CK_BUILD"
execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-o -not -path \'*.git*\' -iname \'*.hpp\' \
-o -not -path \'*.git*\' -iname \'*.cpp\' \
-o -iname \'*.h.in\' \
-o -iname \'*.hpp.in\' \
-o -iname \'*.cpp.in\' \
-o -iname \'*.cl\' \
execute_cmd = "(cd .. && git ls-files \'*.h\' \
\'*.hpp\' \
\'*.cpp\' \
\'*.h.in\' \
\'*.hpp.in\' \
\'*.cpp.in\' \
\'*.cl\' \
| grep -v 'build/' \
| grep -v 'include/rapidjson' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\' && \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\') && \
/cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
-D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
-D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
@@ -1155,16 +1157,17 @@ pipeline {
agent{ label rocmnode("nogpu") }
environment{
setup_args = "NO_CK_BUILD"
execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-o -not -path \'*.git*\' -iname \'*.hpp\' \
-o -not -path \'*.git*\' -iname \'*.cpp\' \
-o -iname \'*.h.in\' \
-o -iname \'*.hpp.in\' \
-o -iname \'*.cpp.in\' \
-o -iname \'*.cl\' \
execute_cmd = "(cd .. && git ls-files \
\'*.h\' \
\'*.hpp\' \
\'*.cpp\' \
\'*.h.in\' \
\'*.hpp.in\' \
\'*.cpp.in\' \
\'*.cl\' \
| grep -v 'build/' \
| grep -v 'include/rapidjson' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\'"
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-18 -style=file {} | diff - {}\')"
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
@@ -1290,7 +1293,7 @@ pipeline {
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
make -j64 check"""
}
steps{
@@ -1350,7 +1353,6 @@ pipeline {
}
agent{ label rocmnode("gfx950") }
environment{
def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0"
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \
make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
@@ -1358,7 +1360,7 @@ pipeline {
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, docker_name: docker_name, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
@@ -1566,7 +1568,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
}
}
@@ -1631,7 +1633,7 @@ pipeline {
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
}
cleanWs()
}
@@ -1657,13 +1659,13 @@ pipeline {
cleanWs()
}
}
stage("Build CK and run Tests on gfx1101")
stage("Build CK and run Tests on gfx11")
{
when {
beforeAgent true
expression { params.BUILD_GFX11.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
}
agent{ label rocmnode("gfx1101") }
agent{ label 'miopen && (gfx1101 || gfx1100)' }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DUSE_OPT_GFX11=ON -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \

View File

@@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)
find_package(hiprtc REQUIRED)
rocm_setup_version(VERSION 1.0)
@@ -27,7 +28,7 @@ add_compile_options(-std=c++20)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc)
set_target_properties(ck_host PROPERTIES
LINKER_LANGUAGE CXX

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -37,7 +37,7 @@ using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffl
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 2, S<1, 16, 1, 16>, 8, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 2, S<1, 16, 1, 16>, 4, ck::LoopScheduler::Interwave, ck::PipelineVersion::v1>;
// clang-format on
using DeviceGemmInstance = DeviceGemmInstance1;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -33,13 +33,13 @@ using DeviceGemmInstance =
2, 256,
256, 256,
32, 8, 4,
32, 32,
4, 4,
16, 16,
8, 8,
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 4, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::LoopScheduler::Default, ck::PipelineVersion::v1>;
// clang-format on

View File

@@ -36,7 +36,7 @@ using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
#else
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 128, 4, 4, 16, 16, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
using ADataType = float;
using BDataType = float;
using CDataType = float;
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
return 0;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -29,7 +29,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletM
// ######| | | | Type| Type| Type| DataType| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, F16, CDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1,8>, 8>;
< ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, F16, CDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1,8>, 4>;
// clang-format on
using DeviceGemmInstance = DeviceGemmInstance;

View File

@@ -2,7 +2,6 @@
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/utility/validation_common.hpp"
// use macro to minimize code change
#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
@@ -29,11 +28,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
}
};
@@ -59,17 +58,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
try
{
ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
M, N, K, StrideA, StrideB, StrideC);
}
catch(const std::runtime_error& e)
{
std::cerr << "Error: " << e.what() << std::endl;
return false;
}
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));

View File

@@ -174,6 +174,9 @@ int main(int argc, char* argv[])
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
? d_m_n.mDesc.GetStrides()[0]
: d_m_n.mDesc.GetStrides()[1];
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
@@ -221,7 +224,7 @@ int main(int argc, char* argv[])
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{0},
std::array<ck::index_t, 1>{static_cast<int>(StrideD)},
StrideE,
a_element_op,
b_element_op,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -32,7 +32,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 2>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -7,7 +7,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
#endif
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
ProblemSize ps =
problem_size; // make mutable copy because default stride values of 0 need to be updated
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -41,6 +43,30 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
// If any user-provided leading stride <= 0, replace it with the one determined by the
// created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
{
return static_cast<int>(tensor.GetStrides()[0]);
}
else
{
return static_cast<int>(tensor.GetStrides()[1]);
}
};
if(StrideA <= 0)
StrideA = fetch_leading_stride(a_m_k, ALayout{});
if(StrideB <= 0)
StrideB = fetch_leading_stride(b_k_n, BLayout{});
if(StrideD0 <= 0)
StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
if(StrideD1 <= 0)
StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
if(StrideE <= 0)
StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
switch(config.init_method)
{
case 0: break;

View File

@@ -125,7 +125,7 @@ inline bool parse_cmd_args(int argc,
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
problem_size = ck::utils::conv::parse_conv_param(
num_dim_spatial, threshold_to_catch_partial_args, argv);
num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
}
else
{

View File

@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// clang-format off
template <ck::index_t NDimSpatial>

View File

@@ -78,12 +78,12 @@ bool pool_test(bool do_verification,
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
{
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}, layout);
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value)
{
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}, layout);
}
};

View File

@@ -115,12 +115,14 @@ int main()
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1_uz}));
std::vector<std::size_t>({stride, 1_uz}),
layout);
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1_uz, stride}));
std::vector<std::size_t>({1_uz, stride}),
layout);
}
};

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_grouped_gemm_example.inc"

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 2>;
// clang-format on
#include "run_grouped_gemm_example.inc"

View File

@@ -137,11 +137,13 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
}
};

View File

@@ -65,7 +65,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayern
//######| | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _M_N| _M_N| _M_N| _M|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType, AElementOp, BElementOp, CDEElementOp, HElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 8, S<8, 32>, 8>;
< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType, AElementOp, BElementOp, CDEElementOp, HElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 4, S<8, 32>, 4>;
// clang-format on
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
@@ -154,8 +154,8 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
int main()
{
// temp disable on gfx11 & gfx12
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
// temp disable on gfx11
if(ck::is_gfx11_supported())
{
return 0;
}

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
@@ -48,10 +48,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
32, // index_t KPerBlock
8, // index_t AK1
8, // index_t BK1
32, // index_t MPerXDL
32, // index_t NPerXDL
4, // index_t MXdlPerWave
2, // index_t NXdlPerWave
16, // index_t MPerXDL
16, // index_t NPerXDL
8, // index_t MXdlPerWave
4, // index_t NXdlPerWave
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder
@@ -69,7 +69,7 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
1, // index_t CShuffleMXdlPerWavePerShuffle
1, // index_t CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on
int main(int argc, char* argv[])

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
@@ -47,10 +47,10 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
32, // index_t KPerBlock
8, // index_t AK1
8, // index_t BK1
32, // index_t MPerXDL
32, // index_t NPerXDL
4, // index_t MXdlPerWave
2, // index_t NXdlPerWave
16, // index_t MPerXDL
16, // index_t NPerXDL
8, // index_t MXdlPerWave
4, // index_t NXdlPerWave
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder
@@ -68,7 +68,7 @@ using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_
1, // index_t CShuffleMXdlPerWavePerShuffle
1, // index_t CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on
int main(int argc, char* argv[])

View File

@@ -59,11 +59,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
}
};

View File

@@ -137,11 +137,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
}
};

View File

@@ -64,11 +64,13 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
}
};

View File

@@ -19,6 +19,9 @@
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
exit(0);
}
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -342,7 +345,8 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
NumDimN,

View File

@@ -17,6 +17,9 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
exit(0);
}
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -342,7 +345,8 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
NumDimM,

View File

@@ -15,6 +15,8 @@
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
int run_contraction_bilinear_example(int argc, char* argv[])
{
bool do_verification = true;
@@ -95,11 +97,11 @@ int run_contraction_bilinear_example(int argc, char* argv[])
exit(0);
}
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -189,7 +191,7 @@ int run_contraction_bilinear_example(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,

View File

@@ -15,6 +15,8 @@
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
int run_contraction_scale_example(int argc, char* argv[])
{
bool do_verification = true;
@@ -85,10 +87,10 @@ int run_contraction_scale_example(int argc, char* argv[])
exit(0);
}
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
@@ -173,7 +175,7 @@ int run_contraction_scale_example(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -18,6 +18,9 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -304,10 +307,10 @@ int main(int argc, char* argv[])
const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Bypass{});
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
ck::index_t M_ =
ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
@@ -416,9 +419,9 @@ int main(int argc, char* argv[])
const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());

View File

@@ -17,6 +17,9 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -300,11 +303,11 @@ int main(int argc, char* argv[])
std::vector<ck::index_t> e_gs_ms_ns_strides{
G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1};
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
@@ -396,7 +399,8 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result(
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
NumDimM,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -17,6 +17,9 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -54,7 +57,7 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
//############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>;
DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 4>;
// clang-format on
using DeviceOpInstance = DeviceOpInstanceKKNN;
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
exit(0);
}
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -345,7 +348,8 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result(
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
NumDimM,

View File

@@ -160,7 +160,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
},
ck::tensor_layout::convolution::GNCW{});
case 2:
return HostTensorDescriptor(
@@ -176,7 +177,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
},
ck::tensor_layout::convolution::GNCHW{});
case 3:
return HostTensorDescriptor(
@@ -195,7 +197,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
},
ck::tensor_layout::convolution::GNCDHW{});
}
throw std::runtime_error("unsuppored # dim spatial");
@@ -213,7 +216,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
},
ck::tensor_layout::convolution::GKCX{});
case 2:
return HostTensorDescriptor(
{conv_param.G_,
@@ -229,7 +233,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
},
ck::tensor_layout::convolution::GKCYX{});
case 3:
return HostTensorDescriptor(
{conv_param.G_,
@@ -249,7 +254,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
},
ck::tensor_layout::convolution::GKCZYX{});
}
throw std::runtime_error("unsuppored # dim spatial");
@@ -267,7 +273,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
0, // k
1, // c
0 // x
});
},
ck::tensor_layout::convolution::GNKW{});
case 2:
return HostTensorDescriptor({conv_param.G_,
conv_param.N_,
@@ -280,7 +287,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
1, // k
0, // ho
0 // wo
});
},
ck::tensor_layout::convolution::GNKHW{});
case 3:
return HostTensorDescriptor({conv_param.G_,
conv_param.N_,
@@ -295,7 +303,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
0, // z
0, // y
0 // x
});
},
ck::tensor_layout::convolution::GNKDHW{});
}
throw std::runtime_error("unsuppored # dim spatial");
@@ -314,7 +323,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
},
ck::tensor_layout::convolution::GNKW{});
case 2:
return HostTensorDescriptor(
{conv_param.G_,
@@ -329,7 +339,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
},
ck::tensor_layout::convolution::GNKHW{});
case 3:
return HostTensorDescriptor(
@@ -348,7 +359,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
},
ck::tensor_layout::convolution::GNKDHW{});
}
throw std::runtime_error("unsuppored # dim spatial");

View File

@@ -160,7 +160,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
},
ck::tensor_layout::convolution::GNCW{});
case 2:
return HostTensorDescriptor(
@@ -176,7 +177,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
},
ck::tensor_layout::convolution::GNCHW{});
case 3:
return HostTensorDescriptor(
@@ -195,7 +197,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
},
ck::tensor_layout::convolution::GNCDHW{});
}
throw std::runtime_error("unsuppored # dim spatial");
@@ -213,7 +216,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
},
ck::tensor_layout::convolution::GKCX{});
case 2:
return HostTensorDescriptor(
{conv_param.G_,
@@ -229,7 +233,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
},
ck::tensor_layout::convolution::GKCYX{});
case 3:
return HostTensorDescriptor(
{conv_param.G_,
@@ -249,7 +254,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
},
ck::tensor_layout::convolution::GKCZYX{});
}
throw std::runtime_error("unsuppored # dim spatial");
@@ -267,7 +273,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
0, // k
1, // c
0 // x
});
},
ck::tensor_layout::convolution::GNKW{});
case 2:
return HostTensorDescriptor({conv_param.G_,
conv_param.N_,
@@ -280,7 +287,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
1, // k
0, // ho
0 // wo
});
},
ck::tensor_layout::convolution::GNKHW{});
case 3:
return HostTensorDescriptor({conv_param.G_,
conv_param.N_,
@@ -295,7 +303,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
0, // z
0, // y
0 // x
});
},
ck::tensor_layout::convolution::GNKDHW{});
}
throw std::runtime_error("unsuppored # dim spatial");
@@ -314,7 +323,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
},
ck::tensor_layout::convolution::GNKW{});
case 2:
return HostTensorDescriptor(
{conv_param.G_,
@@ -329,7 +339,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
},
ck::tensor_layout::convolution::GNKHW{});
case 3:
return HostTensorDescriptor(
@@ -348,7 +359,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
},
ck::tensor_layout::convolution::GNKDHW{});
}
throw std::runtime_error("unsuppored # dim spatial");

View File

@@ -261,6 +261,10 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
B1ElementOp,
CElementOp>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
#include "run_batched_gemm_gemm_wmma_cshuffle_v3.inc"
int main(int argc, char* argv[])

View File

@@ -110,11 +110,13 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
}
};

View File

@@ -62,17 +62,19 @@ int run(int argc, char* argv[])
std::vector<ck::index_t> b1_g_o_n_lengths{G, O, N};
#ifdef CK_MHA_USE_RCCR_LAYOUT
std::vector<ck::index_t> b1_g_o_n_strides{N * O, N, 1}; // B1 layout [G, O, N]
auto b1_layout = Row{};
#else
std::vector<ck::index_t> b1_g_o_n_strides{N * O, 1, O}; // B1 layout [G, N, O]
auto b1_layout = Col{};
#endif
std::vector<ck::index_t> c_g_m_o_lengths{G, M, O};
std::vector<ck::index_t> c_g_m_o_strides{M * O, O, 1}; // C layout [G, M, O]
Tensor<ADataType> a_g_m_k(a_g_m_k_lengths, a_g_m_k_strides);
Tensor<B0DataType> b0_g_n_k(b0_g_n_k_lengths, b0_g_n_k_strides);
Tensor<B1DataType> b1_g_o_n(b1_g_o_n_lengths, b1_g_o_n_strides);
Tensor<CDataType> c_g_m_o_host_result(c_g_m_o_lengths, c_g_m_o_strides);
Tensor<CDataType> c_g_m_o_device_result(c_g_m_o_lengths, c_g_m_o_strides);
Tensor<ADataType> a_g_m_k(a_g_m_k_lengths, a_g_m_k_strides, Row{});
Tensor<B0DataType> b0_g_n_k(b0_g_n_k_lengths, b0_g_n_k_strides, Row{});
Tensor<B1DataType> b1_g_o_n(b1_g_o_n_lengths, b1_g_o_n_strides, b1_layout);
Tensor<CDataType> c_g_m_o_host_result(c_g_m_o_lengths, c_g_m_o_strides, Row{});
Tensor<CDataType> c_g_m_o_device_result(c_g_m_o_lengths, c_g_m_o_strides, Row{});
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b0_g_n_k: " << b0_g_n_k.mDesc << std::endl;

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
/*
Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
@@ -100,11 +100,11 @@ using DeviceGemmInstance =
8, // AK1
8, // BK1
2, // B1K1
32, // MPerXDL
32, // NPerXDL
1, // MXdlPerWave
4, // NXdlPerWave
2, // Gemm1NXdlPerWave
16, // MPerXDL
16, // NPerXDL
2, // MXdlPerWave
8, // NXdlPerWave
4, // Gemm1NXdlPerWave
S<4, 64, 1>, // ABlockTransfer
S<1, 0, 2>,
S<1, 0, 2>,
@@ -129,7 +129,7 @@ using DeviceGemmInstance =
1, // CShuffleMXdlPerWavePerShuffle
2, // CShuffleNXdlPerWavePerShuffle
S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8, // CShuffleBlockTransferScalarPerVector_NPerBlock
4, // CShuffleBlockTransferScalarPerVector_NPerBlock
MaskingSpec>; // MaskingSpecialization
// Ref Gemm0: fp16 in, fp32 out

View File

@@ -111,12 +111,14 @@ int run(int argc, char* argv[])
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, stride, 1}));
std::vector<std::size_t>({batch_stride, stride, 1}),
layout);
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
std::vector<std::size_t>({batch_stride, 1, stride}),
layout);
}
};

View File

@@ -1,6 +1,8 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -88,11 +90,11 @@ int run(int argc, char* argv[])
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Bypass{});
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Bypass{});
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Bypass{});
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;

View File

@@ -1,6 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -88,11 +92,30 @@ int run(int argc, char* argv[])
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
std::vector<ck::index_t> strides,
bool permute,
auto layout) {
if(permute)
{
return HostTensorDescriptor(lens, strides, Bypass{});
}
else
{
return HostTensorDescriptor(lens, strides, layout);
}
};
Tensor<ADataType> a_gs_ms_ks(
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
Tensor<B0DataType> b0_gs_ns_ks(
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
Tensor<B1DataType> b1_gs_os_ns(
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
Tensor<CDataType> c_gs_ms_os_host_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
Tensor<CDataType> c_gs_ms_os_device_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;

View File

@@ -1,6 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -113,11 +117,30 @@ int run(int argc, char* argv[])
head_dim,
1}; // C layout [batch_size, head_num, q_sequence_length, head_dim]
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
std::vector<ck::index_t> strides,
bool permute,
auto layout) {
if(permute)
{
return HostTensorDescriptor(lens, strides, Bypass{});
}
else
{
return HostTensorDescriptor(lens, strides, layout);
}
};
Tensor<ADataType> a_gs_ms_ks(
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
Tensor<B0DataType> b0_gs_ns_ks(
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
Tensor<B1DataType> b1_gs_os_ns(
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
Tensor<CDataType> c_gs_ms_os_host_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
Tensor<CDataType> c_gs_ms_os_device_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
@@ -191,7 +214,7 @@ int run(int argc, char* argv[])
head_num * 2 * head_dim,
head_dim,
1}; // kv layout [batch_size, q_sequence_length, head_num, 2, head_dim]
Tensor<ADataType> kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides);
Tensor<ADataType> kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides, Bypass{});
// merge kv into a packed pointer send to device
b0_gs_ns_ks.ForEach(
[&](auto& self, auto idx) { kv_gs_ns_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); });

View File

@@ -1,6 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -63,6 +67,19 @@ int run(int argc, char* argv[])
std::size_t flop = 0, num_byte = 0;
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
std::vector<ck::index_t> strides,
bool permute,
auto layout) {
if(permute)
{
return HostTensorDescriptor(lens, strides, Bypass{});
}
else
{
return HostTensorDescriptor(lens, strides, layout);
}
};
std::cout << "group count " << group_count << ". printing first 4 groups\n";
for(std::size_t i = 0; i < group_count; i++)
{
@@ -113,10 +130,14 @@ int run(int argc, char* argv[])
{}}); // acc1_biases_gs_ms_os_strides
// C_m_o = A_m_k * B0_k_n * B1_n_o
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<ADataType> a_gs_ms_ks(
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
Tensor<B0DataType> b0_gs_ns_ks(f_host_tensor_descriptor(
b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
Tensor<B1DataType> b1_gs_os_ns(f_host_tensor_descriptor(
b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
Tensor<CDataType> c_gs_ms_os_device_result(f_host_tensor_descriptor(
c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
int Batch = G0 * G1;
flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
@@ -252,7 +273,8 @@ int run(int argc, char* argv[])
Tensor<AccDataType> acc0_g_m_n({G0 * G1, M, N}); // scratch object after gemm0
Tensor<ADataType> a1_g_m_n({G0 * G1, M, N}); // scratch object after softmax
Tensor<CDataType> c_g_m_o_host_result({G0 * G1, M, O}); // scratch object after gemm1
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_host_result(f_host_tensor_descriptor(
c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
// permute
a_gs_ms_ks.ForEach([&](auto& self, auto idx) {

View File

@@ -1,6 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -91,11 +95,30 @@ int run(int argc, char* argv[])
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
std::vector<ck::index_t> strides,
bool permute,
auto layout) {
if(permute)
{
return HostTensorDescriptor(lens, strides, Bypass{});
}
else
{
return HostTensorDescriptor(lens, strides, layout);
}
};
Tensor<ADataType> a_gs_ms_ks(
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
Tensor<B0DataType> b0_gs_ns_ks(
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
Tensor<B1DataType> b1_gs_os_ns(
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
Tensor<CDataType> c_gs_ms_os_host_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
Tensor<CDataType> c_gs_ms_os_device_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;

View File

@@ -1,6 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -91,11 +95,30 @@ int run(int argc, char* argv[])
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
std::vector<ck::index_t> strides,
bool permute,
auto layout) {
if(permute)
{
return HostTensorDescriptor(lens, strides, Bypass{});
}
else
{
return HostTensorDescriptor(lens, strides, layout);
}
};
Tensor<ADataType> a_gs_ms_ks(
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
Tensor<B0DataType> b0_gs_ns_ks(
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
Tensor<B1DataType> b1_gs_os_ns(
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
Tensor<CDataType> c_gs_ms_os_host_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
Tensor<CDataType> c_gs_ms_os_device_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;

View File

@@ -1,6 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
int run(int argc, char* argv[])
{
bool do_verification = true;
@@ -108,11 +112,30 @@ int run(int argc, char* argv[])
head_dim,
1}; // C layout [batch_size, head_num, sequence_length, head_dim]
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
std::vector<ck::index_t> strides,
bool permute,
auto layout) {
if(permute)
{
return HostTensorDescriptor(lens, strides, Bypass{});
}
else
{
return HostTensorDescriptor(lens, strides, layout);
}
};
Tensor<ADataType> a_gs_ms_ks(
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
Tensor<B0DataType> b0_gs_ns_ks(
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
Tensor<B1DataType> b1_gs_os_ns(
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
Tensor<CDataType> c_gs_ms_os_host_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
Tensor<CDataType> c_gs_ms_os_device_result(
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
@@ -186,7 +209,7 @@ int run(int argc, char* argv[])
head_num * 3 * head_dim,
head_dim,
1}; // qkv layout [batch_size, sequence_length, head_num, 3, head_dim]
Tensor<ADataType> qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides);
Tensor<ADataType> qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides, Bypass{});
// merge qkv into a packed pointer send to device
a_gs_ms_ks.ForEach(
[&](auto& self, auto idx) { qkv_gs_ms_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); });

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
256,
128, 128, 64,
8, 4,
32, 32,
2, 2,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 4, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
256,
128, 128, 64,
8, 4,
32, 32,
2, 2,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 4, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ReduceDataType>;
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
256,
128, 128, 64,
8, 4,
32, 32,
2, 2,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 4, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3, ReduceDataType>;
// clang-format on

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -35,13 +35,13 @@ using DeviceGemmV2Instance =
256,
128, 128, 64,
8, 4,
32, 32,
2, 2,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 4, 0,
1, 1, S<1, 32, 1, 8>, 8,
1, 1, S<1, 32, 1, 8>, 4,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v2, ReduceDataType>;
// clang-format on

View File

@@ -321,11 +321,13 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
return HostTensorDescriptor(
{batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
}
else
{
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
return HostTensorDescriptor(
{batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
}
};

View File

@@ -206,7 +206,8 @@ int run_grouped_conv_bwd_data_bias_relu_example(int argc, char* argv[])
1, // c
0, // hi
0 // wi
});
},
ctc::GNCHW{});
// input image: GNHWC
const auto in_g_n_c_wis_desc =

View File

@@ -214,7 +214,8 @@ int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_
1, // k
0, // ho
0 // wo
});
},
BiasLayout{});
const auto requant_scale_g_k_desc = bias_g_k_desc;

View File

@@ -201,7 +201,8 @@ int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_el
1, // k
0, // ho
0 // wo
});
},
BiasLayout{});
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

View File

@@ -203,7 +203,8 @@ int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_eleme
1, // k
0, // ho
0 // wo
});
},
RequantScaleLayout{});
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

View File

@@ -22,6 +22,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
@@ -250,19 +253,24 @@ int main(int argc, char* argv[])
Tensor<ADataType> a_gs_ms_ks(
std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()),
Row{});
Tensor<BDataType> b_gs_ns_ks(
std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()),
Row{});
Tensor<DDataType> d_gs_ms_ns(
std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()),
Bypass{});
Tensor<EDataType> e_gs_ms_ns_host_result(
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
Bypass{});
Tensor<EDataType> e_gs_ms_ns_device_result(
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -372,7 +380,8 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
Bypass{});
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
NumDimM,

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -22,6 +22,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
@@ -53,7 +56,7 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
//############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceSplitKContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, 1, 1, 1, S<1, 32, 1, 4>, 4>;
DeviceSplitKContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 4, 4, 16, 16, 8, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, 1, 1, 1, S<1, 32, 1, 4>, 2>;
// clang-format on
using DeviceOpInstance = DeviceOpInstanceKKNN;
@@ -250,19 +253,24 @@ int main(int argc, char* argv[])
Tensor<ADataType> a_gs_ms_ks(
std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()),
Row{});
Tensor<BDataType> b_gs_ns_ks(
std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()),
Row{});
Tensor<DDataType> d_gs_ms_ns(
std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()),
Bypass{});
Tensor<EDataType> e_gs_ms_ns_host_result(
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
Bypass{});
Tensor<EDataType> e_gs_ms_ns_device_result(
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
Bypass{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
@@ -372,7 +380,8 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
Bypass{});
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
NumDimM,

View File

@@ -22,6 +22,8 @@ using F32 = float;
using ADataType = F16;
using BDataType = F16;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using UnaryScale = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
@@ -73,11 +75,11 @@ int main(int argc, char* argv[])
1};
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides),
Tensor<ADataType>(ab_lengths, ab_strides)};
std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{})};
Tensor<ADataType>& a0 = as[0];
Tensor<ADataType>& a1 = as[1];
Tensor<BDataType> b(ab_lengths, ab_strides);
Tensor<BDataType> b(ab_lengths, ab_strides, NchwLayout{});
float alpha = 3.f;
float beta = 2.f;
a0.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -134,7 +136,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, ab_strides);
Tensor<BDataType> host_b(ab_lengths, ab_strides, NchwLayout{});
using ReferenceElementwiseInstance = ck::tensor_operation::host::
ReferenceElementwise<2, ADataType, BDataType, BinaryAddUnaryScaleSquare>;

View File

@@ -22,6 +22,8 @@ using F32 = float;
using ADataType = F16;
using BDataType = F16;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
ck::Tuple<ADataType>, // InDataTypeTuple
@@ -72,9 +74,9 @@ int main(int argc, char* argv[])
static_cast<int>(nhwc[3])};
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
using ReferenceElementwiseInstance =
ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
auto ref_elementwise = ReferenceElementwiseInstance{};

View File

@@ -23,6 +23,8 @@ using F32 = float;
using ADataType = F16;
using BDataType = F16;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using UnaryScale = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
@@ -76,9 +78,9 @@ int main(int argc, char* argv[])
static_cast<int>(nhwc[0] * nhwc[1])};
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
float scale = 1.f;
auto i = 0;
std::mt19937 gen(11939);
@@ -137,7 +139,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
using ReferenceElementwiseInstance = ck::tensor_operation::host::
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
auto ref_elementwise = ReferenceElementwiseInstance{};

View File

@@ -22,6 +22,9 @@ using F32 = float;
using ADataType = F16;
using BDataType = F16;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using UnaryScale = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
@@ -76,9 +79,9 @@ int main(int argc, char* argv[])
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
float scale = 2.f;
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -128,7 +131,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
using ReferenceElementwiseInstance = ck::tensor_operation::host::
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
auto ref_elementwise = ReferenceElementwiseInstance{};

View File

@@ -22,6 +22,8 @@ using F32 = float;
using ADataType = F32;
using BDataType = F32;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using UnaryScale = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
@@ -76,9 +78,9 @@ int main(int argc, char* argv[])
static_cast<int>(nhwc[0] * nhwc[1])};
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
float scale = 1.f;
auto i = 0;
@@ -139,7 +141,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
using ReferenceElementwiseInstance = ck::tensor_operation::host::
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
auto ref_elementwise = ReferenceElementwiseInstance{};

View File

@@ -22,6 +22,9 @@ using F32 = float;
using ADataType = F32;
using BDataType = F32;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using UnaryScale = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
@@ -76,9 +79,9 @@ int main(int argc, char* argv[])
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
Tensor<ADataType>& a = as[0];
Tensor<BDataType> b(ab_lengths, b_strides);
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
float scale = 2.f;
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -127,7 +130,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, b_strides);
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
using ReferenceElementwiseInstance = ck::tensor_operation::host::
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
auto ref_elementwise = ReferenceElementwiseInstance{};

View File

@@ -22,6 +22,9 @@ using F32 = float;
using ADataType = F16;
using BDataType = F16;
using NchwLayout = ck::tensor_layout::convolution::NCHW;
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
using UnaryScale = ck::tensor_operation::element_wise::Scale;
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
using UnaryScaleSquare =
@@ -78,13 +81,13 @@ int main(int argc, char* argv[])
ck::ranges::copy(nchw, ab_lengths.begin());
std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides),
Tensor<ADataType>(ab_lengths, ab_strides),
Tensor<ADataType>(ab_lengths, ab_strides)};
std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{})};
Tensor<ADataType>& a0 = as[0];
Tensor<ADataType>& a1 = as[1];
Tensor<ADataType>& a2 = as[2];
Tensor<BDataType> b(ab_lengths, ab_strides);
Tensor<BDataType> b(ab_lengths, ab_strides, NchwLayout{});
float alpha = 3.f;
float beta = 2.f;
float gamma = 4.f;
@@ -149,7 +152,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<BDataType> host_b(ab_lengths, ab_strides);
Tensor<BDataType> host_b(ab_lengths, ab_strides, NchwLayout{});
using ReferenceElementwiseInstance = ck::tensor_operation::host::
ReferenceElementwise<3, ADataType, BDataType, TrinaryAddUnaryScaleSquare>;
auto ref_elementwise = ReferenceElementwiseInstance{};

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
@@ -31,7 +31,7 @@ using DeviceOpInstance = ck::tensor_operation::device::
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, DsLayout, Row, F16, F16, F32, F16, DsDataType, F16, PassThrough, PassThrough, CDEElementOp, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>;
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, DsLayout, Row, F16, F16, F32, F16, DsDataType, F16, PassThrough, PassThrough, CDEElementOp, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 4>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

View File

@@ -1,22 +1,30 @@
#pragma once
#include <type_traits>
bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
ProblemSize ps =
problem_size; // make mutable copy because default stride values of 0 need to be updated
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
@@ -123,7 +131,16 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
if(std::is_same_v<ck::ranges::range_value_t<decltype(e_m_n_device_result)>, ck::half_t> &&
std::is_same_v<ck::ranges::range_value_t<decltype(e_m_n_host_result)>, ck::half_t>)
{
return ck::utils::check_err(
e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 5e-3, 1e-3);
}
else
{
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
return true;

View File

@@ -18,6 +18,10 @@
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -220,12 +224,12 @@ int main(int argc, char* argv[])
std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides);
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Row{});
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Col{});
Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides, Row{});
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Row{});
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Row{});
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;

View File

@@ -48,15 +48,16 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
return HostTensorDescriptor(
{N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz}, layout);
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
return HostTensorDescriptor(
{N_, C_, D, H, W}, {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_}, layout);
}
throw std::runtime_error("Pool3d_fwd: problem with layout. ");
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, layout);
};
template <typename DevicePoolFwdInstance,

View File

@@ -77,7 +77,9 @@ bool maxpool_bwd_test(bool do_verification,
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
using namespace ck::literals;
// reference need Tensor with NCHW order
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
return HostTensorDescriptor({N_, C_, H, W},
{C_ * H * W, 1_uz, W * C_, C_},
ck::tensor_layout::convolution::NCHW{});
};
// in

View File

@@ -42,15 +42,16 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
return HostTensorDescriptor(
{N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz}, layout);
}
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
{
return HostTensorDescriptor({N_, C_, D, H, W},
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
return HostTensorDescriptor(
{N_, C_, D, H, W}, {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_}, layout);
}
throw std::runtime_error("Avgpool3d_bwd: problem with layout. ");
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, layout);
};
template <typename DevicePoolBwdInstance,

View File

@@ -81,10 +81,11 @@ int main(int argc, char* argv[])
ck::index_t N = 768;
ck::index_t K = 6144;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideB1 = 0;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
if(argc == 1)
{
@@ -120,23 +121,31 @@ int main(int argc, char* argv[])
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
auto f_host_tensor_descriptor = [](std::size_t row,
std::size_t col,
ck::index_t& stride,
auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
N,
K,
std::array<ck::index_t, NumATensor>{StrideA},
std::array<ck::index_t, NumBTensor>{StrideB, 0},
std::array<ck::index_t, NumBTensor>{StrideB, StrideB1},
std::array<ck::index_t, NumDTensor>{StrideD},
StrideE,
a_element_op,

View File

@@ -81,10 +81,11 @@ int main(int argc, char* argv[])
ck::index_t N = 768;
ck::index_t K = 6144;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideB1 = 0;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
if(argc == 1)
{
@@ -120,23 +121,31 @@ int main(int argc, char* argv[])
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
auto f_host_tensor_descriptor = [](std::size_t row,
std::size_t col,
ck::index_t& stride,
auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
N,
K,
std::array<ck::index_t, NumATensor>{StrideA},
std::array<ck::index_t, NumBTensor>{StrideB, 0},
std::array<ck::index_t, NumBTensor>{StrideB, StrideB1},
std::array<ck::index_t, NumDTensor>{},
StrideE,
a_element_op,

View File

@@ -80,10 +80,11 @@ int main(int argc, char* argv[])
ck::index_t N = 768;
ck::index_t K = 6144;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
ck::index_t StrideA = K;
ck::index_t StrideB = N;
ck::index_t StrideB1 = 0;
ck::index_t StrideD = 0;
ck::index_t StrideE = N;
if(argc == 1)
{
@@ -119,23 +120,31 @@ int main(int argc, char* argv[])
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
auto f_host_tensor_descriptor = [](std::size_t row,
std::size_t col,
ck::index_t& stride,
auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
K,
std::array<ck::index_t, NumATensor>{StrideA},
std::array<ck::index_t, NumBTensor>{StrideB},
std::array<ck::index_t, NumDTensor>{0, StrideD},
std::array<ck::index_t, NumDTensor>{StrideB1, StrideD},
StrideE,
a_element_op,
b_element_op,
@@ -261,7 +270,7 @@ int main(int argc, char* argv[])
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(0, n), d_m_n(m, n));
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(m, n), d_m_n(m, n));
}
}

View File

@@ -19,6 +19,9 @@
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/numeric.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -160,12 +163,12 @@ int main(int argc, char* argv[])
exit(0);
}
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides, Bypass{});
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
@@ -264,9 +267,9 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
{
@@ -299,7 +302,6 @@ int main(int argc, char* argv[])
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
auto ref_argument =
ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, PassThrough{}, b_element_op);

View File

@@ -19,6 +19,9 @@
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/numeric.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -140,12 +143,12 @@ int main(int argc, char* argv[])
exit(0);
}
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
Tensor<B0DataType> b0_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides);
Tensor<B1DataType> b1_ns_ks(b1_ns_ks_lengths, b1_ns_ks_strides);
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides, Bypass{});
Tensor<B0DataType> b0_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides, Row{});
Tensor<B1DataType> b1_ns_ks(b1_ns_ks_lengths, b1_ns_ks_strides, Row{});
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
@@ -246,9 +249,9 @@ int main(int argc, char* argv[])
if(do_verification)
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
{
@@ -266,7 +269,7 @@ int main(int argc, char* argv[])
}
}
Tensor<B0DataType> b_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides);
Tensor<B0DataType> b_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides, Row{});
for(size_t n0 = 0; n0 < b_ns_ks.mDesc.GetLengths()[0]; ++n0)
{

View File

@@ -130,11 +130,12 @@ bool run_grouped_conv(bool do_verification,
// Fill other lenghts than G,K with 1 and strides with 0
bias_g_k_lengths.fill(1);
bias_g_k_strides.fill(0);
bias_g_k_lengths[0] = G;
bias_g_k_lengths[2] = K;
bias_g_k_strides[0] = K; // stride to G
bias_g_k_strides[2] = 1; // stride to K
const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
bias_g_k_lengths[0] = G;
bias_g_k_lengths[2] = K;
bias_g_k_strides[0] = K; // stride to G
bias_g_k_strides[2] = 1; // stride to K
const auto broadcasted_bias_desc =
HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides, BiasLayout{});
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
Tensor<InDataType> in(in_g_n_c_wis_desc);

View File

@@ -28,7 +28,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<QuantDataType> quant_b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
// assume scale tensor is [1, n]
Tensor<ScaleDataType> scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{}));
Tensor<ScaleDataType> scale_k_n(
HostTensorDescriptor({K, N}, {0, 1_uz}, ck::tensor_layout::BypassLayoutVerification()));
switch(config.init_method)
{

View File

@@ -241,6 +241,28 @@ int main(int argc, char* argv[])
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
// Update strides based on tensor properties if they are <= 0
auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
if(current_stride <= 0)
{
if constexpr(std::is_same_v<decltype(layout), Row>)
{
return tensor.GetStrides()[0];
}
else
{
return tensor.GetStrides()[1];
}
}
return current_stride;
};
StrideA = get_stride(a0_m_k, A0Layout{}, StrideA);
StrideB = get_stride(b0_k_n, B0Layout{}, StrideB);
ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
StrideE = get_stride(e_m_n_host_result, ELayout{}, StrideE);
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
@@ -285,8 +307,6 @@ int main(int argc, char* argv[])
constexpr ck::index_t NumDTensor = DsDataType::Size();
constexpr auto I0 = ck::Number<0>{};
// do GEMM
auto device_op = DeviceOpInstance{};
@@ -308,7 +328,7 @@ int main(int argc, char* argv[])
K,
StrideA,
StrideB,
std::array<ck::index_t, NumDTensor>{I0, I0},
std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
StrideE,
KBatch,
a_element_op,

View File

@@ -162,6 +162,28 @@ int main(int argc, char* argv[])
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
// Update strides based on tensor properties if they are <= 0
auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
if(current_stride <= 0)
{
if constexpr(std::is_same_v<decltype(layout), Row>)
{
return tensor.GetStrides()[0];
}
else
{
return tensor.GetStrides()[1];
}
}
return current_stride;
};
StrideA = get_stride(a0_m_k, A0Layout{}, StrideA);
StrideB = get_stride(b0_k_n, B0Layout{}, StrideB);
ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
StrideE = get_stride(e_m_n_host_result, ELayout{}, StrideE);
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
@@ -216,7 +238,7 @@ int main(int argc, char* argv[])
K,
StrideA,
StrideB,
std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
StrideE,
KBatch,
a_element_op,

View File

@@ -251,6 +251,28 @@ int main(int argc, char* argv[])
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
// Update strides based on tensor properties if they are <= 0
auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
if(current_stride <= 0)
{
if constexpr(std::is_same_v<decltype(layout), Row>)
{
return tensor.GetStrides()[0];
}
else
{
return tensor.GetStrides()[1];
}
}
return current_stride;
};
StrideA = get_stride(a0_m_k, A0Layout{}, StrideA);
StrideB = get_stride(b0_k_n, B0Layout{}, StrideB);
ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
StrideE = get_stride(e_m_n_host_result, ELayout{}, StrideE);
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
@@ -295,8 +317,6 @@ int main(int argc, char* argv[])
constexpr ck::index_t NumDTensor = DsDataType::Size();
constexpr auto I0 = ck::Number<0>{};
// do GEMM
auto device_op = DeviceOpInstance{};
@@ -318,7 +338,7 @@ int main(int argc, char* argv[])
K,
StrideA,
StrideB,
std::array<ck::index_t, NumDTensor>{I0, I0},
std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
StrideE,
KBatch,
a_element_op,

View File

@@ -287,15 +287,18 @@ int main(int argc, char* argv[])
}
}
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
Tensor<B0DataType> b0_e_n_k(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<B0DataType> b0_preshuffled(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
Tensor<D1DataType> d1_e_n(
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
Tensor<EDataType> e_t_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_n_device_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
@@ -422,7 +425,7 @@ int main(int argc, char* argv[])
e_device_buf.FromDevice(e_t_n_device_result.mData.data());
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
B0DataType,

View File

@@ -301,18 +301,22 @@ int main(int argc, char* argv[])
}
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
Tensor<A1DataType> a1_t_k(HostTensorDescriptor(
{tokens, (K + Scale_Block_K - 1) / Scale_Block_K}, {Scale_Stride_AM, 1}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
{tokens, (K + Scale_Block_K - 1) / Scale_Block_K}, {Scale_Stride_AM, 1}, Row{}));
Tensor<B0DataType> b0_e_n_k(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<B1DataType> b1_e_n_k(
HostTensorDescriptor({experts,
(K + Scale_Block_K - 1) / Scale_Block_K,
(N + Scale_Block_N - 1) / Scale_Block_N * 2},
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN},
Col{}));
Tensor<B0DataType> b0_preshuffled(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
Tensor<EDataType> e_t_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_n_device_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
e_t_n_device_result.SetZero();
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
std::cout << "a1_t_k: " << a1_t_k.mDesc << std::endl;
@@ -463,7 +467,7 @@ int main(int argc, char* argv[])
Tensor<float> b_e_n_k({experts, K, N * 2});
e_device_buf.FromDevice(e_t_n_device_result.mData.data());
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
// handle scale before ref.
for(int t = 0; t < tokens; ++t)

View File

@@ -264,15 +264,18 @@ int main(int argc, char* argv[])
}
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
Tensor<B0DataType> b0_e_n_k(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<B0DataType> b0_preshuffled(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
Tensor<D1DataType> d1_e_n(
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
Tensor<EDataType> e_t_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_n_device_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
@@ -488,7 +491,7 @@ int main(int argc, char* argv[])
e_device_buf.FromDevice(e_t_n_device_result.mData.data());
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
B0DataType,

View File

@@ -28,8 +28,9 @@ using F16 = ck::half_t;
using F8 = ck::f8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F8;
using B0DataType = F8;
@@ -278,11 +279,11 @@ int main(int argc, char* argv[])
}
}
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<D0DataType> d0_t_n(
HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}));
HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}, Bypass{}));
Tensor<D1DataType> d1_e_n(
HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));

View File

@@ -292,17 +292,19 @@ int main(int argc, char* argv[])
}
}
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
Tensor<A1DataType> a1_t_k_k(
HostTensorDescriptor({tokens, topk, (K + Scale_Block_K - 1) / Scale_Block_K},
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
Row{}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<B1DataType> b1_e_n_k(HostTensorDescriptor(
{experts, (K + Scale_Block_K - 1) / Scale_Block_K, (N + Scale_Block_N - 1) / Scale_Block_N},
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN},
Col{}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));

View File

@@ -29,8 +29,9 @@ using F16 = ck::half_t;
using F8 = ck::f8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F8;
using B0DataType = I4;
@@ -239,10 +240,10 @@ int main(int argc, char* argv[])
sorted_token_ids.mData[i] = tokens;
}
}
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -37,7 +37,7 @@ using DeviceOpInstanceKK_Generic = ck::tensor_operation::device::
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>;
DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 2, ComputeDataType>;
// clang-format on
template <ck::index_t NumDimM,

View File

@@ -95,25 +95,26 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
exit(0);
}
using DefaultLayout = ck::tensor_layout::gemm::RowMajor;
// For Real Part of Complex Tensor
Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides, DefaultLayout{});
Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides, DefaultLayout{});
Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides, DefaultLayout{});
Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
// For Imaginary Part of Complex Tensor
Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides);
Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides);
Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides);
Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides, DefaultLayout{});
Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides, DefaultLayout{});
Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides, DefaultLayout{});
Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
// Intermediate E tensor Definition
Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
std::cout << "a_ms_ks_re: " << a_ms_ks_re.mDesc << std::endl;
std::cout << "b_ns_ks_re: " << b_ns_ks_re.mDesc << std::endl;
@@ -349,8 +350,10 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
if(do_verification)
{
// Real Part Verification
Tensor<CShuffleDataType> c_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_re(
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
Tensor<CShuffleDataType> c_ms_ns_host_result_re1(
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
@@ -422,8 +425,10 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
// Img Part Verification
Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
Tensor<CShuffleDataType> c_ms_ns_host_result_img(
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
Tensor<CShuffleDataType> c_ms_ns_host_result_img1(
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
auto ref_argument_img = ref_op.MakeArgument(
a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);

View File

@@ -269,10 +269,12 @@ int main(int argc, char* argv[])
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
Tensor<XDataType> a1_t_k(HostTensorDescriptor(
{tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
Tensor<B0DataType> b0_e_n_k(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<XDataType> b1_e_n_k(
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
Col{}));
// A, B Scale preshuffle
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
@@ -281,12 +283,13 @@ int main(int argc, char* argv[])
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
Tensor<XDataType> b_scale_preshuffled(
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<EDataType> e_t_k_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_k_n_device_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
e_t_k_n_device_result.SetZero();
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
@@ -480,7 +483,7 @@ int main(int argc, char* argv[])
e_device_buf.ToDevice(e_t_k_n_device_result.mData.data());
invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
using ReferenceGemmInstance =
ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,

Some files were not shown because too many files have changed in this diff Show More