mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 04:07:56 +00:00
Merge branch 'develop' into kylasa_kdim_pr
This commit is contained in:
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@@ -1,4 +1,4 @@
|
||||
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @shumway @vidyasagar-amd
|
||||
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
|
||||
# Documentation files
|
||||
docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
|
||||
*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
|
||||
|
||||
24
.github/scripts/therock_configure_ci.py
vendored
24
.github/scripts/therock_configure_ci.py
vendored
@@ -42,6 +42,24 @@ def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
|
||||
file=sys.stderr,
|
||||
)
|
||||
return None
|
||||
|
||||
GITHUB_WORKFLOWS_CI_PATTERNS = [
|
||||
"therock*",
|
||||
]
|
||||
|
||||
def is_path_workflow_file_related_to_ci(path: str) -> bool:
|
||||
return any(
|
||||
fnmatch.fnmatch(path, ".github/workflows/" + pattern)
|
||||
for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
|
||||
) or any(
|
||||
fnmatch.fnmatch(path, ".github/scripts/" + pattern)
|
||||
for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
|
||||
)
|
||||
|
||||
def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
|
||||
if paths is None:
|
||||
return False
|
||||
return any(is_path_workflow_file_related_to_ci(p) for p in paths)
|
||||
|
||||
# Paths matching any of these patterns are considered to have no influence over
|
||||
# build or test workflows so any related jobs can be skipped if all paths
|
||||
@@ -82,12 +100,16 @@ def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
|
||||
)
|
||||
other_paths = paths_set - github_workflows_paths
|
||||
|
||||
related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
|
||||
contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
|
||||
|
||||
print("should_ci_run_given_modified_paths findings:")
|
||||
print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")
|
||||
|
||||
if contains_other_non_skippable_files:
|
||||
if related_to_ci:
|
||||
print("Enabling build jobs since a related workflow file was modified")
|
||||
return True
|
||||
elif contains_other_non_skippable_files:
|
||||
print("Enabling TheRock CI jobs since a non-skippable path was modified")
|
||||
return True
|
||||
else:
|
||||
|
||||
51
.github/workflows/therock-ci-linux.yml
vendored
51
.github/workflows/therock-ci-linux.yml
vendored
@@ -27,30 +27,35 @@ jobs:
|
||||
TEATIME_FORCE_INTERACTIVE: 0
|
||||
AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
|
||||
steps:
|
||||
- name: "Checking out repository for rocm-libraries"
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/rocm-libraries"
|
||||
|
||||
- name: Checkout composable_kernel repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
path: "composable_kernel"
|
||||
|
||||
- name: Checkout TheRock repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: ec1c2ef4f2636bce7733fd8c95e1dbb6692c8a57
|
||||
ref: 409f43ad9d564454bb1b23f8c8aa15d6b9d25200
|
||||
path: "TheRock"
|
||||
|
||||
- name: Runner Health Settings
|
||||
run: |
|
||||
df -h
|
||||
cmake --version
|
||||
echo "Installed Python versions:"
|
||||
ls -d /opt/python
|
||||
echo "python: $(which python), python3: $(which python3)"
|
||||
echo "Git version: $(git --version)"
|
||||
git config --global --add safe.directory $PWD
|
||||
git config fetch.parallel 10
|
||||
./TheRock/build_tools/health_status.py
|
||||
|
||||
- name: Fetch sources
|
||||
run: |
|
||||
./TheRock/build_tools/fetch_sources.py --jobs 12
|
||||
./TheRock/build_tools/fetch_sources.py --jobs 12 --no-include-rocm-libraries --no-include-ml-frameworks
|
||||
|
||||
- name: Patch rocm-libraries
|
||||
run: |
|
||||
git config --global --add safe.directory '*'
|
||||
git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch
|
||||
|
||||
- name: Install python deps
|
||||
run: |
|
||||
@@ -92,32 +97,14 @@ jobs:
|
||||
aws-region: us-east-2
|
||||
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
|
||||
|
||||
- name: Create Logs index Files and upload logs
|
||||
- name: Post Build Upload
|
||||
if: always()
|
||||
run: |
|
||||
python3 TheRock/build_tools/github_actions/create_log_index.py \
|
||||
--build-dir=TheRock/build \
|
||||
--amdgpu-family=${{ env.AMDGPU_FAMILIES }}
|
||||
|
||||
python3 TheRock/build_tools/github_actions/upload_build_logs_to_s3.py \
|
||||
--build-dir=TheRock/build \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }}
|
||||
|
||||
- name: Upload artifacts
|
||||
run: |
|
||||
python TheRock/build_tools/github_actions/upload_build_artifacts.py \
|
||||
python3 TheRock/build_tools/github_actions/post_build_upload.py \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
|
||||
--build-dir TheRock/build
|
||||
|
||||
- name: Add Links to Job Summary
|
||||
if: always()
|
||||
run: |
|
||||
python TheRock/build_tools/github_actions/upload_build_summary.py \
|
||||
--run-id ${{ github.run_id }} \
|
||||
--amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
|
||||
--build-dir TheRock/build
|
||||
--build-dir TheRock/build \
|
||||
--upload
|
||||
|
||||
therock-test-linux:
|
||||
name: "Test"
|
||||
|
||||
9
.github/workflows/therock-ci.yml
vendored
9
.github/workflows/therock-ci.yml
vendored
@@ -56,7 +56,14 @@ jobs:
|
||||
uses: ./.github/workflows/therock-ci-linux.yml
|
||||
secrets: inherit
|
||||
with:
|
||||
cmake_options: "-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON -DTHEROCK_ENABLE_MIOPEN=ON -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_USE_EXTERNAL_CK=ON -DTHEROCK_CK_SOURCE_DIR=../"
|
||||
cmake_options: >-
|
||||
-DTHEROCK_ENABLE_COMPOSABLE_KERNEL=ON
|
||||
-DTHEROCK_ENABLE_MIOPEN=ON
|
||||
-DTHEROCK_ENABLE_ALL=OFF
|
||||
-DTHEROCK_USE_EXTERNAL_COMPOSABLE_KERNEL=ON
|
||||
-DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel
|
||||
-DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON
|
||||
-DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../
|
||||
amdgpu_families: "gfx94X-dcgpu"
|
||||
test_runs_on: "linux-mi325-1gpu-ossci-rocm"
|
||||
|
||||
|
||||
71
.github/workflows/therock-test-component.yml
vendored
Normal file
71
.github/workflows/therock-test-component.yml
vendored
Normal file
@@ -0,0 +1,71 @@
|
||||
name: Test component
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
artifact_run_id:
|
||||
type: string
|
||||
default: ""
|
||||
amdgpu_families:
|
||||
type: string
|
||||
test_runs_on:
|
||||
type: string
|
||||
platform:
|
||||
type: string
|
||||
component:
|
||||
type: string
|
||||
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
test_component:
|
||||
name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})'
|
||||
runs-on: ${{ inputs.test_runs_on }}
|
||||
container:
|
||||
image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }}
|
||||
options: --ipc host
|
||||
--group-add video
|
||||
--device /dev/kfd
|
||||
--device /dev/dri
|
||||
--group-add 992
|
||||
--env-file /etc/podinfo/gha-gpu-isolation-settings
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# The shard array is based on "total_shards" from "fetch_test_configurations.py"
|
||||
# The test executable will shard based on the array. (ex: [1, 2, 3, 4] = four test shards)
|
||||
shard: ${{ fromJSON(inputs.component).shard_arr }}
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
VENV_DIR: ${{ github.workspace }}/.venv
|
||||
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
|
||||
OUTPUT_ARTIFACTS_DIR: "./build"
|
||||
THEROCK_BIN_DIR: "./build/bin"
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
with:
|
||||
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
|
||||
VENV_DIR: ${{ env.VENV_DIR }}
|
||||
FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
|
||||
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
|
||||
|
||||
- name: Test
|
||||
timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }}
|
||||
env:
|
||||
SHARD_INDEX: ${{ matrix.shard }}
|
||||
TOTAL_SHARDS: ${{ fromJSON(inputs.component).total_shards }}
|
||||
run: |
|
||||
${{ fromJSON(inputs.component).test_script }}
|
||||
40
.github/workflows/therock-test-packages.yml
vendored
40
.github/workflows/therock-test-packages.yml
vendored
@@ -37,41 +37,17 @@ jobs:
|
||||
|
||||
test_components:
|
||||
name: 'Test ${{ matrix.components.job_name }}'
|
||||
runs-on: ${{ inputs.test_runs_on }}
|
||||
needs: configure_test_matrix
|
||||
needs: [configure_test_matrix]
|
||||
# skip tests if no test matrix to run
|
||||
if: ${{ needs.configure_test_matrix.outputs.components != '[]' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }}
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
VENV_DIR: ${{ github.workspace }}/.venv
|
||||
ARTIFACT_RUN_ID: "${{ github.run_id }}"
|
||||
OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
|
||||
THEROCK_BIN_DIR: "./build/bin"
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
with:
|
||||
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
|
||||
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
|
||||
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
|
||||
VENV_DIR: ${{ env.VENV_DIR }}
|
||||
FETCH_ARTIFACT_ARGS: ${{ matrix.components.fetch_artifact_args }}
|
||||
PLATFORM: ${{ inputs.platform }}
|
||||
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
|
||||
|
||||
- name: Test
|
||||
timeout-minutes: ${{ matrix.components.timeout_minutes }}
|
||||
run: |
|
||||
if [ "${{ inputs.PLATFORM }}" == "linux" ]; then source ${VENV_DIR}/bin/activate ; else . ${VENV_DIR}/Scripts/activate ; fi
|
||||
${{ matrix.components.test_script }}
|
||||
uses: './.github/workflows/therock-test-component.yml'
|
||||
with:
|
||||
artifact_run_id: ${{ github.run_id }}
|
||||
amdgpu_families: ${{ inputs.amdgpu_families }}
|
||||
test_runs_on: ${{ inputs.test_runs_on }}
|
||||
platform: ${{ inputs.platform }}
|
||||
component: ${{ toJSON(matrix.components) }}
|
||||
|
||||
@@ -6,6 +6,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
|
||||
|
||||
### Added
|
||||
* Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
|
||||
* Added the new api to load different memory sizes to SGPR.
|
||||
* Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
|
||||
* Added a basic copy kernel example and supporting documentation for new CK Tile developers.
|
||||
* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
|
||||
|
||||
@@ -339,6 +339,7 @@ endif()
|
||||
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
|
||||
option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
|
||||
option(ENABLE_ASM_DUMP "Whether to enable assembly dump for kernels." OFF)
|
||||
option(ENABLE_JSON_DUMP "Whether to enable json dump for examples." OFF)
|
||||
|
||||
if(USE_BITINT_EXTENSION_INT4)
|
||||
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
|
||||
@@ -352,6 +353,11 @@ if(ENABLE_ASM_DUMP)
|
||||
message("CK compiled with ENABLE_ASM_DUMP set to ${ENABLE_ASM_DUMP}")
|
||||
endif()
|
||||
|
||||
if (ENABLE_JSON_DUMP)
|
||||
add_compile_definitions(CK_ENABLE_JSON_DUMP)
|
||||
message("CK compiled with ENABLE_JSON_DUMP set to ${ENABLE_JSON_DUMP}")
|
||||
endif()
|
||||
|
||||
## Threads
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
29
Dockerfile
29
Dockerfile
@@ -1,27 +1,23 @@
|
||||
|
||||
FROM ubuntu:24.04
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG ROCMVERSION=6.4.1
|
||||
ARG ROCMVERSION=7.0.1
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
ARG CK_SCCACHE=""
|
||||
ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
|
||||
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Add rocm repository
|
||||
RUN set -xe && \
|
||||
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
|
||||
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
|
||||
apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
|
||||
|
||||
RUN if [ "$ROCMVERSION" != "6.5" ]; then \
|
||||
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60401-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60401-1_all.deb && \
|
||||
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
|
||||
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
|
||||
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
|
||||
fi
|
||||
|
||||
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
|
||||
amdgpu-install -y --usecase=rocm --no-dkms
|
||||
RUN wget https://repo.radeon.com/amdgpu-install/7.0.1/ubuntu/noble/amdgpu-install_7.0.1.70001-1_all.deb && \
|
||||
apt install ./amdgpu-install_7.0.1.70001-1_all.deb -y && \
|
||||
apt update && \
|
||||
apt install python3-setuptools python3-wheel -y && \
|
||||
apt install rocm-dev -y
|
||||
|
||||
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
|
||||
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
|
||||
@@ -45,7 +41,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
|
||||
libelf-dev \
|
||||
libnuma-dev \
|
||||
libpthread-stubs0-dev \
|
||||
llvm-amdgpu \
|
||||
mpich \
|
||||
net-tools \
|
||||
pkg-config \
|
||||
@@ -61,17 +56,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
|
||||
zip \
|
||||
libzstd-dev \
|
||||
openssh-server \
|
||||
clang-format-12 \
|
||||
clang-format-18 \
|
||||
kmod && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -rf amdgpu-install* && \
|
||||
# Remove unnecessary rocm components that take a lot of space
|
||||
apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
|
||||
|
||||
#Install latest ccache
|
||||
RUN git clone https://github.com/ccache/ccache.git && \
|
||||
git clone https://github.com/ccache/ccache.git && \
|
||||
cd ccache && mkdir build && cd build && cmake .. && make install && \
|
||||
#Install ninja build tracing tools
|
||||
cd / && \
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4.1"
|
||||
ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm7.0.1"
|
||||
FROM $BASE_DOCKER
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
|
||||
37
Jenkinsfile
vendored
37
Jenkinsfile
vendored
@@ -53,7 +53,7 @@ def getBaseDockerImageName(){
|
||||
}
|
||||
else{
|
||||
def ROCM_numeric = parseVersion("${params.ROCMVERSION}")
|
||||
if ( ROCM_numeric.major <= 6 && ROCM_numeric.minor < 5 ){
|
||||
if ( ROCM_numeric.major <= 7 && ROCM_numeric.minor < 1 ){
|
||||
img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
|
||||
}
|
||||
else{
|
||||
@@ -476,7 +476,7 @@ def buildHipClangJob(Map conf=[:]){
|
||||
def retimage
|
||||
(retimage, image) = getDockerImage(conf)
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
|
||||
timeout(time: 20, unit: 'HOURS')
|
||||
{
|
||||
@@ -538,7 +538,7 @@ def Build_CK(Map conf=[:]){
|
||||
def image
|
||||
def retimage
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try {
|
||||
(retimage, image) = getDockerImage(conf)
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
@@ -728,7 +728,7 @@ def process_results(Map conf=[:]){
|
||||
def variant = env.STAGE_NAME
|
||||
def retimage
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try
|
||||
{
|
||||
echo "Pulling image: ${image}"
|
||||
@@ -836,7 +836,7 @@ def run_aiter_tests(Map conf=[:]){
|
||||
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
|
||||
echo "Docker flags: ${dockerOpts}"
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try
|
||||
{
|
||||
echo "Pulling image: ${image}"
|
||||
@@ -859,6 +859,7 @@ def run_aiter_tests(Map conf=[:]){
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py"
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py"
|
||||
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py"
|
||||
@@ -894,7 +895,7 @@ def run_pytorch_tests(Map conf=[:]){
|
||||
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
|
||||
echo "Docker flags: ${dockerOpts}"
|
||||
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try
|
||||
{
|
||||
echo "Pulling image: ${image}"
|
||||
@@ -930,7 +931,8 @@ def run_pytorch_tests(Map conf=[:]){
|
||||
}
|
||||
|
||||
//launch develop branch daily jobs
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true
|
||||
0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
|
||||
0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
|
||||
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
|
||||
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
|
||||
@@ -957,8 +959,8 @@ pipeline {
|
||||
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
|
||||
string(
|
||||
name: 'ROCMVERSION',
|
||||
defaultValue: '6.4.1',
|
||||
description: 'Specify which ROCM version to use: 6.4.1 (default).')
|
||||
defaultValue: '7.0.1',
|
||||
description: 'Specify which ROCM version to use: 7.0.1 (default).')
|
||||
string(
|
||||
name: 'COMPILER_VERSION',
|
||||
defaultValue: '',
|
||||
@@ -1037,8 +1039,8 @@ pipeline {
|
||||
description: "Build CK and run tests on gfx942 (default: ON)")
|
||||
booleanParam(
|
||||
name: "BUILD_GFX950",
|
||||
defaultValue: false,
|
||||
description: "Build CK and run tests on gfx950 (default: OFF)")
|
||||
defaultValue: true,
|
||||
description: "Build CK and run tests on gfx950 (default: ON)")
|
||||
booleanParam(
|
||||
name: "BUILD_GFX10",
|
||||
defaultValue: true,
|
||||
@@ -1290,7 +1292,7 @@ pipeline {
|
||||
agent{ label rocmnode("gfx90a")}
|
||||
environment{
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
|
||||
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake -DCMAKE_PREFIX_PATH=/opt/rocm ../codegen && \
|
||||
make -j64 check"""
|
||||
}
|
||||
steps{
|
||||
@@ -1350,7 +1352,6 @@ pipeline {
|
||||
}
|
||||
agent{ label rocmnode("gfx950") }
|
||||
environment{
|
||||
def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0"
|
||||
setup_args = "NO_CK_BUILD"
|
||||
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \
|
||||
make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \
|
||||
@@ -1358,7 +1359,7 @@ pipeline {
|
||||
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, docker_name: docker_name, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
@@ -1566,7 +1567,7 @@ pipeline {
|
||||
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
|
||||
}
|
||||
steps{
|
||||
Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
@@ -1631,7 +1632,7 @@ pipeline {
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
|
||||
|
||||
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0")
|
||||
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1")
|
||||
}
|
||||
cleanWs()
|
||||
}
|
||||
@@ -1657,13 +1658,13 @@ pipeline {
|
||||
cleanWs()
|
||||
}
|
||||
}
|
||||
stage("Build CK and run Tests on gfx1101")
|
||||
stage("Build CK and run Tests on gfx11")
|
||||
{
|
||||
when {
|
||||
beforeAgent true
|
||||
expression { params.BUILD_GFX11.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
|
||||
}
|
||||
agent{ label rocmnode("gfx1101") }
|
||||
agent{ label 'miopen && (gfx1101 || gfx1100)' }
|
||||
environment{
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx11-generic" -DUSE_OPT_GFX11=ON -DCMAKE_CXX_FLAGS=" -O3 " """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
|
||||
|
||||
@@ -12,6 +12,7 @@ configure_file(${CK_ROOT}/include/ck/config.h.in ${CK_ROOT}/include/ck/config.h)
|
||||
find_package(ROCM)
|
||||
include(ROCMInstallTargets)
|
||||
include(ROCMTest)
|
||||
find_package(hiprtc REQUIRED)
|
||||
|
||||
rocm_setup_version(VERSION 1.0)
|
||||
|
||||
@@ -27,7 +28,7 @@ add_compile_options(-std=c++20)
|
||||
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
|
||||
# TODO: Use object library
|
||||
add_library(ck_host STATIC ${SOURCES})
|
||||
target_link_libraries(ck_host PRIVATE ck_headers)
|
||||
target_link_libraries(ck_host PRIVATE ck_headers hiprtc::hiprtc)
|
||||
|
||||
set_target_properties(ck_host PROPERTIES
|
||||
LINKER_LANGUAGE CXX
|
||||
|
||||
@@ -36,7 +36,7 @@ using BDataType = ck::half_t;
|
||||
using CDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
#else
|
||||
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
|
||||
< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 128, 4, 4, 16, 16, 1, 2, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
|
||||
using ADataType = float;
|
||||
using BDataType = float;
|
||||
using CDataType = float;
|
||||
@@ -185,7 +185,6 @@ int main(int argc, char* argv[])
|
||||
auto a_element_op = AElementOp{};
|
||||
auto b_element_op = BElementOp{};
|
||||
auto c_element_op = CElementOp{};
|
||||
|
||||
// do GEMM
|
||||
auto gemm = DeviceGemmInstance{};
|
||||
auto invoker = gemm.MakeInvoker();
|
||||
@@ -209,8 +208,7 @@ int main(int argc, char* argv[])
|
||||
return 0;
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
#include "ck/library/utility/validation_common.hpp"
|
||||
|
||||
// use macro to minimize code change
|
||||
#ifndef EXAMPLE_WITH_COMPUTE_DATATYPE
|
||||
@@ -29,11 +28,11 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -59,17 +58,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
|
||||
StrideB = f_get_default_stride(K, N, StrideB, BLayout{});
|
||||
StrideC = f_get_default_stride(M, N, StrideC, CLayout{});
|
||||
|
||||
try
|
||||
{
|
||||
ck::utils::validate_gemm_strides_abc<ALayout, BLayout, CLayout>(
|
||||
M, N, K, StrideA, StrideB, StrideC);
|
||||
}
|
||||
catch(const std::runtime_error& e)
|
||||
{
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
|
||||
|
||||
@@ -174,6 +174,9 @@ int main(int argc, char* argv[])
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
|
||||
? d_m_n.mDesc.GetStrides()[0]
|
||||
: d_m_n.mDesc.GetStrides()[1];
|
||||
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
|
||||
@@ -221,7 +224,7 @@ int main(int argc, char* argv[])
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, 1>{0},
|
||||
std::array<ck::index_t, 1>{static_cast<int>(StrideD)},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
|
||||
@@ -7,7 +7,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
|
||||
#endif
|
||||
using namespace ck::literals;
|
||||
|
||||
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
|
||||
ProblemSize ps =
|
||||
problem_size; // make mutable copy because default stride values of 0 need to be updated
|
||||
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
@@ -41,6 +43,30 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
|
||||
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
// If any user-provided leading stride <= 0, replace it with the one determined by the
|
||||
// created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
|
||||
auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
|
||||
if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return static_cast<int>(tensor.GetStrides()[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
return static_cast<int>(tensor.GetStrides()[1]);
|
||||
}
|
||||
};
|
||||
|
||||
if(StrideA <= 0)
|
||||
StrideA = fetch_leading_stride(a_m_k, ALayout{});
|
||||
if(StrideB <= 0)
|
||||
StrideB = fetch_leading_stride(b_k_n, BLayout{});
|
||||
if(StrideD0 <= 0)
|
||||
StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
|
||||
if(StrideD1 <= 0)
|
||||
StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
|
||||
if(StrideE <= 0)
|
||||
StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
|
||||
|
||||
switch(config.init_method)
|
||||
{
|
||||
case 0: break;
|
||||
|
||||
@@ -125,7 +125,7 @@ inline bool parse_cmd_args(int argc,
|
||||
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
problem_size = ck::utils::conv::parse_conv_param(
|
||||
num_dim_spatial, threshold_to_catch_partial_args, argv);
|
||||
num_dim_spatial, threshold_to_catch_partial_args + 1, argv);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -23,7 +23,7 @@ using RsGlobalReduceOp =
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
// clang-format off
|
||||
template <ck::index_t NDimSpatial>
|
||||
|
||||
@@ -78,12 +78,12 @@ bool pool_test(bool do_verification,
|
||||
|
||||
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
|
||||
{
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}, layout);
|
||||
}
|
||||
else if constexpr(ck::is_same<decltype(layout),
|
||||
ck::tensor_layout::convolution::NHWC>::value)
|
||||
{
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -115,12 +115,14 @@ int main()
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1_uz}));
|
||||
std::vector<std::size_t>({stride, 1_uz}),
|
||||
layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1_uz, stride}));
|
||||
std::vector<std::size_t>({1_uz, stride}),
|
||||
layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -137,11 +137,13 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayern
|
||||
//######| | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|
|
||||
//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _M_N| _M_N| _M_N| _M|
|
||||
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType, AElementOp, BElementOp, CDEElementOp, HElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 8, S<8, 32>, 8>;
|
||||
< ALayout, BLayout, DsLayout, HLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EMeanVarDataType, GammaDataType, BetaDataType, HDataType, AElementOp, BElementOp, CDEElementOp, HElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 16, 16, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<32, 8>, 4, S<8, 32>, 4>;
|
||||
// clang-format on
|
||||
|
||||
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
|
||||
@@ -154,8 +154,8 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
|
||||
|
||||
int main()
|
||||
{
|
||||
// temp disable on gfx11 & gfx12
|
||||
if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
|
||||
// temp disable on gfx11
|
||||
if(ck::is_gfx11_supported())
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -59,11 +59,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -137,11 +137,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
|
||||
auto layout) {
|
||||
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -64,11 +64,13 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -19,6 +19,9 @@
|
||||
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -342,7 +345,8 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
|
||||
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
|
||||
NumDimN,
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -342,7 +345,8 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
|
||||
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
|
||||
NumDimM,
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
int run_contraction_bilinear_example(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -95,11 +97,11 @@ int run_contraction_bilinear_example(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
|
||||
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
|
||||
@@ -189,7 +191,7 @@ int run_contraction_bilinear_example(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
using ReferenceOpInstance =
|
||||
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
int run_contraction_scale_example(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -85,10 +87,10 @@ int run_contraction_scale_example(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
|
||||
@@ -173,7 +175,7 @@ int run_contraction_scale_example(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
using ReferenceOpInstance =
|
||||
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
|
||||
|
||||
@@ -18,6 +18,9 @@
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -304,10 +307,10 @@ int main(int argc, char* argv[])
|
||||
const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
|
||||
const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
|
||||
|
||||
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<ADataType> a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
|
||||
Tensor<DDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
ck::index_t M_ =
|
||||
ck::accumulate_n<ck::index_t>(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
|
||||
@@ -416,9 +419,9 @@ int main(int argc, char* argv[])
|
||||
const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths;
|
||||
const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides;
|
||||
|
||||
Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data());
|
||||
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -300,11 +303,11 @@ int main(int argc, char* argv[])
|
||||
std::vector<ck::index_t> e_gs_ms_ns_strides{
|
||||
G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
|
||||
std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
|
||||
@@ -396,7 +399,8 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(
|
||||
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
|
||||
NumDimM,
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -247,11 +250,11 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides);
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
|
||||
Tensor<BDataType> b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides, Row{});
|
||||
Tensor<DDataType> d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -345,7 +348,8 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(
|
||||
e_gs_ms_ns_lengths, e_gs_ms_ns_strides, Bypass{});
|
||||
|
||||
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
|
||||
NumDimM,
|
||||
|
||||
@@ -160,7 +160,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNCW{});
|
||||
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
@@ -176,7 +177,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNCHW{});
|
||||
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
@@ -195,7 +197,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNCDHW{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
@@ -213,7 +216,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GKCX{});
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
@@ -229,7 +233,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GKCYX{});
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
@@ -249,7 +254,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GKCZYX{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
@@ -267,7 +273,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKW{});
|
||||
case 2:
|
||||
return HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
@@ -280,7 +287,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKHW{});
|
||||
case 3:
|
||||
return HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
@@ -295,7 +303,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKDHW{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
@@ -314,7 +323,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKW{});
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
@@ -329,7 +339,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKHW{});
|
||||
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
@@ -348,7 +359,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKDHW{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
|
||||
@@ -160,7 +160,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNCW{});
|
||||
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
@@ -176,7 +177,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNCHW{});
|
||||
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
@@ -195,7 +197,8 @@ inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvPar
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNCDHW{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
@@ -213,7 +216,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GKCX{});
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
@@ -229,7 +233,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GKCYX{});
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
@@ -249,7 +254,8 @@ inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GKCZYX{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
@@ -267,7 +273,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKW{});
|
||||
case 2:
|
||||
return HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
@@ -280,7 +287,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKHW{});
|
||||
case 3:
|
||||
return HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
@@ -295,7 +303,8 @@ inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvPara
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKDHW{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
@@ -314,7 +323,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKW{});
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
@@ -329,7 +339,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKHW{});
|
||||
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
@@ -348,7 +359,8 @@ inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvPa
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
},
|
||||
ck::tensor_layout::convolution::GNKDHW{});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
|
||||
@@ -261,6 +261,10 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
|
||||
B1ElementOp,
|
||||
CElementOp>;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
#include "run_batched_gemm_gemm_wmma_cshuffle_v3.inc"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
|
||||
@@ -110,11 +110,13 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -62,17 +62,19 @@ int run(int argc, char* argv[])
|
||||
std::vector<ck::index_t> b1_g_o_n_lengths{G, O, N};
|
||||
#ifdef CK_MHA_USE_RCCR_LAYOUT
|
||||
std::vector<ck::index_t> b1_g_o_n_strides{N * O, N, 1}; // B1 layout [G, O, N]
|
||||
auto b1_layout = Row{};
|
||||
#else
|
||||
std::vector<ck::index_t> b1_g_o_n_strides{N * O, 1, O}; // B1 layout [G, N, O]
|
||||
auto b1_layout = Col{};
|
||||
#endif
|
||||
std::vector<ck::index_t> c_g_m_o_lengths{G, M, O};
|
||||
std::vector<ck::index_t> c_g_m_o_strides{M * O, O, 1}; // C layout [G, M, O]
|
||||
|
||||
Tensor<ADataType> a_g_m_k(a_g_m_k_lengths, a_g_m_k_strides);
|
||||
Tensor<B0DataType> b0_g_n_k(b0_g_n_k_lengths, b0_g_n_k_strides);
|
||||
Tensor<B1DataType> b1_g_o_n(b1_g_o_n_lengths, b1_g_o_n_strides);
|
||||
Tensor<CDataType> c_g_m_o_host_result(c_g_m_o_lengths, c_g_m_o_strides);
|
||||
Tensor<CDataType> c_g_m_o_device_result(c_g_m_o_lengths, c_g_m_o_strides);
|
||||
Tensor<ADataType> a_g_m_k(a_g_m_k_lengths, a_g_m_k_strides, Row{});
|
||||
Tensor<B0DataType> b0_g_n_k(b0_g_n_k_lengths, b0_g_n_k_strides, Row{});
|
||||
Tensor<B1DataType> b1_g_o_n(b1_g_o_n_lengths, b1_g_o_n_strides, b1_layout);
|
||||
Tensor<CDataType> c_g_m_o_host_result(c_g_m_o_lengths, c_g_m_o_strides, Row{});
|
||||
Tensor<CDataType> c_g_m_o_device_result(c_g_m_o_lengths, c_g_m_o_strides, Row{});
|
||||
|
||||
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_g_n_k: " << b0_g_n_k.mDesc << std::endl;
|
||||
|
||||
@@ -111,12 +111,14 @@ int run(int argc, char* argv[])
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, stride, 1}));
|
||||
std::vector<std::size_t>({batch_stride, stride, 1}),
|
||||
layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, 1, stride}));
|
||||
std::vector<std::size_t>({batch_stride, 1, stride}),
|
||||
layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -88,11 +90,11 @@ int run(int argc, char* argv[])
|
||||
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
|
||||
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Bypass{});
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Bypass{});
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Bypass{});
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Bypass{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -88,11 +92,30 @@ int run(int argc, char* argv[])
|
||||
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
|
||||
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
|
||||
std::vector<ck::index_t> strides,
|
||||
bool permute,
|
||||
auto layout) {
|
||||
if(permute)
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, layout);
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
|
||||
Tensor<B0DataType> b0_gs_ns_ks(
|
||||
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
|
||||
Tensor<B1DataType> b1_gs_os_ns(
|
||||
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -113,11 +117,30 @@ int run(int argc, char* argv[])
|
||||
head_dim,
|
||||
1}; // C layout [batch_size, head_num, q_sequence_length, head_dim]
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
|
||||
std::vector<ck::index_t> strides,
|
||||
bool permute,
|
||||
auto layout) {
|
||||
if(permute)
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, layout);
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
|
||||
Tensor<B0DataType> b0_gs_ns_ks(
|
||||
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
|
||||
Tensor<B1DataType> b1_gs_os_ns(
|
||||
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -191,7 +214,7 @@ int run(int argc, char* argv[])
|
||||
head_num * 2 * head_dim,
|
||||
head_dim,
|
||||
1}; // kv layout [batch_size, q_sequence_length, head_num, 2, head_dim]
|
||||
Tensor<ADataType> kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides);
|
||||
Tensor<ADataType> kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides, Bypass{});
|
||||
// merge kv into a packed pointer send to device
|
||||
b0_gs_ns_ks.ForEach(
|
||||
[&](auto& self, auto idx) { kv_gs_ns_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); });
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -63,6 +67,19 @@ int run(int argc, char* argv[])
|
||||
|
||||
std::size_t flop = 0, num_byte = 0;
|
||||
|
||||
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
|
||||
std::vector<ck::index_t> strides,
|
||||
bool permute,
|
||||
auto layout) {
|
||||
if(permute)
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, layout);
|
||||
}
|
||||
};
|
||||
std::cout << "group count " << group_count << ". printing first 4 groups\n";
|
||||
for(std::size_t i = 0; i < group_count; i++)
|
||||
{
|
||||
@@ -113,10 +130,14 @@ int run(int argc, char* argv[])
|
||||
{}}); // acc1_biases_gs_ms_os_strides
|
||||
|
||||
// C_m_o = A_m_k * B0_k_n * B1_n_o
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
|
||||
Tensor<B0DataType> b0_gs_ns_ks(f_host_tensor_descriptor(
|
||||
b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
|
||||
Tensor<B1DataType> b1_gs_os_ns(f_host_tensor_descriptor(
|
||||
b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(f_host_tensor_descriptor(
|
||||
c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
int Batch = G0 * G1;
|
||||
flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
|
||||
@@ -252,7 +273,8 @@ int run(int argc, char* argv[])
|
||||
Tensor<AccDataType> acc0_g_m_n({G0 * G1, M, N}); // scratch object after gemm0
|
||||
Tensor<ADataType> a1_g_m_n({G0 * G1, M, N}); // scratch object after softmax
|
||||
Tensor<CDataType> c_g_m_o_host_result({G0 * G1, M, O}); // scratch object after gemm1
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(f_host_tensor_descriptor(
|
||||
c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
// permute
|
||||
a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -91,11 +95,30 @@ int run(int argc, char* argv[])
|
||||
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
|
||||
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
|
||||
std::vector<ck::index_t> strides,
|
||||
bool permute,
|
||||
auto layout) {
|
||||
if(permute)
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, layout);
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
|
||||
Tensor<B0DataType> b0_gs_ns_ks(
|
||||
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
|
||||
Tensor<B1DataType> b1_gs_os_ns(
|
||||
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -91,11 +95,30 @@ int run(int argc, char* argv[])
|
||||
? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
|
||||
: std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
|
||||
std::vector<ck::index_t> strides,
|
||||
bool permute,
|
||||
auto layout) {
|
||||
if(permute)
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, layout);
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
|
||||
Tensor<B0DataType> b0_gs_ns_ks(
|
||||
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
|
||||
Tensor<B1DataType> b1_gs_os_ns(
|
||||
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
int run(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
@@ -108,11 +112,30 @@ int run(int argc, char* argv[])
|
||||
head_dim,
|
||||
1}; // C layout [batch_size, head_num, sequence_length, head_dim]
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
auto f_host_tensor_descriptor = [](std::vector<ck::index_t> lens,
|
||||
std::vector<ck::index_t> strides,
|
||||
bool permute,
|
||||
auto layout) {
|
||||
if(permute)
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, Bypass{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(lens, strides, layout);
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
f_host_tensor_descriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, input_permute, Row{}));
|
||||
Tensor<B0DataType> b0_gs_ns_ks(
|
||||
f_host_tensor_descriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, input_permute, Row{}));
|
||||
Tensor<B1DataType> b1_gs_os_ns(
|
||||
f_host_tensor_descriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, input_permute, Col{}));
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(
|
||||
f_host_tensor_descriptor(c_gs_ms_os_lengths, c_gs_ms_os_strides, output_permute, Row{}));
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -186,7 +209,7 @@ int run(int argc, char* argv[])
|
||||
head_num * 3 * head_dim,
|
||||
head_dim,
|
||||
1}; // qkv layout [batch_size, sequence_length, head_num, 3, head_dim]
|
||||
Tensor<ADataType> qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides);
|
||||
Tensor<ADataType> qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides, Bypass{});
|
||||
// merge qkv into a packed pointer send to device
|
||||
a_gs_ms_ks.ForEach(
|
||||
[&](auto& self, auto idx) { qkv_gs_ms_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); });
|
||||
|
||||
@@ -321,11 +321,13 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
|
||||
return HostTensorDescriptor(
|
||||
{batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -206,7 +206,8 @@ int run_grouped_conv_bwd_data_bias_relu_example(int argc, char* argv[])
|
||||
1, // c
|
||||
0, // hi
|
||||
0 // wi
|
||||
});
|
||||
},
|
||||
ctc::GNCHW{});
|
||||
|
||||
// input image: GNHWC
|
||||
const auto in_g_n_c_wis_desc =
|
||||
|
||||
@@ -214,7 +214,8 @@ int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
},
|
||||
BiasLayout{});
|
||||
|
||||
const auto requant_scale_g_k_desc = bias_g_k_desc;
|
||||
|
||||
|
||||
@@ -201,7 +201,8 @@ int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_el
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
},
|
||||
BiasLayout{});
|
||||
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
|
||||
|
||||
@@ -203,7 +203,8 @@ int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_eleme
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
},
|
||||
RequantScaleLayout{});
|
||||
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
|
||||
|
||||
@@ -22,6 +22,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
@@ -250,19 +253,24 @@ int main(int argc, char* argv[])
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
|
||||
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
|
||||
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()),
|
||||
Row{});
|
||||
Tensor<BDataType> b_gs_ns_ks(
|
||||
std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
|
||||
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
|
||||
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()),
|
||||
Row{});
|
||||
Tensor<DDataType> d_gs_ms_ns(
|
||||
std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(
|
||||
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(
|
||||
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -372,7 +380,8 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(
|
||||
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
|
||||
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
|
||||
NumDimM,
|
||||
|
||||
@@ -22,6 +22,9 @@ using S = ck::Sequence<Is...>;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
@@ -250,19 +253,24 @@ int main(int argc, char* argv[])
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(
|
||||
std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
|
||||
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
|
||||
std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()),
|
||||
Row{});
|
||||
Tensor<BDataType> b_gs_ns_ks(
|
||||
std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
|
||||
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
|
||||
std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()),
|
||||
Row{});
|
||||
Tensor<DDataType> d_gs_ms_ns(
|
||||
std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_host_result(
|
||||
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
Tensor<EDataType> e_gs_ms_ns_device_result(
|
||||
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
|
||||
@@ -372,7 +380,8 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(
|
||||
std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
|
||||
std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()),
|
||||
Bypass{});
|
||||
|
||||
using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1<NumDimG,
|
||||
NumDimM,
|
||||
|
||||
@@ -22,6 +22,8 @@ using F32 = float;
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
using UnaryScale = ck::tensor_operation::element_wise::Scale;
|
||||
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using UnaryScaleSquare =
|
||||
@@ -73,11 +75,11 @@ int main(int argc, char* argv[])
|
||||
1};
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides),
|
||||
Tensor<ADataType>(ab_lengths, ab_strides)};
|
||||
std::array<Tensor<ADataType>, 2> as = {Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
|
||||
Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a0 = as[0];
|
||||
Tensor<ADataType>& a1 = as[1];
|
||||
Tensor<BDataType> b(ab_lengths, ab_strides);
|
||||
Tensor<BDataType> b(ab_lengths, ab_strides, NchwLayout{});
|
||||
float alpha = 3.f;
|
||||
float beta = 2.f;
|
||||
a0.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
@@ -134,7 +136,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, ab_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, ab_strides, NchwLayout{});
|
||||
|
||||
using ReferenceElementwiseInstance = ck::tensor_operation::host::
|
||||
ReferenceElementwise<2, ADataType, BDataType, BinaryAddUnaryScaleSquare>;
|
||||
|
||||
@@ -22,6 +22,8 @@ using F32 = float;
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
|
||||
ck::Tuple<ADataType>, // InDataTypeTuple
|
||||
@@ -72,9 +74,9 @@ int main(int argc, char* argv[])
|
||||
static_cast<int>(nhwc[3])};
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a = as[0];
|
||||
Tensor<BDataType> b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
|
||||
|
||||
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
|
||||
@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
|
||||
using ReferenceElementwiseInstance =
|
||||
ck::tensor_operation::host::ReferenceElementwise<1, ADataType, BDataType, PassThrough>;
|
||||
auto ref_elementwise = ReferenceElementwiseInstance{};
|
||||
|
||||
@@ -23,6 +23,8 @@ using F32 = float;
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
using UnaryScale = ck::tensor_operation::element_wise::Scale;
|
||||
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using UnaryScaleSquare =
|
||||
@@ -76,9 +78,9 @@ int main(int argc, char* argv[])
|
||||
static_cast<int>(nhwc[0] * nhwc[1])};
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a = as[0];
|
||||
Tensor<BDataType> b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
|
||||
float scale = 1.f;
|
||||
auto i = 0;
|
||||
std::mt19937 gen(11939);
|
||||
@@ -137,7 +139,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
|
||||
using ReferenceElementwiseInstance = ck::tensor_operation::host::
|
||||
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
|
||||
auto ref_elementwise = ReferenceElementwiseInstance{};
|
||||
|
||||
@@ -22,6 +22,9 @@ using F32 = float;
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
|
||||
using UnaryScale = ck::tensor_operation::element_wise::Scale;
|
||||
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using UnaryScaleSquare =
|
||||
@@ -76,9 +79,9 @@ int main(int argc, char* argv[])
|
||||
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a = as[0];
|
||||
Tensor<BDataType> b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
|
||||
|
||||
float scale = 2.f;
|
||||
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
@@ -128,7 +131,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
|
||||
using ReferenceElementwiseInstance = ck::tensor_operation::host::
|
||||
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
|
||||
auto ref_elementwise = ReferenceElementwiseInstance{};
|
||||
|
||||
@@ -22,6 +22,8 @@ using F32 = float;
|
||||
using ADataType = F32;
|
||||
using BDataType = F32;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
using UnaryScale = ck::tensor_operation::element_wise::Scale;
|
||||
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using UnaryScaleSquare =
|
||||
@@ -76,9 +78,9 @@ int main(int argc, char* argv[])
|
||||
static_cast<int>(nhwc[0] * nhwc[1])};
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a = as[0];
|
||||
Tensor<BDataType> b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
|
||||
|
||||
float scale = 1.f;
|
||||
auto i = 0;
|
||||
@@ -139,7 +141,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
|
||||
using ReferenceElementwiseInstance = ck::tensor_operation::host::
|
||||
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
|
||||
auto ref_elementwise = ReferenceElementwiseInstance{};
|
||||
|
||||
@@ -22,6 +22,9 @@ using F32 = float;
|
||||
using ADataType = F32;
|
||||
using BDataType = F32;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
|
||||
using UnaryScale = ck::tensor_operation::element_wise::Scale;
|
||||
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using UnaryScaleSquare =
|
||||
@@ -76,9 +79,9 @@ int main(int argc, char* argv[])
|
||||
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides)};
|
||||
std::array<Tensor<ADataType>, 1> as = {Tensor<ADataType>(ab_lengths, a_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a = as[0];
|
||||
Tensor<BDataType> b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> b(ab_lengths, b_strides, NhwcLayout{});
|
||||
float scale = 2.f;
|
||||
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
|
||||
@@ -127,7 +130,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, b_strides, NhwcLayout{});
|
||||
using ReferenceElementwiseInstance = ck::tensor_operation::host::
|
||||
ReferenceElementwise<1, ADataType, BDataType, UnaryScaleSquare>;
|
||||
auto ref_elementwise = ReferenceElementwiseInstance{};
|
||||
|
||||
@@ -22,6 +22,9 @@ using F32 = float;
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
|
||||
using NchwLayout = ck::tensor_layout::convolution::NCHW;
|
||||
using NhwcLayout = ck::tensor_layout::convolution::NHWC;
|
||||
|
||||
using UnaryScale = ck::tensor_operation::element_wise::Scale;
|
||||
using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using UnaryScaleSquare =
|
||||
@@ -78,13 +81,13 @@ int main(int argc, char* argv[])
|
||||
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides),
|
||||
Tensor<ADataType>(ab_lengths, ab_strides),
|
||||
Tensor<ADataType>(ab_lengths, ab_strides)};
|
||||
std::array<Tensor<ADataType>, 3> as = {Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
|
||||
Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{}),
|
||||
Tensor<ADataType>(ab_lengths, ab_strides, NchwLayout{})};
|
||||
Tensor<ADataType>& a0 = as[0];
|
||||
Tensor<ADataType>& a1 = as[1];
|
||||
Tensor<ADataType>& a2 = as[2];
|
||||
Tensor<BDataType> b(ab_lengths, ab_strides);
|
||||
Tensor<BDataType> b(ab_lengths, ab_strides, NchwLayout{});
|
||||
float alpha = 3.f;
|
||||
float beta = 2.f;
|
||||
float gamma = 4.f;
|
||||
@@ -149,7 +152,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<BDataType> host_b(ab_lengths, ab_strides);
|
||||
Tensor<BDataType> host_b(ab_lengths, ab_strides, NchwLayout{});
|
||||
using ReferenceElementwiseInstance = ck::tensor_operation::host::
|
||||
ReferenceElementwise<3, ADataType, BDataType, TrinaryAddUnaryScaleSquare>;
|
||||
auto ref_elementwise = ReferenceElementwiseInstance{};
|
||||
|
||||
@@ -1,22 +1,30 @@
|
||||
#pragma once
|
||||
#include <type_traits>
|
||||
|
||||
bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
|
||||
{
|
||||
using namespace ck::literals;
|
||||
|
||||
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
|
||||
ProblemSize ps =
|
||||
problem_size; // make mutable copy because default stride values of 0 need to be updated
|
||||
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
@@ -220,12 +224,12 @@ int main(int argc, char* argv[])
|
||||
std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
|
||||
std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
|
||||
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
|
||||
Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
|
||||
Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides, Row{});
|
||||
Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides, Row{});
|
||||
Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides, Col{});
|
||||
Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides, Row{});
|
||||
Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Row{});
|
||||
Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides, Row{});
|
||||
|
||||
std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
|
||||
std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
|
||||
|
||||
@@ -48,15 +48,16 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
|
||||
|
||||
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
|
||||
{
|
||||
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz}, layout);
|
||||
}
|
||||
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
|
||||
{
|
||||
return HostTensorDescriptor({N_, C_, D, H, W},
|
||||
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
|
||||
return HostTensorDescriptor(
|
||||
{N_, C_, D, H, W}, {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_}, layout);
|
||||
}
|
||||
throw std::runtime_error("Pool3d_fwd: problem with layout. ");
|
||||
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
|
||||
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, layout);
|
||||
};
|
||||
|
||||
template <typename DevicePoolFwdInstance,
|
||||
|
||||
@@ -77,7 +77,9 @@ bool maxpool_bwd_test(bool do_verification,
|
||||
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
|
||||
using namespace ck::literals;
|
||||
// reference need Tensor with NCHW order
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
|
||||
return HostTensorDescriptor({N_, C_, H, W},
|
||||
{C_ * H * W, 1_uz, W * C_, C_},
|
||||
ck::tensor_layout::convolution::NCHW{});
|
||||
};
|
||||
|
||||
// in
|
||||
|
||||
@@ -42,15 +42,16 @@ HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
|
||||
|
||||
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
|
||||
{
|
||||
return HostTensorDescriptor({N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
|
||||
return HostTensorDescriptor(
|
||||
{N_, C_, D, H, W}, {C_ * D * H * W, D * H * W, H * W, W, 1_uz}, layout);
|
||||
}
|
||||
else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
|
||||
{
|
||||
return HostTensorDescriptor({N_, C_, D, H, W},
|
||||
{D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
|
||||
return HostTensorDescriptor(
|
||||
{N_, C_, D, H, W}, {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_}, layout);
|
||||
}
|
||||
throw std::runtime_error("Avgpool3d_bwd: problem with layout. ");
|
||||
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
|
||||
return HostTensorDescriptor({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, layout);
|
||||
};
|
||||
|
||||
template <typename DevicePoolBwdInstance,
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
add_example_executable(example_gemm_multi_ABD_wmma_fp16 gemm_multi_ABD_wmma_fp16.cpp)
|
||||
add_example_executable(example_gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8 gemm_multi_ABD_wmma_bias_fastgelu_bf16_i8.cpp)
|
||||
add_example_executable(example_gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8 gemm_multi_ABD_wmma_multiply_bias_fastgelu_bf16_i8.cpp)
|
||||
add_example_executable(example_gemm_multi_ABD_wmma_fastgelu_bf16_i8 gemm_multi_ABD_wmma_fastgelu_bf16_i8.cpp)
|
||||
add_example_executable(example_gemm_multi_ABD_xdl_fp16 gemm_multi_ABD_xdl_fp16.cpp)
|
||||
add_example_executable(example_gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8 gemm_multi_ABD_xdl_bias_fastgelu_bf16_i8.cpp)
|
||||
add_example_executable(example_gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8 gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp)
|
||||
|
||||
@@ -0,0 +1,307 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
|
||||
#include "ck/utility/blkgemmpipe_scheduler.hpp"
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
using B0DataType = I8;
|
||||
using B1DataType = BF16;
|
||||
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = BF16;
|
||||
using D0DataType = BF16;
|
||||
using DsDataType = ck::Tuple<D0DataType>;
|
||||
using EDataType = BF16;
|
||||
|
||||
using A0Layout = Row;
|
||||
using AsLayout = ck::Tuple<A0Layout>;
|
||||
using B0Layout = Row;
|
||||
using B1Layout = B0Layout;
|
||||
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
|
||||
using D0Layout = Row;
|
||||
using DsLayout = ck::Tuple<D0Layout>;
|
||||
using ELayout = Row;
|
||||
|
||||
using Multiply = ck::tensor_operation::element_wise::Multiply;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = Multiply;
|
||||
using CDEElementOp = AddFastGelu;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
|
||||
AsLayout,
|
||||
BsLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
AsDataType,
|
||||
BsDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp,
|
||||
GemmSpec,
|
||||
256,
|
||||
128,
|
||||
128,
|
||||
64,
|
||||
8,
|
||||
8,
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
2,
|
||||
S<8, 32, 1>,
|
||||
S<1, 0, 2>,
|
||||
S<1, 0, 2>,
|
||||
2,
|
||||
8,
|
||||
8,
|
||||
0,
|
||||
S<8, 32, 1>,
|
||||
S<0, 2, 1>,
|
||||
S<0, 2, 1>,
|
||||
1,
|
||||
1,
|
||||
8,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
S<8, 8, 8>,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v3>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 4096;
|
||||
ck::index_t N = 768;
|
||||
ck::index_t K = 6144;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideD = N;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 11)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
K = std::stoi(argv[6]);
|
||||
|
||||
StrideA = std::stoi(argv[7]);
|
||||
StrideB = std::stoi(argv[8]);
|
||||
StrideD = std::stoi(argv[9]);
|
||||
StrideE = std::stoi(argv[10]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB, B1Layout{}));
|
||||
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
|
||||
std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
|
||||
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
|
||||
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
|
||||
b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
|
||||
d_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
|
||||
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
|
||||
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
|
||||
d_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
|
||||
}
|
||||
|
||||
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem d_device_buf(sizeof(D0DataType) * d_m_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
|
||||
|
||||
a0_device_buf.ToDevice(a0_m_k.mData.data());
|
||||
b0_device_buf.ToDevice(b0_k_n.mData.data());
|
||||
b1_device_buf.ToDevice(b1_k_n.mData.data());
|
||||
d_device_buf.ToDevice(d_m_n.mData.data());
|
||||
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
auto a_element_op = AElementOp{};
|
||||
auto b_element_op = BElementOp{};
|
||||
auto cde_element_op = CDEElementOp{};
|
||||
|
||||
constexpr ck::index_t NumATensor = 1;
|
||||
constexpr ck::index_t NumBTensor = 2;
|
||||
constexpr ck::index_t NumDTensor = 1;
|
||||
|
||||
// do GEMM
|
||||
auto device_op = DeviceOpInstance{};
|
||||
auto invoker = device_op.MakeInvoker();
|
||||
auto argument =
|
||||
device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
|
||||
b1_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, NumDTensor>{d_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
std::array<ck::index_t, NumATensor>{StrideA},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB, StrideB},
|
||||
std::array<ck::index_t, NumDTensor>{StrideD},
|
||||
StrideE,
|
||||
1,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
if(!device_op.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_gemm with the specified compilation parameters does "
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_m_n({M, N});
|
||||
|
||||
Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
|
||||
}
|
||||
}
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
|
||||
B1DataType,
|
||||
CShuffleDataType,
|
||||
AccDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
auto ref_gemm = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_gemm.MakeArgument(
|
||||
a0_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,299 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
|
||||
#include "ck/utility/blkgemmpipe_scheduler.hpp"
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
using B0DataType = I8;
|
||||
using B1DataType = BF16;
|
||||
using BsDataType = ck::Tuple<B0DataType, B1DataType>;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F32;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = BF16;
|
||||
|
||||
using A0Layout = Row;
|
||||
using AsLayout = ck::Tuple<A0Layout>;
|
||||
using B0Layout = Row;
|
||||
using B1Layout = B0Layout;
|
||||
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
|
||||
using DsLayout = ck::Tuple<>;
|
||||
using ELayout = Row;
|
||||
|
||||
using Multiply = ck::tensor_operation::element_wise::Multiply;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = Multiply;
|
||||
using CDEElementOp = FastGelu;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
|
||||
AsLayout,
|
||||
BsLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
AsDataType,
|
||||
BsDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp,
|
||||
GemmSpec,
|
||||
256,
|
||||
128,
|
||||
128,
|
||||
64,
|
||||
8,
|
||||
8,
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
2,
|
||||
S<8, 32, 1>,
|
||||
S<1, 0, 2>,
|
||||
S<1, 0, 2>,
|
||||
2,
|
||||
8,
|
||||
8,
|
||||
0,
|
||||
S<8, 32, 1>,
|
||||
S<0, 2, 1>,
|
||||
S<0, 2, 1>,
|
||||
1,
|
||||
1,
|
||||
8,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
S<8, 8, 8>,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v3>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 4096;
|
||||
ck::index_t N = 768;
|
||||
ck::index_t K = 6144;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 11)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
K = std::stoi(argv[6]);
|
||||
|
||||
StrideA = std::stoi(argv[7]);
|
||||
StrideB = std::stoi(argv[8]);
|
||||
StrideE = std::stoi(argv[9]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB, B1Layout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
|
||||
std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
|
||||
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
|
||||
b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{0, 5});
|
||||
break;
|
||||
default:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
|
||||
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
|
||||
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 5});
|
||||
}
|
||||
|
||||
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
|
||||
|
||||
a0_device_buf.ToDevice(a0_m_k.mData.data());
|
||||
b0_device_buf.ToDevice(b0_k_n.mData.data());
|
||||
b1_device_buf.ToDevice(b1_k_n.mData.data());
|
||||
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
auto a_element_op = AElementOp{};
|
||||
auto b_element_op = BElementOp{};
|
||||
auto cde_element_op = CDEElementOp{};
|
||||
|
||||
constexpr ck::index_t NumATensor = 1;
|
||||
constexpr ck::index_t NumBTensor = 2;
|
||||
constexpr ck::index_t NumDTensor = 0;
|
||||
|
||||
// do GEMM
|
||||
auto device_op = DeviceOpInstance{};
|
||||
auto invoker = device_op.MakeInvoker();
|
||||
auto argument =
|
||||
device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
|
||||
b1_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, NumDTensor>{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
std::array<ck::index_t, NumATensor>{StrideA},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB, StrideB},
|
||||
std::array<ck::index_t, NumDTensor>{},
|
||||
StrideE,
|
||||
1,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
if(!device_op.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_gemm with the specified compilation parameters does "
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_m_n({M, N});
|
||||
|
||||
Tensor<A0DataType> a_m_k({M, K});
|
||||
|
||||
Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
|
||||
}
|
||||
}
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
|
||||
B1DataType,
|
||||
CShuffleDataType,
|
||||
AccDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
auto ref_gemm = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_gemm.MakeArgument(
|
||||
a0_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
362
example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
Normal file
362
example/60_gemm_multi_ABD/gemm_multi_ABD_wmma_fp16.cpp
Normal file
@@ -0,0 +1,362 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F32;
|
||||
using DDataType = F16;
|
||||
using EDataType = F16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Row;
|
||||
using DLayout = Row;
|
||||
using ELayout = Row;
|
||||
|
||||
struct AddScale
|
||||
{
|
||||
static constexpr auto I0 = ck::Number<0>{};
|
||||
static constexpr auto I1 = ck::Number<1>{};
|
||||
static constexpr auto I2 = ck::Number<2>{};
|
||||
static constexpr auto I3 = ck::Number<3>{};
|
||||
|
||||
__host__ __device__ constexpr void
|
||||
operator()(ck::half4_t& a, const ck::half4_t& a0, const ck::half4_t& a1) const
|
||||
{
|
||||
const auto a0_v_t = ck::vector_type<ck::half_t, 4>{a0};
|
||||
const auto a1_v_t = ck::vector_type<ck::half_t, 4>{a1};
|
||||
|
||||
auto r_v_t = ck::vector_type<ck::half_t, 4>{};
|
||||
|
||||
r_v_t.AsType<ck::half_t>()(I0) =
|
||||
scale * (a0_v_t.AsType<ck::half_t>()[I0] + a1_v_t.AsType<ck::half_t>()[I0]);
|
||||
r_v_t.AsType<ck::half_t>()(I1) =
|
||||
scale * (a0_v_t.AsType<ck::half_t>()[I1] + a1_v_t.AsType<ck::half_t>()[I1]);
|
||||
r_v_t.AsType<ck::half_t>()(I2) =
|
||||
scale * (a0_v_t.AsType<ck::half_t>()[I2] + a1_v_t.AsType<ck::half_t>()[I2]);
|
||||
r_v_t.AsType<ck::half_t>()(I3) =
|
||||
scale * (a0_v_t.AsType<ck::half_t>()[I3] + a1_v_t.AsType<ck::half_t>()[I3]);
|
||||
|
||||
a = r_v_t.AsType<ck::half4_t>()[I0];
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void
|
||||
operator()(ck::half_t& a, const ck::half_t& a0, const ck::half_t& a1) const
|
||||
{
|
||||
a = scale * (a0 + a1);
|
||||
}
|
||||
|
||||
// this attribute controls the copy_function applying element_wise_op with
|
||||
// pack4_data
|
||||
constexpr const static bool is_pack4_invocable = true;
|
||||
|
||||
float scale = 1.0;
|
||||
};
|
||||
|
||||
struct AlphaBetaAdd
|
||||
{
|
||||
AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
|
||||
|
||||
template <typename E, typename C, typename D>
|
||||
__host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
|
||||
ck::half_t& e, const float& c, const ck::half_t& d) const
|
||||
{
|
||||
e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
|
||||
};
|
||||
|
||||
float alpha_;
|
||||
float beta_;
|
||||
};
|
||||
|
||||
using AElementOp = AddScale;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = AlphaBetaAdd;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
|
||||
ck::Tuple<ALayout, ALayout>,
|
||||
ck::Tuple<BLayout>,
|
||||
ck::Tuple<DLayout>,
|
||||
ELayout,
|
||||
ck::Tuple<ADataType, ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<DDataType>,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp,
|
||||
GemmSpec,
|
||||
256,
|
||||
256,
|
||||
128,
|
||||
32,
|
||||
8,
|
||||
8,
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
4,
|
||||
S<4, 64, 1>,
|
||||
S<1, 0, 2>,
|
||||
S<1, 0, 2>,
|
||||
2,
|
||||
8,
|
||||
8,
|
||||
0,
|
||||
S<4, 64, 1>,
|
||||
S<1, 0, 2>,
|
||||
S<1, 0, 2>,
|
||||
1,
|
||||
1,
|
||||
8,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
S<8, 8, 8>>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 3840;
|
||||
ck::index_t N = 4096;
|
||||
ck::index_t K = 4096;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideD = N;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
float alpha = 1.0f;
|
||||
float beta = 1.0f;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 6)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
alpha = std::stof(argv[4]);
|
||||
beta = std::stof(argv[5]);
|
||||
}
|
||||
else if(argc == 13)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
K = std::stoi(argv[6]);
|
||||
|
||||
StrideA = std::stoi(argv[7]);
|
||||
StrideB = std::stoi(argv[8]);
|
||||
StrideD = std::stoi(argv[9]);
|
||||
StrideE = std::stoi(argv[10]);
|
||||
|
||||
alpha = std::stof(argv[11]);
|
||||
beta = std::stof(argv[12]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 12: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
|
||||
"beta\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<ADataType> a1_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
|
||||
a1_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
|
||||
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
|
||||
d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
a1_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
|
||||
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
|
||||
d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
|
||||
}
|
||||
|
||||
DeviceMem a0_device_buf(sizeof(ADataType) * a0_m_k.mDesc.GetElementSpaceSize());
|
||||
DeviceMem a1_device_buf(sizeof(ADataType) * a1_m_k.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
|
||||
|
||||
a0_device_buf.ToDevice(a0_m_k.mData.data());
|
||||
a1_device_buf.ToDevice(a1_m_k.mData.data());
|
||||
b_device_buf.ToDevice(b_k_n.mData.data());
|
||||
d_device_buf.ToDevice(d_m_n.mData.data());
|
||||
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
auto a_element_op = AElementOp{0.2};
|
||||
auto b_element_op = BElementOp{};
|
||||
auto cde_element_op = CDEElementOp{alpha, beta};
|
||||
|
||||
// do GEMM
|
||||
auto device_op = DeviceOpInstance{};
|
||||
auto invoker = device_op.MakeInvoker();
|
||||
auto argument =
|
||||
device_op.MakeArgument(std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
|
||||
a1_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
std::array<ck::index_t, 2>{StrideA, StrideA},
|
||||
std::array<ck::index_t, 1>{StrideB},
|
||||
std::array<ck::index_t, 1>{StrideD},
|
||||
StrideE,
|
||||
1,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
if(!device_op.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_gemm with the specified compilation parameters does "
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_m_n({M, N});
|
||||
|
||||
Tensor<ADataType> a_m_k({M, K});
|
||||
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
a_element_op(a_m_k(m, k), a0_m_k(m, k), a1_m_k(m, k));
|
||||
}
|
||||
}
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
|
||||
BDataType,
|
||||
CShuffleDataType,
|
||||
AccDataType,
|
||||
PassThrough,
|
||||
BElementOp,
|
||||
PassThrough>;
|
||||
auto ref_gemm = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument =
|
||||
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, b_element_op, PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,296 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_wmma_cshuffle_v3.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
|
||||
#include "ck/utility/blkgemmpipe_scheduler.hpp"
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using I8 = int8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
using A0DataType = BF16;
|
||||
using AsDataType = ck::Tuple<A0DataType>;
|
||||
using B0DataType = I8;
|
||||
using BsDataType = ck::Tuple<B0DataType>;
|
||||
using AccDataType = F32;
|
||||
using CShuffleDataType = F32;
|
||||
using D0DataType = BF16;
|
||||
using D1DataType = BF16;
|
||||
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
|
||||
using EDataType = BF16;
|
||||
|
||||
using A0Layout = Row;
|
||||
using AsLayout = ck::Tuple<A0Layout>;
|
||||
using B0Layout = Row;
|
||||
using BsLayout = ck::Tuple<B0Layout>;
|
||||
using D0Layout = Row;
|
||||
using D1Layout = D0Layout;
|
||||
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
|
||||
using ELayout = Row;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = MultiplyAddFastGelu;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Wmma_CShuffleV3<
|
||||
AsLayout,
|
||||
BsLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
AsDataType,
|
||||
BsDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp,
|
||||
GemmSpec,
|
||||
256,
|
||||
128,
|
||||
128,
|
||||
64,
|
||||
8,
|
||||
8,
|
||||
16,
|
||||
16,
|
||||
4,
|
||||
2,
|
||||
S<8, 32, 1>,
|
||||
S<1, 0, 2>,
|
||||
S<1, 0, 2>,
|
||||
2,
|
||||
8,
|
||||
8,
|
||||
0,
|
||||
S<8, 32, 1>,
|
||||
S<0, 2, 1>,
|
||||
S<0, 2, 1>,
|
||||
1,
|
||||
1,
|
||||
8,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
S<8, 8, 8>,
|
||||
ck::BlockGemmPipelineScheduler::Intrawave,
|
||||
ck::BlockGemmPipelineVersion::v3>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 4096;
|
||||
ck::index_t N = 768;
|
||||
ck::index_t K = 6144;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideD = N;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 11)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
|
||||
M = std::stoi(argv[4]);
|
||||
N = std::stoi(argv[5]);
|
||||
K = std::stoi(argv[6]);
|
||||
|
||||
StrideA = std::stoi(argv[7]);
|
||||
StrideB = std::stoi(argv[8]);
|
||||
StrideD = std::stoi(argv[9]);
|
||||
StrideE = std::stoi(argv[10]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
|
||||
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
|
||||
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
|
||||
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-5, 5});
|
||||
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
|
||||
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
|
||||
d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
|
||||
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
|
||||
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
|
||||
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{-0.5, 0.5});
|
||||
}
|
||||
|
||||
DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
|
||||
|
||||
a0_device_buf.ToDevice(a0_m_k.mData.data());
|
||||
b0_device_buf.ToDevice(b0_k_n.mData.data());
|
||||
d0_device_buf.ToDevice(d0_m_n.mData.data());
|
||||
d1_device_buf.ToDevice(d1_m_n.mData.data());
|
||||
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
auto a_element_op = AElementOp{};
|
||||
auto b_element_op = BElementOp{};
|
||||
auto cde_element_op = CDEElementOp{};
|
||||
|
||||
constexpr ck::index_t NumATensor = 1;
|
||||
constexpr ck::index_t NumBTensor = 1;
|
||||
constexpr ck::index_t NumDTensor = 2;
|
||||
|
||||
// do GEMM
|
||||
auto device_op = DeviceOpInstance{};
|
||||
auto invoker = device_op.MakeInvoker();
|
||||
auto argument =
|
||||
device_op.MakeArgument(std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer()},
|
||||
std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
|
||||
d1_device_buf.GetDeviceBuffer()},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
std::array<ck::index_t, NumATensor>{StrideA},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB},
|
||||
std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
|
||||
StrideE,
|
||||
1,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
if(!device_op.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_gemm with the specified compilation parameters does "
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
|
||||
<< std::endl;
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_m_n({M, N});
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
|
||||
B0DataType,
|
||||
CShuffleDataType,
|
||||
AccDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
auto ref_gemm = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_gemm.MakeArgument(
|
||||
a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
for(int m = 0; m < M; ++m)
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -81,10 +81,11 @@ int main(int argc, char* argv[])
|
||||
ck::index_t N = 768;
|
||||
ck::index_t K = 6144;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideD = 0;
|
||||
ck::index_t StrideE = N;
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideB1 = 0;
|
||||
ck::index_t StrideD = 0;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
@@ -120,23 +121,31 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
auto f_host_tensor_descriptor = [](std::size_t row,
|
||||
std::size_t col,
|
||||
ck::index_t& stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
|
||||
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
|
||||
N,
|
||||
K,
|
||||
std::array<ck::index_t, NumATensor>{StrideA},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB, 0},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB, StrideB1},
|
||||
std::array<ck::index_t, NumDTensor>{StrideD},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
|
||||
@@ -81,10 +81,11 @@ int main(int argc, char* argv[])
|
||||
ck::index_t N = 768;
|
||||
ck::index_t K = 6144;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideD = 0;
|
||||
ck::index_t StrideE = N;
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideB1 = 0;
|
||||
ck::index_t StrideD = 0;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
@@ -120,23 +121,31 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
auto f_host_tensor_descriptor = [](std::size_t row,
|
||||
std::size_t col,
|
||||
ck::index_t& stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
|
||||
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
|
||||
N,
|
||||
K,
|
||||
std::array<ck::index_t, NumATensor>{StrideA},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB, 0},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB, StrideB1},
|
||||
std::array<ck::index_t, NumDTensor>{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
|
||||
@@ -80,10 +80,11 @@ int main(int argc, char* argv[])
|
||||
ck::index_t N = 768;
|
||||
ck::index_t K = 6144;
|
||||
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideD = 0;
|
||||
ck::index_t StrideE = N;
|
||||
ck::index_t StrideA = K;
|
||||
ck::index_t StrideB = N;
|
||||
ck::index_t StrideB1 = 0;
|
||||
ck::index_t StrideD = 0;
|
||||
ck::index_t StrideE = N;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
@@ -119,23 +120,31 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
auto f_host_tensor_descriptor = [](std::size_t row,
|
||||
std::size_t col,
|
||||
ck::index_t& stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[0];
|
||||
return desc;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
|
||||
if(stride <= 0)
|
||||
stride = desc.GetStrides()[1];
|
||||
return desc;
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
|
||||
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, 0, B1Layout{}));
|
||||
Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor(K, N, StrideB1, B1Layout{}));
|
||||
Tensor<D0DataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
@@ -196,7 +205,7 @@ int main(int argc, char* argv[])
|
||||
K,
|
||||
std::array<ck::index_t, NumATensor>{StrideA},
|
||||
std::array<ck::index_t, NumBTensor>{StrideB},
|
||||
std::array<ck::index_t, NumDTensor>{0, StrideD},
|
||||
std::array<ck::index_t, NumDTensor>{StrideB1, StrideD},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
@@ -261,7 +270,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
for(int n = 0; n < N; ++n)
|
||||
{
|
||||
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(0, n), d_m_n(m, n));
|
||||
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), b1_k_n(m, n), d_m_n(m, n));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,9 @@
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -160,12 +163,12 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
|
||||
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
|
||||
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides, Bypass{});
|
||||
Tensor<BDataType> b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides, Row{});
|
||||
Tensor<EDataType> d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
|
||||
std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
|
||||
@@ -264,9 +267,9 @@ int main(int argc, char* argv[])
|
||||
if(do_verification)
|
||||
{
|
||||
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
|
||||
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
|
||||
|
||||
for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
|
||||
{
|
||||
@@ -299,7 +302,6 @@ int main(int argc, char* argv[])
|
||||
auto ref_op = ReferenceOpInstance{};
|
||||
auto ref_invoker = ref_op.MakeInvoker();
|
||||
|
||||
Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
|
||||
auto ref_argument =
|
||||
ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, PassThrough{}, b_element_op);
|
||||
|
||||
|
||||
@@ -19,6 +19,9 @@
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/numeric.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
@@ -140,12 +143,12 @@ int main(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
|
||||
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides);
|
||||
Tensor<B0DataType> b0_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides);
|
||||
Tensor<B1DataType> b1_ns_ks(b1_ns_ks_lengths, b1_ns_ks_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<A0DataType> a0_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
|
||||
Tensor<A1DataType> a1_ms_ks(a1_ms_ks_lengths, a1_ms_ks_strides, Bypass{});
|
||||
Tensor<B0DataType> b0_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides, Row{});
|
||||
Tensor<B1DataType> b1_ns_ks(b1_ns_ks_lengths, b1_ns_ks_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
Tensor<EDataType> e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
std::cout << "a0_ms_ks: " << a0_ms_ks.mDesc << std::endl;
|
||||
std::cout << "a1_ms_ks: " << a1_ms_ks.mDesc << std::endl;
|
||||
@@ -246,9 +249,9 @@ int main(int argc, char* argv[])
|
||||
if(do_verification)
|
||||
{
|
||||
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides, Row{});
|
||||
|
||||
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides);
|
||||
Tensor<A0DataType> a_ms_ks(a0_ms_ks_lengths, a0_ms_ks_strides, Row{});
|
||||
|
||||
for(size_t m0 = 0; m0 < a_ms_ks.mDesc.GetLengths()[0]; ++m0)
|
||||
{
|
||||
@@ -266,7 +269,7 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
Tensor<B0DataType> b_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides);
|
||||
Tensor<B0DataType> b_ns_ks(b0_ns_ks_lengths, b0_ns_ks_strides, Row{});
|
||||
|
||||
for(size_t n0 = 0; n0 < b_ns_ks.mDesc.GetLengths()[0]; ++n0)
|
||||
{
|
||||
|
||||
@@ -130,11 +130,12 @@ bool run_grouped_conv(bool do_verification,
|
||||
// Fill other lenghts than G,K with 1 and strides with 0
|
||||
bias_g_k_lengths.fill(1);
|
||||
bias_g_k_strides.fill(0);
|
||||
bias_g_k_lengths[0] = G;
|
||||
bias_g_k_lengths[2] = K;
|
||||
bias_g_k_strides[0] = K; // stride to G
|
||||
bias_g_k_strides[2] = 1; // stride to K
|
||||
const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
|
||||
bias_g_k_lengths[0] = G;
|
||||
bias_g_k_lengths[2] = K;
|
||||
bias_g_k_strides[0] = K; // stride to G
|
||||
bias_g_k_strides[2] = 1; // stride to K
|
||||
const auto broadcasted_bias_desc =
|
||||
HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides, BiasLayout{});
|
||||
|
||||
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
|
||||
@@ -28,7 +28,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<QuantDataType> quant_b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
// assume scale tensor is [1, n]
|
||||
Tensor<ScaleDataType> scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{}));
|
||||
Tensor<ScaleDataType> scale_k_n(
|
||||
HostTensorDescriptor({K, N}, {0, 1_uz}, ck::tensor_layout::BypassLayoutVerification()));
|
||||
|
||||
switch(config.init_method)
|
||||
{
|
||||
|
||||
@@ -241,6 +241,28 @@ int main(int argc, char* argv[])
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
// Update strides based on tensor properties if they are <= 0
|
||||
auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
|
||||
if(current_stride <= 0)
|
||||
{
|
||||
if constexpr(std::is_same_v<decltype(layout), Row>)
|
||||
{
|
||||
return tensor.GetStrides()[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
return tensor.GetStrides()[1];
|
||||
}
|
||||
}
|
||||
return current_stride;
|
||||
};
|
||||
|
||||
StrideA = get_stride(a0_m_k, A0Layout{}, StrideA);
|
||||
StrideB = get_stride(b0_k_n, B0Layout{}, StrideB);
|
||||
ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
|
||||
ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
|
||||
StrideE = get_stride(e_m_n_host_result, ELayout{}, StrideE);
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
|
||||
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
|
||||
@@ -285,8 +307,6 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr ck::index_t NumDTensor = DsDataType::Size();
|
||||
|
||||
constexpr auto I0 = ck::Number<0>{};
|
||||
|
||||
// do GEMM
|
||||
auto device_op = DeviceOpInstance{};
|
||||
|
||||
@@ -308,7 +328,7 @@ int main(int argc, char* argv[])
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, NumDTensor>{I0, I0},
|
||||
std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
KBatch,
|
||||
a_element_op,
|
||||
|
||||
@@ -162,6 +162,28 @@ int main(int argc, char* argv[])
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
// Update strides based on tensor properties if they are <= 0
|
||||
auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
|
||||
if(current_stride <= 0)
|
||||
{
|
||||
if constexpr(std::is_same_v<decltype(layout), Row>)
|
||||
{
|
||||
return tensor.GetStrides()[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
return tensor.GetStrides()[1];
|
||||
}
|
||||
}
|
||||
return current_stride;
|
||||
};
|
||||
|
||||
StrideA = get_stride(a0_m_k, A0Layout{}, StrideA);
|
||||
StrideB = get_stride(b0_k_n, B0Layout{}, StrideB);
|
||||
ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
|
||||
ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
|
||||
StrideE = get_stride(e_m_n_host_result, ELayout{}, StrideE);
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
|
||||
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
|
||||
@@ -216,7 +238,7 @@ int main(int argc, char* argv[])
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
|
||||
std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
KBatch,
|
||||
a_element_op,
|
||||
|
||||
@@ -251,6 +251,28 @@ int main(int argc, char* argv[])
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
// Update strides based on tensor properties if they are <= 0
|
||||
auto get_stride = [](auto& tensor, auto layout, ck::index_t current_stride) -> ck::index_t {
|
||||
if(current_stride <= 0)
|
||||
{
|
||||
if constexpr(std::is_same_v<decltype(layout), Row>)
|
||||
{
|
||||
return tensor.GetStrides()[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
return tensor.GetStrides()[1];
|
||||
}
|
||||
}
|
||||
return current_stride;
|
||||
};
|
||||
|
||||
StrideA = get_stride(a0_m_k, A0Layout{}, StrideA);
|
||||
StrideB = get_stride(b0_k_n, B0Layout{}, StrideB);
|
||||
ck::index_t StrideD0 = get_stride(d0_m_n, D0Layout{}, StrideD);
|
||||
ck::index_t StrideD1 = get_stride(d1_m_n, D1Layout{}, StrideD);
|
||||
StrideE = get_stride(e_m_n_host_result, ELayout{}, StrideE);
|
||||
|
||||
std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
|
||||
std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
|
||||
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
|
||||
@@ -295,8 +317,6 @@ int main(int argc, char* argv[])
|
||||
|
||||
constexpr ck::index_t NumDTensor = DsDataType::Size();
|
||||
|
||||
constexpr auto I0 = ck::Number<0>{};
|
||||
|
||||
// do GEMM
|
||||
auto device_op = DeviceOpInstance{};
|
||||
|
||||
@@ -318,7 +338,7 @@ int main(int argc, char* argv[])
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
std::array<ck::index_t, NumDTensor>{I0, I0},
|
||||
std::array<ck::index_t, NumDTensor>{StrideD0, StrideD1},
|
||||
StrideE,
|
||||
KBatch,
|
||||
a_element_op,
|
||||
|
||||
@@ -287,15 +287,18 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_e_n_k(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
|
||||
Tensor<D1DataType> d1_e_n(
|
||||
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
Tensor<EDataType> e_t_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_n_device_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
|
||||
std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
|
||||
std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
|
||||
@@ -422,7 +425,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
e_device_buf.FromDevice(e_t_n_device_result.mData.data());
|
||||
|
||||
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
|
||||
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
|
||||
B0DataType,
|
||||
|
||||
@@ -301,18 +301,22 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
|
||||
Tensor<A1DataType> a1_t_k(HostTensorDescriptor(
|
||||
{tokens, (K + Scale_Block_K - 1) / Scale_Block_K}, {Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
{tokens, (K + Scale_Block_K - 1) / Scale_Block_K}, {Scale_Stride_AM, 1}, Row{}));
|
||||
Tensor<B0DataType> b0_e_n_k(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<B1DataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts,
|
||||
(K + Scale_Block_K - 1) / Scale_Block_K,
|
||||
(N + Scale_Block_N - 1) / Scale_Block_N * 2},
|
||||
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
Tensor<EDataType> e_t_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_n_device_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
e_t_n_device_result.SetZero();
|
||||
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
|
||||
std::cout << "a1_t_k: " << a1_t_k.mDesc << std::endl;
|
||||
@@ -463,7 +467,7 @@ int main(int argc, char* argv[])
|
||||
Tensor<float> b_e_n_k({experts, K, N * 2});
|
||||
e_device_buf.FromDevice(e_t_n_device_result.mData.data());
|
||||
|
||||
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
|
||||
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
|
||||
|
||||
// handle scale before ref.
|
||||
for(int t = 0; t < tokens; ++t)
|
||||
|
||||
@@ -264,15 +264,18 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
|
||||
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_e_n_k(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
|
||||
Tensor<D1DataType> d1_e_n(
|
||||
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
Tensor<EDataType> e_t_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_n_device_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
|
||||
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
|
||||
std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
|
||||
@@ -488,7 +491,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
e_device_buf.FromDevice(e_t_n_device_result.mData.data());
|
||||
|
||||
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
|
||||
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
|
||||
B0DataType,
|
||||
|
||||
@@ -28,8 +28,9 @@ using F16 = ck::half_t;
|
||||
using F8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F8;
|
||||
using B0DataType = F8;
|
||||
@@ -278,11 +279,11 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<D0DataType> d0_t_n(
|
||||
HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}, Bypass{}));
|
||||
Tensor<D1DataType> d1_e_n(
|
||||
HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
|
||||
@@ -292,17 +292,19 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
|
||||
Tensor<A1DataType> a1_t_k_k(
|
||||
HostTensorDescriptor({tokens, topk, (K + Scale_Block_K - 1) / Scale_Block_K},
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
|
||||
Row{}));
|
||||
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<B1DataType> b1_e_n_k(HostTensorDescriptor(
|
||||
{experts, (K + Scale_Block_K - 1) / Scale_Block_K, (N + Scale_Block_N - 1) / Scale_Block_N},
|
||||
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(Scale_Stride_B * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
@@ -29,8 +29,9 @@ using F16 = ck::half_t;
|
||||
using F8 = ck::f8_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
using Bypass = ck::tensor_layout::BypassLayoutVerification;
|
||||
|
||||
using A0DataType = F8;
|
||||
using B0DataType = I4;
|
||||
@@ -239,10 +240,10 @@ int main(int argc, char* argv[])
|
||||
sorted_token_ids.mData[i] = tokens;
|
||||
}
|
||||
}
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
|
||||
Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
@@ -95,25 +95,26 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
using DefaultLayout = ck::tensor_layout::gemm::RowMajor;
|
||||
// For Real Part of Complex Tensor
|
||||
Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides);
|
||||
Tensor<ADataType> a_ms_ks_re(a_ms_ks_lengths, a_ms_ks_strides, DefaultLayout{});
|
||||
Tensor<BDataType> b_ns_ks_re(b_ns_ks_lengths, b_ns_ks_strides, DefaultLayout{});
|
||||
Tensor<EDataType> d_ms_ns_re(d_ms_ns_lengths, d_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
Tensor<EDataType> e_ms_ns_device_result_re(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
// For Imaginary Part of Complex Tensor
|
||||
Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides);
|
||||
Tensor<ADataType> a_ms_ks_img(a_ms_ks_lengths, a_ms_ks_strides, DefaultLayout{});
|
||||
Tensor<BDataType> b_ns_ks_img(b_ns_ks_lengths, b_ns_ks_strides, DefaultLayout{});
|
||||
Tensor<EDataType> d_ms_ns_img(d_ms_ns_lengths, d_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
Tensor<EDataType> e_ms_ns_device_result_img(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
// Intermediate E tensor Definition
|
||||
Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<EDataType> e_ms_ns_device_result_re1(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
Tensor<EDataType> e_ms_ns_device_result_img1(e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
std::cout << "a_ms_ks_re: " << a_ms_ks_re.mDesc << std::endl;
|
||||
std::cout << "b_ns_ks_re: " << b_ns_ks_re.mDesc << std::endl;
|
||||
@@ -349,8 +350,10 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
|
||||
if(do_verification)
|
||||
{
|
||||
// Real Part Verification
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_re(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_re1(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_re(
|
||||
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_re1(
|
||||
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
using ReferenceOpInstance =
|
||||
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
|
||||
@@ -422,8 +425,10 @@ int run_complex_contraction_bilinear_example(int argc, char* argv[])
|
||||
isRealOk = ck::utils::check_err(e_ms_ns_device_result_re, e_ms_ns_host_result_re) ? 0 : 1;
|
||||
|
||||
// Img Part Verification
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_img(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_img1(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_img(
|
||||
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
Tensor<CShuffleDataType> c_ms_ns_host_result_img1(
|
||||
e_ms_ns_lengths, e_ms_ns_strides, DefaultLayout{});
|
||||
|
||||
auto ref_argument_img = ref_op.MakeArgument(
|
||||
a_ms_ks_re, b_ns_ks_img, c_ms_ns_host_result_img, a_element_op, b_element_op);
|
||||
|
||||
@@ -269,10 +269,12 @@ int main(int argc, char* argv[])
|
||||
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
|
||||
Tensor<XDataType> a1_t_k(HostTensorDescriptor(
|
||||
{tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_e_n_k(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<XDataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
|
||||
// A, B Scale preshuffle
|
||||
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
|
||||
@@ -281,12 +283,13 @@ int main(int argc, char* argv[])
|
||||
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<XDataType> b_scale_preshuffled(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_k_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_k_n_device_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
|
||||
e_t_k_n_device_result.SetZero();
|
||||
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
|
||||
@@ -480,7 +483,7 @@ int main(int argc, char* argv[])
|
||||
e_device_buf.ToDevice(e_t_k_n_device_result.mData.data());
|
||||
invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
|
||||
|
||||
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
|
||||
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
|
||||
|
||||
using ReferenceGemmInstance =
|
||||
ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,
|
||||
|
||||
@@ -266,10 +266,12 @@ int main(int argc, char* argv[])
|
||||
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
|
||||
Tensor<XDataType> a1_t_k(HostTensorDescriptor(
|
||||
{tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_e_n_k(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<XDataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
|
||||
// A, B Scale preshuffle
|
||||
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
|
||||
@@ -278,12 +280,13 @@ int main(int argc, char* argv[])
|
||||
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<XDataType> b_scale_preshuffled(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_k_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_k_n_device_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
|
||||
e_t_k_n_device_result.SetZero();
|
||||
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
|
||||
@@ -477,7 +480,7 @@ int main(int argc, char* argv[])
|
||||
e_device_buf.ToDevice(e_t_k_n_device_result.mData.data());
|
||||
invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
|
||||
|
||||
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
|
||||
Tensor<CShuffleDataType> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
|
||||
|
||||
using ReferenceGemmInstance =
|
||||
ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,
|
||||
|
||||
@@ -296,12 +296,15 @@ int main(int argc, char* argv[])
|
||||
Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
|
||||
Tensor<XDataType> a1_t_k(HostTensorDescriptor(
|
||||
{tokens, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_e_n_k(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
Tensor<XDataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(N * 2 * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
// B preshuffle
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(
|
||||
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
|
||||
|
||||
// A, B Scale preshuffle
|
||||
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
|
||||
@@ -310,12 +313,13 @@ int main(int argc, char* argv[])
|
||||
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<XDataType> b_scale_preshuffled(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN}));
|
||||
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_k_n_host_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
Tensor<EDataType> e_t_k_n_device_result(
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}));
|
||||
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
|
||||
|
||||
e_t_k_n_device_result.SetZero();
|
||||
std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
|
||||
@@ -506,7 +510,7 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
invoker.Run(argument, StreamConfig{nullptr, false, 0, 0, 1});
|
||||
|
||||
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1});
|
||||
Tensor<float> c_t_k_n({tokens, topk, N}, {topk * N, N, 1}, Row{});
|
||||
|
||||
using ReferenceGemmInstance =
|
||||
ck::tensor_operation::host::ReferenceMoeMXGemm1<A0DataType,
|
||||
|
||||
@@ -270,14 +270,16 @@ int main(int argc, char* argv[])
|
||||
|
||||
expert_ids.savetxt("expert_ids.txt", "int");
|
||||
sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
|
||||
Tensor<XDataType> a1_t_k_k(
|
||||
HostTensorDescriptor({tokens, topk, (K + ScaleBlockSize - 1) / ScaleBlockSize},
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
|
||||
Row{}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<XDataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{(N * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(N * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
|
||||
// A, B Scale preshuffle
|
||||
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
|
||||
@@ -286,7 +288,8 @@ int main(int argc, char* argv[])
|
||||
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<XDataType> b_scale_preshuffled(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN}));
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
@@ -268,16 +268,18 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
|
||||
Tensor<XDataType> a1_t_k_k(
|
||||
HostTensorDescriptor({tokens, topk, (K + ScaleBlockSize - 1) / ScaleBlockSize},
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
|
||||
Row{}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<XDataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{(N * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(N * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
// B preshuffle
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
|
||||
// A, B Scale preshuffle
|
||||
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
|
||||
@@ -286,7 +288,8 @@ int main(int argc, char* argv[])
|
||||
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<XDataType> b_scale_preshuffled(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN}));
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
@@ -303,16 +303,18 @@ int main(int argc, char* argv[])
|
||||
|
||||
expert_ids.savetxt("expert_ids.txt", "int");
|
||||
sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
|
||||
Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}, Row{}));
|
||||
Tensor<XDataType> a1_t_k_k(
|
||||
HostTensorDescriptor({tokens, topk, (K + ScaleBlockSize - 1) / ScaleBlockSize},
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
{(topk * Scale_Stride_AM), Scale_Stride_AM, 1},
|
||||
Row{}));
|
||||
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
Tensor<XDataType> b1_e_n_k(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{(N * Scale_Stride_BN), 1, Scale_Stride_BN}));
|
||||
{(N * Scale_Stride_BN), 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
// B preshuffle
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
|
||||
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
|
||||
|
||||
// A, B Scale preshuffle
|
||||
Tensor<XDataType> a_scale_sorted(HostTensorDescriptor(
|
||||
@@ -321,7 +323,8 @@ int main(int argc, char* argv[])
|
||||
{sorted_size, (K + ScaleBlockSize - 1) / ScaleBlockSize}, {Scale_Stride_AM, 1}));
|
||||
Tensor<XDataType> b_scale_preshuffled(
|
||||
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN}));
|
||||
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
|
||||
Col{}));
|
||||
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
|
||||
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
|
||||
|
||||
@@ -47,7 +47,7 @@ set(FMHA_BWD_CODE_GEN_COMMON_ARGS
|
||||
${CMAKE_CURRENT_LIST_DIR}/generate.py
|
||||
--api bwd
|
||||
--receipt 3
|
||||
--optdim 32,64,128,256
|
||||
--optdim 32,64,96,128,256
|
||||
# --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd...
|
||||
)
|
||||
|
||||
@@ -169,6 +169,10 @@ if(CK_USE_OCP_FP8)
|
||||
list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
|
||||
endif()
|
||||
|
||||
# use RTN_ASM on float to bfloat16 conversion by default, align with FA upstream
|
||||
list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
|
||||
list(APPEND FMHA_BWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
|
||||
|
||||
target_compile_options(${FMHA_FWD_INSTANCES}
|
||||
PRIVATE ${FMHA_FWD_PRIVATE_COMPILE_OPTIONS}
|
||||
INTERFACE ${FMHA_FWD_INTERFACE_COMPILE_OPTIONS})
|
||||
|
||||
@@ -36,6 +36,13 @@ args:
|
||||
total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
|
||||
also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
|
||||
-s_k seqlen_k (including new key/value), -1 means equal to s (default:-1)
|
||||
also with "-s_k=s0,s1,s2..." comma-separated ints to set seqlen per batch (group mode)
|
||||
-s_qpad seqlen_q stride between 2 batches (group-mode optional) (default:-1)
|
||||
Provide positive strides per-batch to simulate physical padding on Q
|
||||
-s_kpad seqlen_k stride between 2 batches, currently used in group-mode only (default:-1)
|
||||
for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride
|
||||
along seqlen, instead of packed, same as xformer kv_padding,
|
||||
must be greater than or equal to s_k
|
||||
-d head dim for q, k (default:128)
|
||||
-d_v head dim for v, -1 means equal to d (default:-1)
|
||||
-scale_s scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0)
|
||||
@@ -76,11 +83,20 @@ args:
|
||||
-repeat number of iterations to benchmark the kernel (default:20)
|
||||
-json 0: No Json, 1: Dump Results in Json format (default:0)
|
||||
-jsonfile json file name to dump results (default:fmha_fwd.json)
|
||||
-q_eff_lens Batch-mode only: per-batch effective seqlen for Q (exclude PAD) (default:"")
|
||||
Comma-separated list of length 'b'. If empty, no override
|
||||
-kv_eff_lens Batch-mode only: per-batch effective seqlen for KV (exclude PAD) (default:"")
|
||||
Comma-separated list of length 'b'. If empty, no override
|
||||
```
|
||||
Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
|
||||
Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with
|
||||
batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=0 (in GPU memory), drop_offset=1234 (in GPU memory) fp16 case
|
||||
|
||||
## Padding Examples
|
||||
Example 3 (Group mode with padding): `./bin/tile_example_fmha_fwd -mode=1 -b=2 -h=8 -s=1024,2048 -s_k=1024,2048 -s_qpad=1536,3072 -s_kpad=1536,3072 -d=128` will run group mode with 2 batches having different sequence lengths (1024, 2048) but physically padded to (1536, 3072) respectively.
|
||||
|
||||
Example 4 (Batch mode with effective lengths): `./bin/tile_example_fmha_fwd -mode=0 -b=2 -h=8 -s=2048 -s_k=2048 -d=128 -q_eff_lens=1024,1536 -kv_eff_lens=1024,1536` will run batch mode where all batches use 2048 as physical sequence length but have effective lengths of (1024, 1536) for Q and KV respectively.
|
||||
|
||||
## support features
|
||||
Currently we are still in rapid development stage, so more features/optimizations will be coming soon.
|
||||
|
||||
@@ -128,6 +144,15 @@ Note FA use bottom-right by default to express swa case, here we require you exp
|
||||
### dropout
|
||||
TBD
|
||||
|
||||
### sequence padding and variable length support
|
||||
We support sequence padding and variable-length processing in both batch and group modes fmha forward to handle real-world scenarios where sequences have different lengths.
|
||||
|
||||
**Group Mode Padding**: Use `-s_qpad` and `-s_kpad` to specify physical stride between batches, enabling padded layouts. Each batch can have different logical sequence lengths (`-s`, `-s_k`) but use larger physical strides for memory alignment.
|
||||
|
||||
**Batch Mode Variable Length**: Use `-q_eff_lens` and `-kv_eff_lens` to specify effective sequence lengths per batch. All batches share the same physical sequence length, but the kernel processes only the effective portions. This enables efficient variable-length attention without memory waste.
|
||||
|
||||
Both approaches optimize memory access patterns while supporting flexible sequence length requirements commonly found in transformer inference scenarios.
|
||||
|
||||
## FP8 experimental support
|
||||
As described in [this blog](https://blog.hippoml.com/8bit-hippoattention-up-to-3x-faster-compared-to-flashattentionv2-8f9def90b482), we have an experimental support for fp8 fmha kernels, you can evaluate the performance by setting the arg `-prec=fp8` to the `tile_example_fmha_fwd`, on a gfx942 machine and ROCm 6.0+.
|
||||
|
||||
|
||||
@@ -50,16 +50,10 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape<fmha_block_tile_{F_idx}
|
||||
fmha_warp_tile2_{F_idx},
|
||||
{F_maxq}>;
|
||||
|
||||
using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<false, /* kPadSeqLenQ */
|
||||
false, /* kPadSeqLenK */
|
||||
{F_dpad},
|
||||
using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaBwdTraits<{F_dpad},
|
||||
{F_dvpad},
|
||||
false,
|
||||
{F_bias},
|
||||
{F_dbias},
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
{F_occupancy}>;
|
||||
using fmha_mask_{F_idx} = {F_mask};
|
||||
using fmha_dropout_{F_idx} = {F_dropout};
|
||||
@@ -94,19 +88,19 @@ using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
|
||||
ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
|
||||
typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType,
|
||||
false,
|
||||
{F_dpad}>>;
|
||||
({F_dpad} > 0)>>;
|
||||
|
||||
using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
|
||||
ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
|
||||
typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType,
|
||||
false,
|
||||
{F_dvpad}>>;
|
||||
({F_dvpad} > 0)>>;
|
||||
|
||||
using fmha_bwd_dq_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
|
||||
ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
|
||||
typename FmhaBwdTypeConfig<{F_dtype}>::QGradDataType,
|
||||
false,
|
||||
{F_dpad}>>;
|
||||
({F_dpad} > 0)>>;
|
||||
|
||||
using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
|
||||
ck_tile::FmhaBwdDQDKDVKernel<fmha_bwd_pipeline_{F_idx},
|
||||
@@ -200,7 +194,7 @@ float fmha_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a)
|
||||
|
||||
template <>
|
||||
float fmha_bwd<2>(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){{
|
||||
const bool has_load_tr = ck_tile::is_load_tr_supported();
|
||||
[[maybe_unused]] const bool has_load_tr = ck_tile::is_load_tr_supported();
|
||||
float r = -1;
|
||||
{F_dispatch}
|
||||
return r;
|
||||
@@ -220,9 +214,9 @@ def FMHA_BWD_API_COND_STATEMENT(F_cond: str, F_body: str, *, indent=0, if_ = 0)
|
||||
FMHA_BWD_API_INNER_DISPATCH="""
|
||||
{F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) &&
|
||||
({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic}){F_cond_extra}) {{
|
||||
using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>;
|
||||
using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, ({F_dvpad} > 0)>;
|
||||
using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}, {F_trload}, {F_maxq}, {F_bn0}>;
|
||||
using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}, {F_convert_dq_bn0}>;
|
||||
using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, ({F_dpad} > 0), {F_deterministic}, {F_convert_dq_bn0}>;
|
||||
r = fmha_bwd_<dot_do_o_trait_, dq_dk_dv_trait_, std::conditional_t<{F_convert_dq_enabled}, convert_dq_trait_, void>>(s, a);
|
||||
return r;
|
||||
}}
|
||||
@@ -278,8 +272,8 @@ class FmhaBwdDQDKDVKernel:
|
||||
F_hdim : int # hdim
|
||||
F_dtype : str # data type
|
||||
F_tile : FmhaBwdDQDKDVTileSize
|
||||
F_dpad : str #
|
||||
F_dvpad : str #
|
||||
F_dpad : Literal[0, 8 ,1]
|
||||
F_dvpad : Literal[0, 8 ,1]
|
||||
F_bias : str #
|
||||
F_dbias : str #
|
||||
F_dropout : str #
|
||||
@@ -320,8 +314,8 @@ class FmhaBwdDQDKDVKernel:
|
||||
F_wm1 = self.F_tile.F_wm1,
|
||||
F_wn1 = self.F_tile.F_wn1,
|
||||
F_wk1 = self.F_tile.F_wk1,
|
||||
F_dpad = BOOL_MAP[self.F_dpad],
|
||||
F_dvpad = BOOL_MAP[self.F_dvpad],
|
||||
F_dpad = self.F_dpad,
|
||||
F_dvpad = self.F_dvpad,
|
||||
F_bias = BIAS_MAP[self.F_bias],
|
||||
F_dbias = BOOL_MAP[self.F_dbias],
|
||||
F_dropout = DROPOUT_MAP[self.F_dropout],
|
||||
@@ -337,8 +331,8 @@ class FmhaBwdDQDKDVKernel:
|
||||
def name(self) -> str:
|
||||
def pad_name() -> str:
|
||||
n = ''
|
||||
if self.F_dpad == 't' : n += 'd'
|
||||
if self.F_dvpad == 't' : n += 'dv'
|
||||
if self.F_dpad : n += f'd{self.F_dpad}'
|
||||
if self.F_dvpad : n += f'dv{self.F_dvpad}'
|
||||
if n != '' : n = 'p' + n
|
||||
return n
|
||||
pn = pad_name()
|
||||
@@ -380,6 +374,7 @@ def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize]
|
||||
return [
|
||||
FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
|
||||
FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
|
||||
FmhaBwdDQDKDVTileSize( 32, 128, 96, 32, 96, 32, 32, 96, 96, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
|
||||
FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
|
||||
# FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
|
||||
FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
|
||||
@@ -621,8 +616,8 @@ class FmhaBwdApiTrait:
|
||||
dbias : str
|
||||
dropout : str
|
||||
spad1d : str # spad for 1d kernels (dot/convert)
|
||||
dpad : str
|
||||
dvpad : str
|
||||
dpad : Literal[0, 1, 8]
|
||||
dvpad : Literal[0, 1, 8]
|
||||
deterministic : str
|
||||
mask_impl : str
|
||||
tr_load : str
|
||||
@@ -651,13 +646,13 @@ class FmhaBwdApiTrait:
|
||||
|
||||
@property
|
||||
def dcheck(self) -> str:
|
||||
if self.dpad == 't': return f'a.hdim_q % {self.bhdq} != 0'
|
||||
else : return f'a.hdim_q % {self.bhdq} == 0'
|
||||
if self.dpad == 0: return f'a.hdim_q % {self.bhdq} == 0'
|
||||
else: return f'a.hdim_q % {self.dpad} == 0'
|
||||
|
||||
@property
|
||||
def dvcheck(self) -> str:
|
||||
if self.dvpad == 't': return f'a.hdim_v % {self.bhdv} != 0'
|
||||
else : return f'a.hdim_v % {self.bhdv} == 0'
|
||||
if self.dvpad == 0: return f'a.hdim_v % {self.bhdv} == 0'
|
||||
else: return f'a.hdim_v % {self.dvpad} == 0'
|
||||
|
||||
@property
|
||||
def extra_cond(self) -> str:
|
||||
@@ -677,8 +672,9 @@ class FmhaBwdApiTrait:
|
||||
def get_occupancy(dtype, hdim):
|
||||
return 2
|
||||
|
||||
F_dvpad = 't' if self.dvpad else 'f'
|
||||
return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d,
|
||||
F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
|
||||
F_dvpad=F_dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
|
||||
|
||||
@property
|
||||
def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel:
|
||||
@@ -693,8 +689,9 @@ class FmhaBwdApiTrait:
|
||||
def get_occupancy(dtype, hdim):
|
||||
return 2
|
||||
|
||||
F_dpad = 't' if self.dpad else 'f'
|
||||
return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype,
|
||||
F_bm0=M0_1D, F_bn0=self.convert_dq_bn0, F_spad=self.spad1d, F_dpad=self.dpad,
|
||||
F_bm0=M0_1D, F_bn0=self.convert_dq_bn0, F_spad=self.spad1d, F_dpad=F_dpad,
|
||||
F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim),
|
||||
F_deterministic=self.deterministic, disabled=self.tile.max_seq_q != 0)
|
||||
|
||||
@@ -720,7 +717,7 @@ class FmhaBwdApiPool:
|
||||
F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
|
||||
F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
|
||||
F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype],
|
||||
F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
|
||||
F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=trait.dpad, F_dvpad=trait.dvpad,
|
||||
F_deterministic=BOOL_MAP[trait.deterministic], F_trload=BOOL_MAP[trait.tr_load], F_maxq=trait.tile.max_seq_q,
|
||||
F_convert_dq_enabled=BOOL_MAP[not trait.convert_dq_kernel.disabled], F_bn0=trait.tile.F_bn0, F_cond_extra=trait.extra_cond,
|
||||
F_convert_dq_bn0=trait.convert_dq_bn0)
|
||||
@@ -793,7 +790,10 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
|
||||
|
||||
for dtype, tr_load in itertools.product(BWD_DTYPE_MAP.keys(), ["t", "f"]):
|
||||
tiles: Any = get_dq_dk_dv_tiles(dtype, tr_load)
|
||||
for tile, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(tiles, MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)):
|
||||
dpad_options = itertools.product(*([[0, 8, 1]] * 2))
|
||||
tf = ["t", "f"]
|
||||
for tile, mode, mask, bias, dbias, dropout, spad1d, (dpad, dvpad), deterministic in itertools.product(
|
||||
tiles, MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), tf, DROPOUT_MAP.keys(), tf, dpad_options, tf):
|
||||
assert isinstance(tile, FmhaBwdDQDKDVTileSize), "tile must be FmhaBwdDQDKDVTileSize"
|
||||
hdim = tile.F_bhdq
|
||||
if (mode == "group") and (spad1d == "f"):
|
||||
@@ -804,8 +804,12 @@ def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[Fm
|
||||
continue
|
||||
if ("wg32" in dropout):
|
||||
continue
|
||||
if tr_load == "t" and (dpad == "t" or dvpad == "t"):
|
||||
if tr_load == "t":
|
||||
continue # tr_load cannot work with dpad or dvpad
|
||||
else: # tr_load == "f"
|
||||
# do not generate instance with only 1 of dpad/dvpad being 8
|
||||
if dpad != dvpad and dpad == 8:
|
||||
continue
|
||||
if optdim_list != [-1]:
|
||||
if hdim not in optdim_list:
|
||||
continue
|
||||
|
||||
@@ -259,11 +259,11 @@ class FmhaFwdApiTrait:
|
||||
def skcheck(self) -> str:
|
||||
if self.mode == 'group': return 'true/*group mode skpad always true*/' # group mode only generate spad/skpad == true
|
||||
if self.pipeline_tag == 'qr_async':
|
||||
if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
|
||||
else : return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
|
||||
if self.skpad == 't' : return f'(a.cu_seqlen_kv_ptr != nullptr) || (a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0)'
|
||||
else : return f'(a.cu_seqlen_kv_ptr == nullptr) && (a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0)'
|
||||
elif self.pipeline_tag in ['qr', 'qs']:
|
||||
if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
|
||||
else : return f'a.seqlen_k % {self.bn0} == 0'
|
||||
else : return f'(a.cu_seqlen_kv_ptr == nullptr) && (a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0)'
|
||||
elif self.pipeline_tag == 'qr_async_trload':
|
||||
if self.skpad == 't' : return 'true'
|
||||
else: return 'true'
|
||||
|
||||
@@ -33,6 +33,10 @@ auto create_args(int argc, char* argv[])
|
||||
"0",
|
||||
"seqlen_k for new key/value, 0 means not to use this at all; "
|
||||
"-1 to choose s_knew in [1, s] randomly.")
|
||||
.insert("s_qpad",
|
||||
"-1",
|
||||
"seqlen_q stride between 2 batches (group-mode optional).\n"
|
||||
"Provide positive strides per-batch to simulate physical padding on Q.")
|
||||
.insert("s_kpad",
|
||||
"-1",
|
||||
"seqlen_k stride between 2 batches, currently used in group-mode only\n"
|
||||
@@ -107,7 +111,15 @@ auto create_args(int argc, char* argv[])
|
||||
.insert("warmup", "5", "number of iterations before benchmark the kernel")
|
||||
.insert("repeat", "20", "number of iterations to benchmark the kernel")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
.insert("jsonfile", "fmha_fwd.json", "json file name to dump results");
|
||||
.insert("jsonfile", "fmha_fwd.json", "json file name to dump results")
|
||||
.insert("q_eff_lens",
|
||||
"",
|
||||
"Batch-mode only: per-batch effective seqlen for Q (exclude PAD).\n"
|
||||
"Comma-separated list of length 'b'. If empty, no override.")
|
||||
.insert("kv_eff_lens",
|
||||
"",
|
||||
"Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
|
||||
"Comma-separated list of length 'b'. If empty, no override.");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
@@ -127,6 +139,9 @@ auto run(const ck_tile::ArgParser& arg_parser)
|
||||
ck_tile::index_t hdim_v = arg_parser.get_int("d_v");
|
||||
ck_tile::index_t seqlen_knew = arg_parser.get_int("s_knew");
|
||||
auto seqlen_kpads = arg_parser.get_int_vec("s_kpad");
|
||||
auto seqlen_qpads = arg_parser.get_int_vec("s_qpad");
|
||||
auto q_eff_lens_per_batch = arg_parser.get_int_vec("q_eff_lens");
|
||||
auto kv_eff_lens_per_batch = arg_parser.get_int_vec("kv_eff_lens");
|
||||
ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim");
|
||||
bool i_perm = arg_parser.get_bool("iperm");
|
||||
bool o_perm = arg_parser.get_bool("operm");
|
||||
@@ -174,7 +189,10 @@ auto run(const ck_tile::ArgParser& arg_parser)
|
||||
hdim_q,
|
||||
hdim_v,
|
||||
seqlen_knew,
|
||||
seqlen_qpads,
|
||||
seqlen_kpads,
|
||||
q_eff_lens_per_batch,
|
||||
kv_eff_lens_per_batch,
|
||||
rotary_dim,
|
||||
i_perm,
|
||||
o_perm,
|
||||
|
||||
@@ -52,7 +52,16 @@ auto parse_cmd_args(int argc, char* argv[]) -> std::pair<bool, ck_tile::ArgParse
|
||||
"random seed used for initializing input tensors. 0 for "
|
||||
"non-deterministic seed")
|
||||
.insert("warmup", "5", "number of iterations before benchmark the kernel")
|
||||
.insert("repeat", "30", "number of iterations to benchmark the kernel");
|
||||
.insert("repeat", "30", "number of iterations to benchmark the kernel")
|
||||
// Optional effective seqlen override (exclude PAD) for batch mode
|
||||
.insert("q_eff_lens",
|
||||
"",
|
||||
"Batch-mode only: per-batch effective seqlen for Q (exclude PAD).\n"
|
||||
"Comma-separated list of length 'b'. If empty, no override.")
|
||||
.insert("kv_eff_lens",
|
||||
"",
|
||||
"Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
|
||||
"Comma-separated list of length 'b'. If empty, no override.");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_pair(result, arg_parser);
|
||||
@@ -111,6 +120,8 @@ struct Problem
|
||||
|
||||
input_layout = args.get_int("iperm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
|
||||
output_layout = args.get_int("operm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
|
||||
q_eff_lens = args.get_int_vec("q_eff_lens");
|
||||
kv_eff_lens = args.get_int_vec("kv_eff_lens");
|
||||
}
|
||||
|
||||
std::vector<ck_tile::index_t> get_query_shape() const
|
||||
@@ -172,6 +183,8 @@ struct Problem
|
||||
mask_info mask;
|
||||
TensorLayout input_layout;
|
||||
TensorLayout output_layout;
|
||||
std::vector<int> q_eff_lens;
|
||||
std::vector<int> kv_eff_lens;
|
||||
};
|
||||
|
||||
struct RunConfig
|
||||
@@ -326,8 +339,10 @@ bool run_impl(const Problem& problem, const RunConfig& run_config)
|
||||
q_buf.ToDevice(q.data());
|
||||
k_buf.ToDevice(k.data());
|
||||
v_buf.ToDevice(v.data());
|
||||
// Ensure output buffer is zero-initialized so padded regions compare cleanly
|
||||
o_buf.SetZero();
|
||||
|
||||
ck_tile::fmha_fwd_v3_args args;
|
||||
ck_tile::fmha_fwd_v3_args args{};
|
||||
|
||||
args.data_type = problem.data_type;
|
||||
args.batch = problem.batch;
|
||||
@@ -380,6 +395,60 @@ bool run_impl(const Problem& problem, const RunConfig& run_config)
|
||||
: problem.seqlen_q * problem.hdim;
|
||||
args.batch_stride_o = problem.seqlen_q * problem.nhead_q * problem.hdim;
|
||||
|
||||
// Optional cumulative seqlen overrides (exclude PAD)
|
||||
const bool has_varlen_q = !problem.q_eff_lens.empty() && problem.q_eff_lens[0] != -1;
|
||||
const bool has_varlen_k = !problem.kv_eff_lens.empty() && problem.kv_eff_lens[0] != -1;
|
||||
|
||||
auto make_effective_vec = [&](const std::vector<int>& opt_vec, ck_tile::index_t fallback) {
|
||||
std::vector<ck_tile::index_t> eff;
|
||||
if(!opt_vec.empty() && opt_vec[0] != -1)
|
||||
{
|
||||
eff.assign(opt_vec.begin(), opt_vec.end());
|
||||
if(eff.size() < static_cast<size_t>(problem.batch))
|
||||
{
|
||||
eff.resize(problem.batch, eff.back());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
eff.assign(problem.batch, fallback);
|
||||
}
|
||||
return eff;
|
||||
};
|
||||
|
||||
const auto eff_q_vec = make_effective_vec(problem.q_eff_lens, problem.seqlen_q);
|
||||
const auto eff_kv_vec = make_effective_vec(problem.kv_eff_lens, problem.seqlen_k);
|
||||
|
||||
// Calculate cumulative sums for kernel arguments if varlen is used
|
||||
std::vector<ck_tile::index_t> cuq_cum, cukv_cum;
|
||||
auto calculate_cumulative = [&](const std::vector<ck_tile::index_t>& per_batch_vec,
|
||||
std::vector<ck_tile::index_t>& cum_vec) {
|
||||
cum_vec.resize(per_batch_vec.size() + 1);
|
||||
cum_vec[0] = 0;
|
||||
for(std::size_t i = 0; i < per_batch_vec.size(); ++i)
|
||||
cum_vec[i + 1] = cum_vec[i] + per_batch_vec[i];
|
||||
};
|
||||
|
||||
if(has_varlen_q)
|
||||
{
|
||||
calculate_cumulative(eff_q_vec, cuq_cum);
|
||||
}
|
||||
if(has_varlen_k)
|
||||
{
|
||||
calculate_cumulative(eff_kv_vec, cukv_cum);
|
||||
}
|
||||
|
||||
ck_tile::DeviceMem cuq_buf(!cuq_cum.empty() ? cuq_cum.size() * sizeof(ck_tile::index_t) : 0);
|
||||
ck_tile::DeviceMem cukv_buf(!cukv_cum.empty() ? cukv_cum.size() * sizeof(ck_tile::index_t) : 0);
|
||||
cuq_buf.ToDevice(!cuq_cum.empty() ? cuq_cum.data() : nullptr);
|
||||
cukv_buf.ToDevice(!cukv_cum.empty() ? cukv_cum.data() : nullptr);
|
||||
args.cu_seqlen_q_ptr =
|
||||
!cuq_cum.empty() ? reinterpret_cast<const ck_tile::index_t*>(cuq_buf.GetDeviceBuffer())
|
||||
: nullptr;
|
||||
args.cu_seqlen_kv_ptr =
|
||||
!cukv_cum.empty() ? reinterpret_cast<const ck_tile::index_t*>(cukv_buf.GetDeviceBuffer())
|
||||
: nullptr;
|
||||
|
||||
ck_tile::stream_config stream_config{nullptr,
|
||||
true,
|
||||
/*log_level=*/0,
|
||||
@@ -442,15 +511,72 @@ bool run_impl(const Problem& problem, const RunConfig& run_config)
|
||||
o_ref = o_ref.transpose({0, 2, 1, 3});
|
||||
}
|
||||
|
||||
host::fmha_fwd<float, DataType>(q,
|
||||
k,
|
||||
v,
|
||||
problem.mask,
|
||||
o_ref,
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
ck_tile::scales{problem.softmax_scale});
|
||||
// If variable lengths are provided, compute per-batch references
|
||||
// with the effective lengths; else compute a single full reference.
|
||||
if(has_varlen_q || has_varlen_k)
|
||||
{
|
||||
// Variable-length aware verification: zero-fill padded region and only compute valid part.
|
||||
o_ref.SetZero();
|
||||
|
||||
for(int b = 0; b < problem.batch; ++b)
|
||||
{
|
||||
const ck_tile::index_t seqlen_q_eff = eff_q_vec[b];
|
||||
const ck_tile::index_t seqlen_kv_eff = eff_kv_vec[b];
|
||||
|
||||
if(seqlen_q_eff <= 0 || seqlen_kv_eff <= 0)
|
||||
continue;
|
||||
|
||||
// Slice current batch from inputs (bshd) and build single-batch tensors
|
||||
ck_tile::HostTensor<DataType> q_b({1, seqlen_q_eff, problem.nhead_q, problem.hdim});
|
||||
ck_tile::HostTensor<DataType> k_b({1, seqlen_kv_eff, problem.nhead_kv, problem.hdim});
|
||||
ck_tile::HostTensor<DataType> v_b({1, seqlen_kv_eff, problem.nhead_kv, problem.hdim});
|
||||
ck_tile::HostTensor<DataType> o_b({1, seqlen_q_eff, problem.nhead_q, problem.hdim});
|
||||
|
||||
// Copy effective region
|
||||
q_b.ForEach([&](auto& self, auto idx) {
|
||||
// idx: [0, s, h, d]
|
||||
self(idx) = q(b, idx[1], idx[2], idx[3]);
|
||||
});
|
||||
k_b.ForEach([&](auto& self, auto idx) { self(idx) = k(b, idx[1], idx[2], idx[3]); });
|
||||
v_b.ForEach([&](auto& self, auto idx) { self(idx) = v(b, idx[1], idx[2], idx[3]); });
|
||||
|
||||
// Compute reference for this batch segment (host::fmha_fwd expects bshd tensors)
|
||||
host::fmha_fwd<float, DataType>(q_b,
|
||||
k_b,
|
||||
v_b,
|
||||
problem.mask,
|
||||
o_b,
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
ck_tile::scales{problem.softmax_scale});
|
||||
|
||||
// Scatter into o_ref's bshd descriptor memory
|
||||
for(int s = 0; s < seqlen_q_eff; ++s)
|
||||
{
|
||||
for(int h = 0; h < problem.nhead_q; ++h)
|
||||
{
|
||||
for(int d = 0; d < problem.hdim; ++d)
|
||||
{
|
||||
o_ref(b, s, h, d) = o_b(0, s, h, d);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// No varlen override: compute the full reference once
|
||||
host::fmha_fwd<float, DataType>(q,
|
||||
k,
|
||||
v,
|
||||
problem.mask,
|
||||
o_ref,
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
ck_tile::scales{problem.softmax_scale});
|
||||
}
|
||||
|
||||
ck_tile::HostTensor<DataType> o(problem.get_output_shape());
|
||||
o_buf.FromDevice(o.data());
|
||||
|
||||
@@ -15,6 +15,10 @@
|
||||
#include <utility>
|
||||
#include <variant>
|
||||
|
||||
struct FmhaBwdFp32
|
||||
{
|
||||
};
|
||||
|
||||
struct FmhaBwdFp16
|
||||
{
|
||||
};
|
||||
@@ -26,6 +30,26 @@ struct FmhaBwdBf16
|
||||
template <typename DataType>
|
||||
struct FmhaBwdTypeConfig;
|
||||
|
||||
template <>
|
||||
struct FmhaBwdTypeConfig<FmhaBwdFp32>
|
||||
{
|
||||
using QDataType = float;
|
||||
using KDataType = float;
|
||||
using VDataType = float;
|
||||
using GemmDataType = float;
|
||||
using BiasDataType = float;
|
||||
using LSEDataType = float;
|
||||
using AccDataType = float; // data type for gemm accumulation
|
||||
using DDataType = float;
|
||||
using RandValOutputDataType = uint8_t;
|
||||
using ODataType = float;
|
||||
using OGradDataType = float;
|
||||
using QGradDataType = float;
|
||||
using KGradDataType = float;
|
||||
using VGradDataType = float;
|
||||
using BiasGradDataType = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct FmhaBwdTypeConfig<FmhaBwdFp16>
|
||||
{
|
||||
@@ -368,8 +392,8 @@ template <ck_tile::index_t HDim_,
|
||||
typename FmhaDropout_,
|
||||
ck_tile::BlockAttentionBiasEnum BiasEnum_,
|
||||
bool kHasBiasGrad_,
|
||||
bool kPadD_,
|
||||
bool kPadDv_,
|
||||
ck_tile::index_t kPadD_,
|
||||
ck_tile::index_t kPadDv_,
|
||||
bool kIsDeterministic_,
|
||||
bool kUseTrLoad_,
|
||||
ck_tile::index_t MaxSeqLenQ_,
|
||||
|
||||
@@ -763,15 +763,21 @@ bwd_result fmha_bwd_run(mode_enum mode,
|
||||
ck_tile::FillConstant<QGradDataType>{ck_tile::numeric<QGradDataType>::infinity()}(dq_host);
|
||||
ck_tile::FillConstant<KGradDataType>{ck_tile::numeric<KGradDataType>::infinity()}(dk_host);
|
||||
ck_tile::FillConstant<VGradDataType>{ck_tile::numeric<VGradDataType>::infinity()}(dv_host);
|
||||
ck_tile::FillConstant<AccDataType>{ck_tile::numeric<AccDataType>::infinity()}(dq_acc_host);
|
||||
dq_buf.ToDevice(dq_host.data());
|
||||
dk_buf.ToDevice(dk_host.data());
|
||||
dv_buf.ToDevice(dv_host.data());
|
||||
dq_acc_buf.ToDevice(dq_acc_host.data());
|
||||
|
||||
o_buf.ToDevice(o_host.data());
|
||||
lse_buf.ToDevice(lse_host.data());
|
||||
dq_buf.SetZero();
|
||||
dbias_buf.SetZero();
|
||||
dq_acc_buf.SetZero();
|
||||
|
||||
// non-deterministic kernels use atomic add to write dq
|
||||
// Some block may be skipped with causal mask and dq are not set to zeros
|
||||
// In these cases thus we need to zero out it first
|
||||
if(!deterministic || mask.type == mask_enum::no_mask)
|
||||
dq_acc_buf.SetZero();
|
||||
|
||||
ck_tile::stream_config stream_config_v{nullptr, true, 0, 0, 1};
|
||||
fmha_bwd(fmha_traits, fmha_args, stream_config_v);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user