mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-02 04:37:02 +00:00
Merge branch 'develop' into amd-develop
This commit is contained in:
1
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
1
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1 @@
|
||||
blank_issues_enabled: true
|
||||
221
.github/ISSUE_TEMPLATE/issue_report.yml
vendored
Normal file
221
.github/ISSUE_TEMPLATE/issue_report.yml
vendored
Normal file
@@ -0,0 +1,221 @@
|
||||
name: Issue Report
|
||||
description: File a report for ROCm related issues on Linux and Windows. For documentation issues or other non-bug topics, please open a blank issue using the link below.
|
||||
title: "[Issue]: "
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thank you for taking the time to fill out this report!
|
||||
|
||||
You can acquire your OS, CPU, GPU (for filling out this report) with the following commands:
|
||||
|
||||
Linux:
|
||||
echo "OS:" && cat /etc/os-release | grep -E "^(NAME=|VERSION=)";
|
||||
echo "CPU: " && cat /proc/cpuinfo | grep "model name" | sort --unique;
|
||||
echo "GPU:" && /opt/rocm/bin/rocminfo | grep -E "^\s*(Name|Marketing Name)";
|
||||
|
||||
Windows:
|
||||
(Get-WmiObject Win32_OperatingSystem).Version
|
||||
(Get-WmiObject win32_Processor).Name
|
||||
(Get-WmiObject win32_VideoController).Name
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Problem Description
|
||||
description: Describe the issue you encountered.
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
attributes:
|
||||
label: Operating System
|
||||
description: What is the name and version number of the OS?
|
||||
placeholder: "e.g. Ubuntu 22.04.3 LTS (Jammy Jellyfish)"
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
attributes:
|
||||
label: CPU
|
||||
description: What CPU did you encounter the issue on?
|
||||
placeholder: "e.g. AMD Ryzen 9 5900HX with Radeon Graphics"
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
attributes:
|
||||
label: GPU
|
||||
description: What GPU(s) did you encounter the issue on (you can select multiple GPUs from the list)
|
||||
multiple: true
|
||||
options:
|
||||
- AMD Instinct MI300X
|
||||
- AMD Instinct MI300A
|
||||
- AMD Instinct MI300
|
||||
- AMD Instinct MI250X
|
||||
- AMD Instinct MI250
|
||||
- AMD Instinct MI210
|
||||
- AMD Instinct MI100
|
||||
- AMD Instinct MI50
|
||||
- AMD Instinct MI25
|
||||
- AMD Radeon Pro V620
|
||||
- AMD Radeon Pro VII
|
||||
- AMD Radeon RX 7900 XTX
|
||||
- AMD Radeon VII
|
||||
- AMD Radeon Pro W7900
|
||||
- AMD Radeon Pro W7800
|
||||
- AMD Radeon Pro W6800
|
||||
- AMD Radeon Pro W6600
|
||||
- AMD Radeon Pro W5500
|
||||
- AMD Radeon RX 7900 XT
|
||||
- AMD Radeon RX 7600
|
||||
- AMD Radeon RX 6950 XT
|
||||
- AMD Radeon RX 6900 XT
|
||||
- AMD Radeon RX 6800 XT
|
||||
- AMD Radeon RX 6800
|
||||
- AMD Radeon RX 6750
|
||||
- AMD Radeon RX 6700 XT
|
||||
- AMD Radeon RX 6700
|
||||
- AMD Radeon RX 6650 XT
|
||||
- AMD Radeon RX 6600 XT
|
||||
- AMD Radeon RX 6600
|
||||
- Other
|
||||
validations:
|
||||
required: true
|
||||
- type: input
|
||||
attributes:
|
||||
label: Other
|
||||
description: If you selected Other, please specify
|
||||
- type: dropdown
|
||||
attributes:
|
||||
label: ROCm Version
|
||||
description: What version(s) of ROCm did you encounter the issue on?
|
||||
multiple: true
|
||||
options:
|
||||
- ROCm 6.0.0
|
||||
- ROCm 5.7.1
|
||||
- ROCm 5.7.0
|
||||
- ROCm 5.6.1
|
||||
- ROCm 5.6.0
|
||||
- ROCm 5.5.1
|
||||
- ROCm 5.5.0
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
attributes:
|
||||
label: ROCm Component
|
||||
description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here.
|
||||
multiple: true
|
||||
options:
|
||||
- Other
|
||||
- AMD Common Language Runtime
|
||||
- AMD MIGraphX
|
||||
- AMD System Management Interface
|
||||
- amdgpu KCL/autoconf
|
||||
- amdgpu Kernel-mode GPU Driver
|
||||
- amdgpu-install
|
||||
- AOMP
|
||||
- AOMP Extras
|
||||
- AqlProfile
|
||||
- build-infra
|
||||
- chelsio
|
||||
- clang-ocl
|
||||
- Composable Kernel
|
||||
- dkms
|
||||
- docker / ROCm-docker
|
||||
- flang
|
||||
- gpuburn
|
||||
- half
|
||||
- HIP
|
||||
- HIP Examples
|
||||
- hipBLAS
|
||||
- hipBLASLt
|
||||
- HIPCC
|
||||
- hipCUB
|
||||
- hip-examples-private
|
||||
- hipFFT
|
||||
- hipfort
|
||||
- HIPIFY
|
||||
- hipRAND
|
||||
- hipSOLVER
|
||||
- hipSPARSE
|
||||
- hipSPARSELt
|
||||
- hipTensor
|
||||
- hip-tests
|
||||
- HSA Runtime
|
||||
- infrastructure
|
||||
- jenkins-utils
|
||||
- libdrm
|
||||
- Linux BPI packaging framework
|
||||
- llvm-project
|
||||
- Mesa
|
||||
- meta
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- ml-framework-ci
|
||||
- MLSEQA_TestRepo
|
||||
- OpenCL API C++ Bindings
|
||||
- OpenCL API Headers
|
||||
- OpenCL Conformance Test Suite
|
||||
- OpenCL ICD Loader
|
||||
- perftest-p2p
|
||||
- prototype
|
||||
- RCCL
|
||||
- rccl-rdma-sharp-plugins
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- ROCdbgapi
|
||||
- ROCdebug-agent
|
||||
- rocFFT
|
||||
- ROCgdb
|
||||
- ROCK
|
||||
- ROCm Documentation/Website
|
||||
- ROCm Data Center Tool
|
||||
- ROCm Examples
|
||||
- ROCm for Windows
|
||||
- ROCm Performance Primitives
|
||||
- ROCm System Management Interface Library
|
||||
- ROCm Thrust
|
||||
- ROCm Validation Suite
|
||||
- rocm_bandwidth_test
|
||||
- rocm-cmake
|
||||
- rocm-core
|
||||
- rocm-docs-core
|
||||
- rocminfo
|
||||
- rocMLIR
|
||||
- rocmtools
|
||||
- rocPRIM
|
||||
- rocprofiler
|
||||
- rocRAND
|
||||
- ROCR-Runtime
|
||||
- rocSOLVER
|
||||
- rocSPARSE
|
||||
- roctracer
|
||||
- ROCT-Thunk-Interface
|
||||
- rocWMMA
|
||||
- Tensile
|
||||
- umr
|
||||
- ibv_rc_pingpong-amd
|
||||
- mellanox
|
||||
- mpitest
|
||||
- Pytorch
|
||||
- Tensorflow
|
||||
- APEX
|
||||
- torchvision
|
||||
- Magma
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Steps to Reproduce
|
||||
description: (Optional) Detailed steps to reproduce the issue.
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: (Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
|
||||
description: The output of rocminfo --support could help to better address the problem.
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Additional Information
|
||||
description: (Optional) Any additional information that is relevant, e.g. relevant environment variables, dockerfiles, log files, dmesg output (on Linux), etc.
|
||||
validations:
|
||||
required: false
|
||||
6
.github/dependabot.yml
vendored
6
.github/dependabot.yml
vendored
@@ -10,3 +10,9 @@ updates:
|
||||
open-pull-requests-limit: 10
|
||||
schedule:
|
||||
interval: "daily"
|
||||
labels:
|
||||
- "documentation"
|
||||
- "dependencies"
|
||||
- "ci:docs-only"
|
||||
reviewers:
|
||||
- "samjwu"
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -54,5 +54,4 @@ _images/
|
||||
_static/
|
||||
_templates/
|
||||
_toc.yml
|
||||
docBin/
|
||||
_doxygen/
|
||||
|
||||
@@ -3,11 +3,6 @@
|
||||
|
||||
version: 2
|
||||
|
||||
build:
|
||||
os: ubuntu-22.04
|
||||
tools:
|
||||
python: "3.8"
|
||||
|
||||
sphinx:
|
||||
configuration: docs/conf.py
|
||||
|
||||
@@ -16,3 +11,8 @@ formats: [htmlzip, pdf, epub]
|
||||
python:
|
||||
install:
|
||||
- requirements: docs/sphinx/requirements.txt
|
||||
|
||||
build:
|
||||
os: ubuntu-22.04
|
||||
tools:
|
||||
python: "3.8"
|
||||
|
||||
@@ -19,6 +19,7 @@ None
|
||||
- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
|
||||
- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
|
||||
- Support for Batched Gemm DL (#732)
|
||||
- Introduce wrapper sublibrary (limited functionality). (#1071, #1098)
|
||||
|
||||
### Changes
|
||||
- Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
|
||||
|
||||
@@ -59,9 +59,9 @@ authors:
|
||||
family-names: Zhou
|
||||
- given-names: Jianfeng
|
||||
family-names: Yan
|
||||
repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
|
||||
repository-code: 'https://github.com/ROCm/composable_kernel'
|
||||
abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel programming languages, like HIP C++.
|
||||
keywords:
|
||||
- 'CK, Composable Kernel, Tensor Coordinate Transformation'
|
||||
license: MIT
|
||||
license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
|
||||
license-url: https://github.com/ROCm/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
|
||||
|
||||
@@ -146,6 +146,33 @@ if(${hip_VERSION_FLAT} GREATER 500723302)
|
||||
add_compile_options(-fno-offload-uniform-block)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Separate linking jobs from compiling
|
||||
# Too many concurrent linking jobs can break the build
|
||||
# Copied from LLVM
|
||||
set(CK_PARALLEL_LINK_JOBS "" CACHE STRING
|
||||
"Define the maximum number of concurrent link jobs (Ninja only).")
|
||||
if(CMAKE_GENERATOR MATCHES "Ninja")
|
||||
if(CK_PARALLEL_LINK_JOBS)
|
||||
set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${CK_PARALLEL_LINK_JOBS})
|
||||
set(CMAKE_JOB_POOL_LINK link_job_pool)
|
||||
endif()
|
||||
elseif(CK_PARALLEL_LINK_JOBS)
|
||||
message(WARNING "Job pooling is only available with Ninja generators.")
|
||||
endif()
|
||||
# Similar for compiling
|
||||
set(CK_PARALLEL_COMPILE_JOBS "" CACHE STRING
|
||||
"Define the maximum number of concurrent compile jobs (Ninja only).")
|
||||
if(CMAKE_GENERATOR MATCHES "Ninja")
|
||||
if(CK_PARALLEL_COMPILE_JOBS)
|
||||
set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${CK_PARALLEL_COMPILE_JOBS})
|
||||
set(CMAKE_JOB_POOL_COMPILE compile_job_pool)
|
||||
endif()
|
||||
elseif(CK_PARALLEL_COMPILE_JOBS)
|
||||
message(WARNING "Job pooling is only available with Ninja generators.")
|
||||
endif()
|
||||
|
||||
|
||||
option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
|
||||
option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
FROM ubuntu:20.04
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG ROCMVERSION=5.7
|
||||
ARG ROCMVERSION=6.0
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
|
||||
@@ -16,8 +16,8 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
|
||||
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
|
||||
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
|
||||
|
||||
RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb
|
||||
RUN wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb
|
||||
|
||||
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
|
||||
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
|
||||
|
||||
59
Jenkinsfile
vendored
59
Jenkinsfile
vendored
@@ -33,7 +33,7 @@ def runShell(String command){
|
||||
|
||||
def getDockerImageName(){
|
||||
def img
|
||||
if (params.ROCMVERSION != "6.0"){
|
||||
if (params.ROCMVERSION != "6.1"){
|
||||
if (params.COMPILER_VERSION == "") {
|
||||
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
|
||||
}
|
||||
@@ -302,7 +302,7 @@ def buildHipClangJob(Map conf=[:]){
|
||||
def retimage
|
||||
(retimage, image) = getDockerImage(conf)
|
||||
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
|
||||
timeout(time: 5, unit: 'HOURS')
|
||||
{
|
||||
@@ -355,7 +355,7 @@ def runCKProfiler(Map conf=[:]){
|
||||
def variant = env.STAGE_NAME
|
||||
def retimage
|
||||
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try {
|
||||
(retimage, image) = getDockerImage(conf)
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
@@ -487,7 +487,7 @@ def Build_CK(Map conf=[:]){
|
||||
def retimage
|
||||
def navi_node = 0
|
||||
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try {
|
||||
(retimage, image) = getDockerImage(conf)
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
@@ -553,7 +553,7 @@ def Build_CK(Map conf=[:]){
|
||||
sh """#!/bin/bash
|
||||
rm -rf "${params.hipTensor_branch}".zip
|
||||
rm -rf hipTensor-"${params.hipTensor_branch}"
|
||||
wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
|
||||
wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
|
||||
unzip -o "${params.hipTensor_branch}".zip
|
||||
"""
|
||||
dir("hipTensor-${params.hipTensor_branch}"){
|
||||
@@ -605,7 +605,7 @@ def process_results(Map conf=[:]){
|
||||
def variant = env.STAGE_NAME
|
||||
def retimage
|
||||
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
|
||||
try {
|
||||
(retimage, image) = getDockerImage(conf)
|
||||
}
|
||||
@@ -655,8 +655,8 @@ def process_results(Map conf=[:]){
|
||||
}
|
||||
|
||||
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
|
||||
0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.0;COMPILER_VERSION=
|
||||
0 21 * * * % ROCMVERSION=6.0;COMPILER_VERSION=;COMPILER_COMMIT=
|
||||
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false
|
||||
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : ""
|
||||
|
||||
@@ -675,8 +675,8 @@ pipeline {
|
||||
description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
|
||||
string(
|
||||
name: 'ROCMVERSION',
|
||||
defaultValue: '5.7',
|
||||
description: 'Specify which ROCM version to use: 5.7 (default).')
|
||||
defaultValue: '6.0',
|
||||
description: 'Specify which ROCM version to use: 6.0 (default).')
|
||||
string(
|
||||
name: 'COMPILER_VERSION',
|
||||
defaultValue: '',
|
||||
@@ -703,8 +703,8 @@ pipeline {
|
||||
description: "Use the CK build to verify hipTensor build and tests (default: ON)")
|
||||
string(
|
||||
name: 'hipTensor_branch',
|
||||
defaultValue: 'develop',
|
||||
description: 'Specify which branch of hipTensor to use (default: develop)')
|
||||
defaultValue: 'mainline',
|
||||
description: 'Specify which branch of hipTensor to use (default: mainline)')
|
||||
booleanParam(
|
||||
name: "USE_SCCACHE",
|
||||
defaultValue: true,
|
||||
@@ -768,8 +768,15 @@ pipeline {
|
||||
}
|
||||
agent{ label rocmnode("gfx908 || gfx90a") }
|
||||
environment{
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
|
||||
-DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
|
||||
-DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " \
|
||||
-DCMAKE_CXX_FLAGS=" -O3 " """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
|
||||
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
|
||||
-DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
|
||||
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
|
||||
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
|
||||
}
|
||||
steps{
|
||||
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
@@ -784,8 +791,12 @@ pipeline {
|
||||
}
|
||||
agent{ label rocmnode("gfx908 || gfx90a") }
|
||||
environment{
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
|
||||
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
|
||||
-DGPU_TARGETS="gfx908;gfx90a" \
|
||||
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
|
||||
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
|
||||
}
|
||||
steps{
|
||||
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
@@ -800,8 +811,12 @@ pipeline {
|
||||
}
|
||||
agent{ label rocmnode("navi21") }
|
||||
environment{
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
|
||||
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
|
||||
-DGPU_TARGETS="gfx1030" \
|
||||
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
|
||||
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
|
||||
}
|
||||
steps{
|
||||
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
@@ -816,8 +831,12 @@ pipeline {
|
||||
}
|
||||
agent{ label rocmnode("navi32") }
|
||||
environment{
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
|
||||
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """
|
||||
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
|
||||
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
|
||||
-DGPU_TARGETS="gfx1101" \
|
||||
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
|
||||
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
|
||||
}
|
||||
steps{
|
||||
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
|
||||
|
||||
@@ -32,7 +32,6 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page.
|
||||
page.
|
||||
|
||||
```note
|
||||
If you use CK, cite us as follows:
|
||||
@@ -71,7 +70,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
|
||||
3. Clone CK source code from the GitHub repository and start the build:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \
|
||||
git clone https://github.com/ROCm/composable_kernel.git && \
|
||||
cd composable_kernel && \
|
||||
mkdir build && \
|
||||
cd build
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
|
||||
|
||||
@@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
|
||||
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, 6> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
|
||||
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
|
||||
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
@@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
|
||||
SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
|
||||
SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K);
|
||||
SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout, OutLayout>,
|
||||
ck::Tuple<OutLayout, BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
@@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{out_lengths, out_lengths},
|
||||
{out_strides, out_strides},
|
||||
{out_lengths, bias_lengths},
|
||||
{out_strides, bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
@@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{out_lengths, out_lengths},
|
||||
{out_strides, out_strides},
|
||||
{out_lengths, bias_lengths},
|
||||
{out_strides, bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
|
||||
4
client_example/25_tensor_transforms/CMakeLists.txt
Normal file
4
client_example/25_tensor_transforms/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
add_executable(client_tensor_transform tensor_transform.cpp)
|
||||
target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations)
|
||||
add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
|
||||
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
|
||||
#include "tensor_transform_wrapper.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
|
||||
using DataType = int;
|
||||
|
||||
@@ -17,7 +17,7 @@ template <typename Layout>
|
||||
void Print1d(const Layout& layout)
|
||||
{
|
||||
std::cout << "Print1d" << std::endl;
|
||||
for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size(layout); w++)
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++)
|
||||
{
|
||||
std::cout << layout(ck::make_tuple(w)) << " ";
|
||||
}
|
||||
@@ -28,9 +28,9 @@ template <typename Layout>
|
||||
void Print2d(const Layout& layout)
|
||||
{
|
||||
std::cout << "Print2d" << std::endl;
|
||||
for(ck::index_t h = 0; h < ck::tensor_transform_wrapper::size<0>(layout); h++)
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++)
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
|
||||
{
|
||||
std::cout << layout(ck::make_tuple(h, w)) << " ";
|
||||
}
|
||||
@@ -43,15 +43,11 @@ template <typename Layout>
|
||||
void Print3dCustom(const Layout& layout)
|
||||
{
|
||||
std::cout << "Print3dCustom" << std::endl;
|
||||
for(ck::index_t d = 0;
|
||||
d < ck::tensor_transform_wrapper::size<0>(ck::tensor_transform_wrapper::get<0>(layout));
|
||||
d++)
|
||||
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
|
||||
{
|
||||
for(ck::index_t h = 0;
|
||||
h < ck::tensor_transform_wrapper::size<1>(ck::tensor_transform_wrapper::get<0>(layout));
|
||||
h++)
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++)
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
|
||||
{
|
||||
std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " ";
|
||||
}
|
||||
@@ -68,7 +64,7 @@ int main()
|
||||
// Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor)
|
||||
// (dims:4,8 strides:1,4)
|
||||
const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
|
||||
const auto layout_4x8_s1x4 = ck::tensor_transform_wrapper::make_layout(shape_4x8);
|
||||
const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8);
|
||||
std::cout << "dims:4,8 strides:1,4" << std::endl;
|
||||
Print2d(layout_4x8_s1x4);
|
||||
using Cord1x1Type = ck::Tuple<ck::Number<1>, ck::Number<1>>;
|
||||
@@ -77,10 +73,9 @@ int main()
|
||||
|
||||
// Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor)
|
||||
// dims:4,(2,4) strides:2,(1,8)
|
||||
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
|
||||
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
|
||||
const auto layout_4x2x4_s2x1x8 =
|
||||
ck::tensor_transform_wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
|
||||
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
|
||||
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
|
||||
const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
|
||||
|
||||
std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
|
||||
Print2d(layout_4x2x4_s2x1x8);
|
||||
@@ -92,7 +87,7 @@ int main()
|
||||
const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
|
||||
ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
|
||||
static const auto layout_2x2x2x4_s1x4x2x8 =
|
||||
ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
|
||||
ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
|
||||
|
||||
std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
|
||||
Print2d(layout_2x2x2x4_s1x4x2x8);
|
||||
@@ -108,7 +103,7 @@ int main()
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
|
||||
ck::Number<8>{});
|
||||
static const auto layout_2x2x2x4_s1x4x2x8_nested =
|
||||
ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
|
||||
ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
|
||||
|
||||
std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
|
||||
Print1d(layout_2x2x2x4_s1x4x2x8_nested);
|
||||
@@ -1,3 +1,3 @@
|
||||
ROCmSoftwarePlatform/rocm-recipes
|
||||
ROCm/rocm-recipes
|
||||
RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build
|
||||
danmar/cppcheck@2.9
|
||||
danmar/cppcheck@2.9
|
||||
|
||||
27
docs/conf.py
27
docs/conf.py
@@ -4,23 +4,34 @@
|
||||
# list see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from rocm_docs import ROCmDocs
|
||||
|
||||
html_theme_options = {"flavor": "list"}
|
||||
|
||||
name = "Composable Kernel"
|
||||
get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
|
||||
version = subprocess.getoutput(get_version)
|
||||
if len(version) > 0:
|
||||
name = f"{name} {version}"
|
||||
with open('../CMakeLists.txt', encoding='utf-8') as f:
|
||||
match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
|
||||
if not match:
|
||||
raise ValueError("VERSION not found!")
|
||||
version_number = match[1]
|
||||
left_nav_title = f"Composable Kernel {version_number} Documentation"
|
||||
|
||||
# for PDF output on Read the Docs
|
||||
project = "Composable Kernel Documentation"
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = version_number
|
||||
release = version_number
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
docs_core = ROCmDocs(f"{name} Documentation")
|
||||
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
|
||||
docs_core = ROCmDocs(left_nav_title)
|
||||
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
|
||||
docs_core.setup()
|
||||
|
||||
external_projects_current_project = "composable_kernel"
|
||||
|
||||
mathjax3_config = {
|
||||
'tex': {
|
||||
'macros': {
|
||||
|
||||
@@ -58,7 +58,7 @@ PROJECT_LOGO =
|
||||
# entered, it will be relative to the location where doxygen was started. If
|
||||
# left blank the current directory will be used.
|
||||
|
||||
OUTPUT_DIRECTORY = docBin
|
||||
OUTPUT_DIRECTORY = .
|
||||
|
||||
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
|
||||
# directories (in 2 levels) under the output directory of each output format and
|
||||
@@ -778,7 +778,9 @@ WARN_LOGFILE =
|
||||
INPUT = ../../include/ck/tensor_operation/gpu/grid \
|
||||
../../include/ck/tensor_operation/gpu/block \
|
||||
../../include/ck/tensor_operation/gpu/thread \
|
||||
../../library/include/ck/library/utility
|
||||
../../library/include/ck/library/utility \
|
||||
../../include/ck/wrapper
|
||||
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||
|
||||
@@ -34,6 +34,7 @@ Current CK library are structured into 4 layers:
|
||||
* "Templated Tile Operators" layer
|
||||
* "Templated Kernel and Invoker" layer
|
||||
* "Instantiated Kernel and Invoker" layer
|
||||
* "Wrapper for tensor transform operations"
|
||||
* "Client API" layer
|
||||
|
||||
.. image:: data/ck_layer.png
|
||||
@@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order:
|
||||
|
||||
tutorial_hello_world
|
||||
dockerhub
|
||||
wrapper
|
||||
Supported_Primitives_Guide
|
||||
API_Reference_Guide
|
||||
Contributors_Guide
|
||||
|
||||
@@ -5,6 +5,6 @@ defaults:
|
||||
maxdepth: 6
|
||||
root: index
|
||||
subtrees:
|
||||
- caption: About
|
||||
entries:
|
||||
- file: license
|
||||
- caption: About
|
||||
entries:
|
||||
- file: license
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
rocm-docs-core>=0.20.0
|
||||
rocm-docs-core==0.30.1
|
||||
sphinxcontrib-bibtex==2.6.1
|
||||
|
||||
@@ -16,7 +16,7 @@ beautifulsoup4==4.11.2
|
||||
# via pydata-sphinx-theme
|
||||
breathe==4.34.0
|
||||
# via rocm-docs-core
|
||||
certifi==2022.12.7
|
||||
certifi==2023.7.22
|
||||
# via requests
|
||||
cffi==1.15.1
|
||||
# via
|
||||
@@ -26,7 +26,7 @@ charset-normalizer==3.1.0
|
||||
# via requests
|
||||
click==8.1.3
|
||||
# via sphinx-external-toc
|
||||
cryptography==40.0.2
|
||||
cryptography==41.0.6
|
||||
# via pyjwt
|
||||
deprecated==1.2.13
|
||||
# via pygithub
|
||||
@@ -42,7 +42,7 @@ fastjsonschema==2.18.0
|
||||
# via rocm-docs-core
|
||||
gitdb==4.0.10
|
||||
# via gitpython
|
||||
gitpython==3.1.35
|
||||
gitpython==3.1.37
|
||||
# via rocm-docs-core
|
||||
idna==3.4
|
||||
# via requests
|
||||
@@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3
|
||||
# via
|
||||
# rocm-docs-core
|
||||
# sphinx-book-theme
|
||||
pygithub==1.58.2
|
||||
pygithub==1.58.1
|
||||
# via rocm-docs-core
|
||||
pygments==2.14.0
|
||||
pygments==2.15.0
|
||||
# via
|
||||
# accessible-pygments
|
||||
# pydata-sphinx-theme
|
||||
@@ -109,11 +109,11 @@ pyyaml==6.0
|
||||
# pybtex
|
||||
# rocm-docs-core
|
||||
# sphinx-external-toc
|
||||
requests==2.28.2
|
||||
requests==2.31.0
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==0.27.0
|
||||
rocm-docs-core==0.30.1
|
||||
# via -r requirements.in
|
||||
six==1.16.0
|
||||
# via
|
||||
@@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1
|
||||
# via rocm-docs-core
|
||||
sphinx-copybutton==0.5.1
|
||||
# via rocm-docs-core
|
||||
sphinx-design==0.3.0
|
||||
sphinx-design==0.4.1
|
||||
# via rocm-docs-core
|
||||
sphinx-external-toc==0.3.1
|
||||
# via rocm-docs-core
|
||||
@@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
typing-extensions==4.5.0
|
||||
# via pydata-sphinx-theme
|
||||
urllib3==1.26.15
|
||||
urllib3==1.26.18
|
||||
# via requests
|
||||
wrapt==1.15.0
|
||||
# via deprecated
|
||||
|
||||
73
docs/wrapper.rst
Normal file
73
docs/wrapper.rst
Normal file
@@ -0,0 +1,73 @@
|
||||
===============
|
||||
Wrapper
|
||||
===============
|
||||
|
||||
-------------------------------------
|
||||
Description
|
||||
-------------------------------------
|
||||
|
||||
.. note::
|
||||
|
||||
The wrapper is under development and its functionality is limited.
|
||||
|
||||
|
||||
CK provides a lightweight wrapper for more complex operations implemented in
|
||||
the library. It allows indexing of nested layouts using a simple interface
|
||||
(avoiding complex descriptor transformations) and memory access (using Tensor).
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
|
||||
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
|
||||
const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
|
||||
|
||||
std::array<ck::index_t, 32> data;
|
||||
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
|
||||
|
||||
for(ck::index_t w = 0; w < size(tensor); w++) {
|
||||
tensor(w) = w;
|
||||
}
|
||||
|
||||
// slice() == slice(0, -1) (whole dimension)
|
||||
auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));
|
||||
std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
|
||||
{
|
||||
std::cout << tensor_slice(h, w) << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
Output::
|
||||
|
||||
dims:2,(2,4) strides:2,(1,8)
|
||||
1 5 9 13 17 21 25 29
|
||||
2 6 10 14 18 22 26 30
|
||||
|
||||
-------------------------------------
|
||||
Layout
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenstruct:: ck::wrapper::Layout
|
||||
|
||||
-------------------------------------
|
||||
Layout helpers
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenfile:: layout_utils.hpp
|
||||
|
||||
-------------------------------------
|
||||
Tensor
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenstruct:: ck::wrapper::Tensor
|
||||
|
||||
-------------------------------------
|
||||
Tensor helpers
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenfile:: tensor_utils.hpp
|
||||
@@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
# ScaleAdd ScaleAdd Relu
|
||||
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
|
||||
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
@@ -0,0 +1,294 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout, BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<OutDataType, OutDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
|
||||
namespace {
|
||||
// Use custom implementation to pass two more tensors for post op
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv_fwd(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
constexpr ck::index_t NumDs = 2;
|
||||
const ck::index_t G = out_g_n_k_wos_desc.GetLengths()[0];
|
||||
const ck::index_t K = out_g_n_k_wos_desc.GetLengths()[2];
|
||||
|
||||
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
|
||||
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_lengths;
|
||||
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_strides;
|
||||
// Fill lengths other than G,K with 1 and strides with 0
|
||||
bias_g_k_lengths.fill(1);
|
||||
bias_g_k_strides.fill(0);
|
||||
bias_g_k_lengths[0] = G;
|
||||
bias_g_k_lengths[2] = K;
|
||||
bias_g_k_strides[0] = K; // stride to G
|
||||
bias_g_k_strides[2] = 1; // stride to K
|
||||
const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
|
||||
|
||||
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
|
||||
Tensor<OutDataType>(broadcasted_bias_desc)};
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl;
|
||||
std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl;
|
||||
|
||||
// Make sure that we allocated only G * K values for bias
|
||||
assert(static_cast<ck::index_t>(d_tensors[1].mData.size()) == G * K);
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
|
||||
d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
}
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
|
||||
DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
z_buf.ToDevice(d_tensors[0].mData.data());
|
||||
bias_buf.ToDevice(d_tensors[1].mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
const std::array<const void*, NumDs> ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()};
|
||||
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
ds,
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
|
||||
e_g_n_k_wos_lengths, bias_g_k_lengths},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
|
||||
e_g_n_k_wos_strides, bias_g_k_strides},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error("The device op with the specified compilation parameters does "
|
||||
"not support this convolution problem.");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops() + G * K +
|
||||
conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
|
||||
G * K * sizeof(OutDataType) + conv_param.GetOutputByte<OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_conv =
|
||||
ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
0, /*Num A Elementwise Tensors*/
|
||||
0, /*Num B Elementwise Tensors*/
|
||||
NumDs>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
d_tensors);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#include "run_convnd_fwd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); }
|
||||
@@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
|
||||
// Following shapes are selected to avoid overflow. Expect inf in case of
|
||||
// size increase for some elementwise ops.
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
|
||||
3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
add_example_executable(example_tensor_transform tensor_transform.cpp)
|
||||
add_example_executable(example_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
|
||||
@@ -26,7 +26,7 @@ inline std::string get_device_name()
|
||||
}
|
||||
const std::string raw_name(props.gcnArchName);
|
||||
|
||||
// https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
|
||||
// https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
|
||||
static std::map<std::string, std::string> device_name_map = {
|
||||
{"Ellesmere", "gfx803"},
|
||||
{"Baffin", "gfx803"},
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
@@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
|
||||
// for sanity check of vector memory access
|
||||
for(index_t i = 0; i < NumATensor; ++i)
|
||||
{
|
||||
a_mz_stride_[i] = a_ms_ks_strides[i][NumDimM - 1];
|
||||
a_kz_stride_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1];
|
||||
as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1;
|
||||
as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1;
|
||||
as_max_read_elems_[i] =
|
||||
CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths[i], a_ms_ks_strides[i]);
|
||||
}
|
||||
|
||||
for(index_t i = 0; i < NumBTensor; ++i)
|
||||
{
|
||||
b_nz_stride_[i] = b_ns_ks_strides[i][NumDimN - 1];
|
||||
b_kz_stride_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1];
|
||||
bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1;
|
||||
bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1;
|
||||
bs_max_read_elems_[i] =
|
||||
CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths[i], b_ns_ks_strides[i]);
|
||||
}
|
||||
|
||||
for(index_t i = 0; i < NumDTensor; ++i)
|
||||
{
|
||||
ds_nz_stride_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1];
|
||||
ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
|
||||
ds_max_read_elems_[i] =
|
||||
CalculateMaxRead<NumDimM, NumDimN>(d_ms_ns_lengths[i], d_ms_ns_strides[i]);
|
||||
}
|
||||
|
||||
e_nz_stride_ = e_ms_ns_stride[NumDimM + NumDimN - 1];
|
||||
e_nz_consecutive_ = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1;
|
||||
e_max_write_elems_ = CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_length, e_ms_ns_stride);
|
||||
}
|
||||
|
||||
// pointers
|
||||
@@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
|
||||
BElementwiseOperation b_element_op_;
|
||||
CDEElementwiseOperation cde_element_op_;
|
||||
|
||||
// Strides for the last M/N/K dimensions of A/B/Ds/E
|
||||
// for sanity check of vector load/store
|
||||
std::array<index_t, NumATensor> a_mz_stride_;
|
||||
std::array<index_t, NumATensor> a_kz_stride_;
|
||||
// Describes whether the last part of a given dimension of A/B/D/E is consecutive
|
||||
// in memory or not.
|
||||
std::array<bool, NumATensor> as_mz_consecutive_;
|
||||
std::array<bool, NumATensor> as_kz_consecutive_;
|
||||
std::array<bool, NumBTensor> bs_nz_consecutive_;
|
||||
std::array<bool, NumBTensor> bs_kz_consecutive_;
|
||||
std::array<bool, NumDTensor> ds_nz_consecutive_;
|
||||
bool e_nz_consecutive_;
|
||||
|
||||
std::array<index_t, NumBTensor> b_nz_stride_;
|
||||
std::array<index_t, NumBTensor> b_kz_stride_;
|
||||
|
||||
std::array<index_t, NumDTensor> ds_nz_stride_;
|
||||
index_t e_nz_stride_;
|
||||
std::array<index_t, NumATensor> as_max_read_elems_;
|
||||
std::array<index_t, NumBTensor> bs_max_read_elems_;
|
||||
std::array<index_t, NumDTensor> ds_max_read_elems_;
|
||||
index_t e_max_write_elems_;
|
||||
};
|
||||
|
||||
// Invoker
|
||||
@@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
|
||||
|
||||
// check vector load/store
|
||||
{
|
||||
bool all_valid = true;
|
||||
|
||||
bool valid_as_access = true;
|
||||
static_for<0, NumATensor, 1>{}([&](auto i) {
|
||||
// vector memory access of A: could be on M or AK1 dimension
|
||||
if constexpr(ABlockTransferSrcVectorDim == 1)
|
||||
const bool valid_a_vector_size =
|
||||
arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
|
||||
const bool valid_a_access_dim_m =
|
||||
ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
|
||||
const bool valid_a_access_dim_k =
|
||||
ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
|
||||
const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
|
||||
if(!(valid_a_vector_size && valid_a_access_dim))
|
||||
{
|
||||
if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) %
|
||||
ABlockTransferSrcScalarPerVector ==
|
||||
0))
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) %
|
||||
ABlockTransferSrcScalarPerVector ==
|
||||
0))
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
valid_as_access = false;
|
||||
}
|
||||
});
|
||||
|
||||
// vector memory access of B: could be on N or BK1 dimension
|
||||
static_for<0, NumBTensor, 1>{}([&](auto i) {
|
||||
if constexpr(BBlockTransferSrcVectorDim == 1)
|
||||
{
|
||||
if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) %
|
||||
BBlockTransferSrcScalarPerVector ==
|
||||
0))
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) %
|
||||
BBlockTransferSrcScalarPerVector ==
|
||||
0))
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// check vector load of Ds
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
if(!(arg.ds_nz_stride_[i] == 1 &&
|
||||
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
|
||||
CDEBlockTransferScalarPerVector_NPerBlock ==
|
||||
0))
|
||||
{
|
||||
all_valid = false;
|
||||
}
|
||||
});
|
||||
|
||||
// vector memory access of E: always on NPerBlock dimension
|
||||
if(!(arg.e_nz_stride_ == 1 &&
|
||||
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
|
||||
CDEBlockTransferScalarPerVector_NPerBlock ==
|
||||
0))
|
||||
if(!valid_as_access)
|
||||
{
|
||||
all_valid = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!all_valid)
|
||||
bool valid_bs_access = true;
|
||||
static_for<0, NumBTensor, 1>{}([&](auto i) {
|
||||
const bool valid_b_vector_size =
|
||||
arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0;
|
||||
const bool valid_b_access_dim_n =
|
||||
BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i];
|
||||
const bool valid_b_access_dim_k =
|
||||
BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i];
|
||||
const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
|
||||
if(!(valid_b_vector_size && valid_b_access_dim))
|
||||
{
|
||||
valid_bs_access = false;
|
||||
}
|
||||
});
|
||||
if(!valid_bs_access)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool valid_ds_access = true;
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
const bool valid_d_vector_size =
|
||||
arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
|
||||
// Vector read of Ds is always on N dimension.
|
||||
const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
|
||||
if(!(valid_d_vector_size && valid_d_access_dim))
|
||||
{
|
||||
valid_ds_access = false;
|
||||
}
|
||||
});
|
||||
if(!valid_ds_access)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool valid_e_vector_size =
|
||||
arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
|
||||
// Vector write of E is always on N dimension.
|
||||
const bool valid_e_access_dim = arg.e_nz_consecutive_;
|
||||
if(!(valid_e_vector_size && valid_e_access_dim))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
@@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
return generate_tuple([&](auto i) { return vec[i]; }, num);
|
||||
};
|
||||
|
||||
const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
|
||||
const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
|
||||
const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number<NumDimM + NumDimK>{});
|
||||
|
||||
// dimension Ids for M0, M1, ...
|
||||
@@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimK, 1>::type{};
|
||||
|
||||
// lengths for M0, M1, ...
|
||||
const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds);
|
||||
const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds);
|
||||
|
||||
// lengths for K0, K1, ...
|
||||
const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds);
|
||||
const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds);
|
||||
|
||||
// naive tensor A[M0, M1, M2, ..., K0, K1, K2...]
|
||||
const auto a_grid_desc_ms_ks =
|
||||
make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides);
|
||||
make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
|
||||
// transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...]
|
||||
const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor(
|
||||
@@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
const void* p_b_grid,
|
||||
std::array<const void*, NumDTensor> p_ds_grid,
|
||||
void* p_e_grid,
|
||||
const std::vector<index_t>& a_ms_ns_lengths,
|
||||
const std::vector<index_t>& a_ms_ks_lengths,
|
||||
const std::vector<index_t>& a_ms_ks_strides,
|
||||
const std::vector<index_t>& b_ns_ks_lengths,
|
||||
const std::vector<index_t>& b_ns_ks_strides,
|
||||
@@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
|
||||
p_ds_grid_{},
|
||||
p_e_grid_{static_cast<EDataType*>(p_e_grid)},
|
||||
a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)},
|
||||
a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ks_lengths, a_ms_ks_strides)},
|
||||
b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)},
|
||||
ds_grid_desc_m_n_{},
|
||||
e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)},
|
||||
@@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
|
||||
a_element_op_{a_element_op},
|
||||
b_element_op_{b_element_op},
|
||||
cde_element_op_{cde_element_op},
|
||||
a_mz_stride_{},
|
||||
a_kz_stride_{},
|
||||
b_nz_stride_{},
|
||||
b_kz_stride_{},
|
||||
ds_nz_stride_{},
|
||||
e_nz_stride_{}
|
||||
cde_element_op_{cde_element_op}
|
||||
{
|
||||
// populate pointer, batch stride, desc for Ds
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
@@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
}
|
||||
|
||||
// for sanity check of vector memory access
|
||||
a_mz_stride_ = a_ms_ks_strides[NumDimM - 1];
|
||||
a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1];
|
||||
a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1;
|
||||
a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1;
|
||||
a_max_read_elems_ =
|
||||
CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths, a_ms_ks_strides);
|
||||
|
||||
b_nz_stride_ = b_ns_ks_strides[NumDimN - 1];
|
||||
b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1];
|
||||
b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1;
|
||||
b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1;
|
||||
b_max_read_elems_ =
|
||||
CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths, b_ns_ks_strides);
|
||||
|
||||
for(index_t i = 0; i < NumDTensor; ++i)
|
||||
{
|
||||
ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1];
|
||||
ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
|
||||
ds_max_read_elems_[i] =
|
||||
CalculateMaxRead<NumDimM, NumDimN>(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]);
|
||||
}
|
||||
|
||||
e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1];
|
||||
e_nz_consecutive_ = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1;
|
||||
e_max_write_elems_ =
|
||||
CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_lengths, e_ms_ns_strides);
|
||||
}
|
||||
|
||||
void Print() const
|
||||
@@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
BElementwiseOperation b_element_op_;
|
||||
CDEElementwiseOperation cde_element_op_;
|
||||
|
||||
// Strides for the last M/N/K dimensions of A/B/Ds/E
|
||||
// for sanity check of vector load/store
|
||||
index_t a_mz_stride_;
|
||||
index_t a_kz_stride_;
|
||||
index_t b_nz_stride_;
|
||||
index_t b_kz_stride_;
|
||||
std::array<index_t, NumDTensor> ds_nz_stride_;
|
||||
index_t e_mz_stride_;
|
||||
index_t e_nz_stride_;
|
||||
// Describe whether the last part of a given dimension of A/B/D/E is consecutive
|
||||
// in the memory or not.
|
||||
bool a_mz_consecutive_;
|
||||
bool a_kz_consecutive_;
|
||||
bool b_nz_consecutive_;
|
||||
bool b_kz_consecutive_;
|
||||
std::array<bool, NumDTensor> ds_nz_consecutive_;
|
||||
bool e_nz_consecutive_;
|
||||
|
||||
index_t a_max_read_elems_;
|
||||
index_t b_max_read_elems_;
|
||||
std::array<index_t, NumDTensor> ds_max_read_elems_;
|
||||
index_t e_max_write_elems_;
|
||||
};
|
||||
|
||||
// Invoker
|
||||
@@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
(BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2),
|
||||
"wrong!");
|
||||
|
||||
// vector memory access of A: could be on M or AK1 dimension
|
||||
if constexpr(ABlockTransferSrcVectorDim == 1)
|
||||
{
|
||||
if(!(arg.a_mz_stride_ == 1 &&
|
||||
arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!(arg.a_kz_stride_ == 1 &&
|
||||
arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// vector memory access of B: could be on N or BK1 dimension
|
||||
if constexpr(BBlockTransferSrcVectorDim == 1)
|
||||
{
|
||||
if(!(arg.b_nz_stride_ == 1 &&
|
||||
arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!(arg.b_kz_stride_ == 1 &&
|
||||
arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// vector memory access of Ds: always on NPerBlock dimension
|
||||
bool valid_d_access = true;
|
||||
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
if(!(arg.ds_nz_stride_[i] == 1 &&
|
||||
arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
|
||||
CDEBlockTransferScalarPerVector_NPerBlock ==
|
||||
0))
|
||||
{
|
||||
valid_d_access = false;
|
||||
}
|
||||
});
|
||||
|
||||
if(valid_d_access == false)
|
||||
const bool valid_a_vector_size =
|
||||
arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0;
|
||||
const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_;
|
||||
const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_;
|
||||
const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
|
||||
if(!(valid_a_vector_size && valid_a_access_dim))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// vector memory access of E: always on NPerBlock dimension
|
||||
if(!(arg.e_nz_stride_ == 1 &&
|
||||
arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
|
||||
CDEBlockTransferScalarPerVector_NPerBlock ==
|
||||
0))
|
||||
const bool valid_b_vector_size =
|
||||
arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0;
|
||||
const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_;
|
||||
const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_;
|
||||
const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
|
||||
if(!(valid_b_vector_size && valid_b_access_dim))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool valid_ds_access = true;
|
||||
static_for<0, NumDTensor, 1>{}([&](auto i) {
|
||||
const bool valid_d_vector_size =
|
||||
arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
|
||||
// Vector read of Ds is always on N dimension.
|
||||
const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
|
||||
if(!(valid_d_vector_size && valid_d_access_dim))
|
||||
{
|
||||
valid_ds_access = false;
|
||||
}
|
||||
});
|
||||
if(!valid_ds_access)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool valid_e_vector_size =
|
||||
arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
|
||||
// Vector write of E is always on N dimension.
|
||||
const bool valid_e_access_dim = arg.e_nz_consecutive_;
|
||||
if(!(valid_e_vector_size && valid_e_access_dim))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -692,7 +681,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
const void* p_b,
|
||||
std::array<const void*, NumDTensor> p_ds,
|
||||
void* p_e,
|
||||
const std::vector<index_t>& a_ms_ns_lengths,
|
||||
const std::vector<index_t>& a_ms_ks_lengths,
|
||||
const std::vector<index_t>& a_ms_ks_strides,
|
||||
const std::vector<index_t>& b_ns_ks_lengths,
|
||||
const std::vector<index_t>& b_ns_ks_strides,
|
||||
@@ -708,7 +697,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
p_b,
|
||||
p_ds,
|
||||
p_e,
|
||||
a_ms_ns_lengths,
|
||||
a_ms_ks_lengths,
|
||||
a_ms_ks_strides,
|
||||
b_ns_ks_lengths,
|
||||
b_ns_ks_strides,
|
||||
@@ -729,7 +718,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
const void* p_b,
|
||||
std::array<const void*, NumDTensor> p_ds,
|
||||
void* p_e,
|
||||
const std::vector<index_t>& a_ms_ns_lengths,
|
||||
const std::vector<index_t>& a_ms_ks_lengths,
|
||||
const std::vector<index_t>& a_ms_ks_strides,
|
||||
const std::vector<index_t>& b_ns_ks_lengths,
|
||||
const std::vector<index_t>& b_ns_ks_strides,
|
||||
@@ -745,7 +734,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
|
||||
p_b,
|
||||
p_ds,
|
||||
p_e,
|
||||
a_ms_ns_lengths,
|
||||
a_ms_ks_lengths,
|
||||
a_ms_ks_strides,
|
||||
b_ns_ks_lengths,
|
||||
b_ns_ks_strides,
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
/**
|
||||
* Calculates the maximum number of subsequent elements of the fast changing dimension
|
||||
* that are consecutive in memory.
|
||||
*
|
||||
* Example:
|
||||
* NumDimM = 2, NumDimK = 3
|
||||
* A shape = [ 2, 3, 4, 5, 6]
|
||||
* A strides = [360, 120, 30, 6, 1]
|
||||
* | M | | K |
|
||||
* It follows from strides that K is FCD and all the subsequent elements of K are consecutive
|
||||
* in memory.
|
||||
* But if strides were [360, 120, 6, 24, 1], then only 6 subsequent elements of K would be
|
||||
* consecutive in memory.
|
||||
*
|
||||
* Assumes that the dimensions are split into two groups of `NumDim1` and `NumDim2` dimensions.
|
||||
*/
|
||||
template <index_t NumDim1, index_t NumDim2>
|
||||
auto CalculateMaxRead(const std::vector<index_t>& lengths, const std::vector<index_t>& strides)
|
||||
{
|
||||
if(lengths.size() != NumDim1 + NumDim2)
|
||||
{
|
||||
std::ostringstream err;
|
||||
err << "Incorrect number of lengths in " << __FILE__ << ":" << __LINE__
|
||||
<< ", in function: " << __func__;
|
||||
throw std::runtime_error(err.str());
|
||||
}
|
||||
if(strides.size() != NumDim1 + NumDim2)
|
||||
{
|
||||
std::ostringstream err;
|
||||
err << "Incorrect number of strides in " << __FILE__ << ":" << __LINE__
|
||||
<< ", in function: " << __func__;
|
||||
throw std::runtime_error(err.str());
|
||||
}
|
||||
|
||||
// Determine the beginning and end idx of the group representing the FCD.
|
||||
index_t begin_idx, end_idx;
|
||||
if(strides[NumDim1 - 1] == 1)
|
||||
{
|
||||
begin_idx = 0;
|
||||
end_idx = NumDim1 - 1;
|
||||
}
|
||||
else if(strides[NumDim1 + NumDim2 - 1] == 1)
|
||||
{
|
||||
begin_idx = NumDim1;
|
||||
end_idx = NumDim1 + NumDim2 - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// The dimension consecutive in memory is not the last dimension of any group, so only
|
||||
// one element can be read/written at once.
|
||||
return 1;
|
||||
}
|
||||
|
||||
index_t consecutive_stride = 1;
|
||||
for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx)
|
||||
{
|
||||
if(strides[dim_idx] == consecutive_stride)
|
||||
{
|
||||
consecutive_stride *= lengths[dim_idx];
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
const index_t max_subsequent_elems = consecutive_stride;
|
||||
return max_subsequent_elems;
|
||||
}
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
return out_gemmm_gemmn_desc;
|
||||
}
|
||||
|
||||
// Shape of Ds and E must be aligned. Strides can be different.
|
||||
// Pass e_g_n_k_wos_lengths for logical broadcast.
|
||||
static auto MakeDsGridDescriptor_M_N(
|
||||
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
|
||||
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
|
||||
|
||||
return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(ds_g_n_k_wos_lengths[i],
|
||||
return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(e_g_n_k_wos_lengths,
|
||||
ds_g_n_k_wos_strides[i]);
|
||||
},
|
||||
Number<NumDTensor>{});
|
||||
@@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
|
||||
// D desc
|
||||
ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N<DLayout>(
|
||||
ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]);
|
||||
e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]);
|
||||
});
|
||||
compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];
|
||||
|
||||
@@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
|
||||
is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
|
||||
is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
|
||||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
|
||||
is_same_v<DLayout, ctc::G_K>)
|
||||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
|
||||
{
|
||||
const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
|
||||
|
||||
@@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
|
||||
if constexpr(is_same_v<DLayout, ctc::G_K>)
|
||||
{
|
||||
// G and K must be the same
|
||||
if(arg.ds_g_n_k_wos_lengths_[i][0] != arg.e_g_n_k_wos_lengths_[0] ||
|
||||
arg.ds_g_n_k_wos_lengths_[i][2] != arg.e_g_n_k_wos_lengths_[2])
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// E and D must have the same shape
|
||||
for(index_t d = 0; d < NDimSpatial + 3; d++)
|
||||
{
|
||||
if(arg.ds_g_n_k_wos_lengths_[i][d] != arg.e_g_n_k_wos_lengths_[d])
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
|
||||
is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
|
||||
is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
|
||||
is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
|
||||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
|
||||
is_same_v<DLayout, ctc::G_K>)
|
||||
is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
|
||||
{
|
||||
const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
|
||||
|
||||
|
||||
@@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout
|
||||
static constexpr const char* name = "GNDHWK";
|
||||
};
|
||||
|
||||
// for output bias
|
||||
struct GK : public BaseTensorLayout
|
||||
{
|
||||
static constexpr const char* name = "GK";
|
||||
};
|
||||
|
||||
// output tensor
|
||||
// packed NWGK/NHWGK/NDHWGK
|
||||
struct NWGK : public BaseTensorLayout
|
||||
|
||||
@@ -50,7 +50,9 @@ __global__ void
|
||||
ignore = p_in_global;
|
||||
ignore = out_grid_desc;
|
||||
ignore = p_out_global;
|
||||
ignore = batch_count;
|
||||
ignore = block_2_tile_map;
|
||||
ignore = compute_ptr_offset_of_batch;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -522,22 +522,21 @@ struct TransformConvFwdToGemm
|
||||
|
||||
// for output bias
|
||||
template <typename CLayout,
|
||||
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GK> ||
|
||||
is_same_v<CLayout, tensor_layout::convolution::G_K>,
|
||||
typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
|
||||
bool>::type = false>
|
||||
static auto
|
||||
MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
|
||||
static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
|
||||
{
|
||||
const index_t N = c_g_n_k_wos_lengths[1];
|
||||
const index_t K = c_g_n_k_wos_lengths[2];
|
||||
const index_t N = c_g_n_k_wos_lengths[1];
|
||||
const index_t K = c_g_n_k_wos_lengths[2];
|
||||
const index_t KStride = c_g_n_k_wos_strides[2];
|
||||
|
||||
const index_t NHoWo =
|
||||
N * ck::accumulate_n<index_t>(
|
||||
c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
|
||||
|
||||
const auto out_gemmm_gemmn_desc =
|
||||
make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1));
|
||||
make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));
|
||||
|
||||
return out_gemmm_gemmn_desc;
|
||||
}
|
||||
|
||||
@@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
|
||||
return (is_detected<is_tuple, Ts>::value || ...);
|
||||
}
|
||||
|
||||
// Terminal overload of TupleDepth, taking any argument by const reference.
// Returns the `depth` template argument unchanged (0 by default); the argument's
// value is not used.
template <index_t depth = 0, typename T>
|
||||
__host__ __device__ constexpr auto TupleDepth(const T&)
|
||||
{
|
||||
return depth;
|
||||
}
|
||||
|
||||
// Overload of TupleDepth for Tuple<Ts...>.
// Calls TupleDepth<depth + 1> on a value-initialized instance of every element type (Ts{})
// and returns the maximum of those results via math::max, so every level of Tuple nesting
// adds one to the result.
// NOTE(review): for an empty Tuple<> the pack expansion calls math::max with no arguments;
// whether that compiles depends on math::max, which is not visible here.
template <index_t depth = 0, typename... Ts>
|
||||
__host__ __device__ constexpr auto TupleDepth(const Tuple<Ts...>&)
|
||||
{
|
||||
return math::max(TupleDepth<depth + 1>(Ts{})...);
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
@@ -3,27 +3,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/tuple_helper.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/sequence_helper.hpp"
|
||||
#include "ck/utility/is_detected.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
#include "ck/wrapper/utils/layout_utils.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_transform_wrapper {
|
||||
namespace wrapper {
|
||||
|
||||
/**
|
||||
* \brief Layout wrapper
|
||||
*
|
||||
* \details
|
||||
* Layout wrapper that performs the tensor descriptor logic.
|
||||
* \brief Layout wrapper that performs the tensor descriptor logic.
|
||||
*
|
||||
* \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
|
||||
* (dynamic layout). It is possible to pass nested shapes
|
||||
@@ -32,21 +18,39 @@ namespace tensor_transform_wrapper {
|
||||
* (dynamic layout). Stride tuple should be nested if shape tuple is
|
||||
* nested.
|
||||
*/
|
||||
template <typename Shape, typename Strides = Tuple<>>
|
||||
template <typename Shape, typename Strides>
|
||||
struct Layout
|
||||
{
|
||||
private:
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
// Generate default idxs tuple (idx with all merged nested shapes)
|
||||
template <typename... Ts>
|
||||
__host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple<Ts...>&)
|
||||
{
|
||||
return generate_tuple(
|
||||
[&](auto) {
|
||||
if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
|
||||
{
|
||||
// runtime layout
|
||||
return index_t(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// compiletime layout
|
||||
return I0;
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
}
|
||||
|
||||
// Generate packed (column-major) strides if not passed
|
||||
template <typename... Ts>
|
||||
__host__ __device__ constexpr static auto
|
||||
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& tuple)
|
||||
GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
return generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(i.value == 0)
|
||||
@@ -56,10 +60,10 @@ struct Layout
|
||||
else
|
||||
{
|
||||
return TupleReduce<I0.value, i.value>([](auto x, auto y) { return x * y; },
|
||||
tuple);
|
||||
unrolled_shape);
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
Number<decltype(unrolled_shape)::Size()>{});
|
||||
}
|
||||
|
||||
// Generate LowerDims at compile time for MergeTransform using the passed Type
|
||||
@@ -112,8 +116,8 @@ struct Layout
|
||||
// Example shape: (2, (2, 2)), 2, (2, 2)
|
||||
// Unrolled shape: 2, (2, 2), 2, (2, 2)
|
||||
template <typename... ShapeDims, typename... IdxDims>
|
||||
__host__ __device__ constexpr static auto UnrollShapeViaIdx(const Tuple<ShapeDims...>& shape,
|
||||
const Tuple<IdxDims...>& idx)
|
||||
__host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple<ShapeDims...>& shape,
|
||||
const Tuple<IdxDims...>& idx)
|
||||
{
|
||||
if constexpr(!IsNestedTuple(Tuple<IdxDims...>{}))
|
||||
{
|
||||
@@ -125,7 +129,7 @@ struct Layout
|
||||
// Iterate over shape tuple elements:
|
||||
// 1. If corresponding idx element is tuple then return (will be unrolled)
|
||||
// 2. If not, pack it in a tuple. It will be restored during unroll.
|
||||
auto unrolled_shape_via_idx = generate_tuple(
|
||||
auto aligned_shape = generate_tuple(
|
||||
[&](auto i) {
|
||||
if constexpr(is_detected<is_tuple,
|
||||
tuple_element_t<i, Tuple<IdxDims...>>>::value)
|
||||
@@ -140,37 +144,34 @@ struct Layout
|
||||
Number<Tuple<IdxDims...>::Size()>{});
|
||||
|
||||
// Unroll and process next step
|
||||
return UnrollShapeViaIdx(UnrollNestedTuple<0, 1>(unrolled_shape_via_idx),
|
||||
UnrollNestedTuple<0, 1>(idx));
|
||||
return AlignShapeToIdx(UnrollNestedTuple<0, 1>(aligned_shape),
|
||||
UnrollNestedTuple<0, 1>(idx));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... ShapeDims, typename DescriptorToMerge>
|
||||
__host__ __device__ constexpr static auto MakeMerge1d(const Tuple<ShapeDims...>& shape,
|
||||
DescriptorToMerge& desc)
|
||||
const DescriptorToMerge& desc)
|
||||
{
|
||||
// Reverse each element in tuple
|
||||
using ReversedUnrolledShape = decltype(TupleReverse(UnrollNestedTuple(shape)));
|
||||
const auto merge_elems = ReversedUnrolledShape{};
|
||||
|
||||
const auto merge_elems = TupleReverse(UnrollNestedTuple(shape));
|
||||
// Generate reversed indices (column-major traversal)
|
||||
using MergeElemsSequence =
|
||||
typename arithmetic_sequence_gen<0, ReversedUnrolledShape::Size(), 1>::type;
|
||||
const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
|
||||
const auto upper_dims = make_tuple(Sequence<0>{});
|
||||
using MergeElemsSequence = typename arithmetic_sequence_gen<0, merge_elems.Size(), 1>::type;
|
||||
const auto lower_dims = make_tuple(MergeElemsSequence::Reverse());
|
||||
const auto upper_dims = make_tuple(Sequence<0>{});
|
||||
// Merge to 1d
|
||||
return transform_tensor_descriptor(
|
||||
desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims);
|
||||
}
|
||||
|
||||
// Merge nested shape dims
|
||||
// Merge nested shape dims when corresponding index is also nested.
|
||||
// Input desc shape: 2, 2, 2, 2, 2, 2
|
||||
// Example idx: 1, 1, 1, 1
|
||||
// Example shape: 2, (2, 2), 2, (2, 2)
|
||||
// Merged shape: 2, 4, 2, 4
|
||||
template <typename... ShapeDims, typename... IdxDims, typename DescriptorToMerge>
|
||||
__host__ __device__ constexpr static auto
|
||||
MakeMerges(const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc)
|
||||
__host__ __device__ constexpr static auto CreateMergedDescriptor(
|
||||
const Tuple<ShapeDims...>& shape, const Tuple<IdxDims...>&, DescriptorToMerge& desc)
|
||||
{
|
||||
const auto transforms = generate_tuple(
|
||||
[&](auto i) {
|
||||
@@ -206,14 +207,38 @@ struct Layout
|
||||
return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims);
|
||||
}
|
||||
|
||||
template <typename LayoutShape, typename LayoutStrides>
|
||||
__host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape,
|
||||
const LayoutStrides& strides)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
const auto unrolled_strides = UnrollNestedTuple(strides);
|
||||
static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
|
||||
"Size of strides and shape are not consistent.");
|
||||
return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
|
||||
}
|
||||
|
||||
// If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`.
|
||||
using DeducedStrides =
|
||||
std::conditional_t<is_same_v<Strides, Tuple<>>,
|
||||
remove_cvref_t<decltype(GenerateColumnMajorPackedStrides(Shape{}))>,
|
||||
Strides>;
|
||||
using FlattenDescriptorType =
|
||||
remove_cvref_t<decltype(MakeFlattenDescriptor(Shape{}, DeducedStrides{}))>;
|
||||
using Descriptor1dType =
|
||||
remove_cvref_t<decltype(MakeMerge1d(Shape{}, FlattenDescriptorType{}))>;
|
||||
using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
|
||||
|
||||
template <typename... ShapeDims, typename... IdxDims>
|
||||
__host__ __device__ constexpr auto TransformDesc(const Tuple<ShapeDims...>& shape,
|
||||
const Tuple<IdxDims...>& idx) const
|
||||
__host__ __device__ constexpr static auto
|
||||
TransformDesc(const Tuple<ShapeDims...>& shape,
|
||||
const Tuple<IdxDims...>& idx,
|
||||
const FlattenDescriptorType& naive_descriptor)
|
||||
{
|
||||
if constexpr(Tuple<IdxDims...>::Size() == I1)
|
||||
{
|
||||
// 1d idx path
|
||||
return MakeMerge1d(shape, descriptor_);
|
||||
return MakeMerge1d(shape, naive_descriptor);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -224,62 +249,55 @@ struct Layout
|
||||
static_assert(Tuple<ShapeDims...>::Size() == Tuple<IdxDims...>::Size(),
|
||||
"Idx rank and Shape rank must be the same (except 1d).");
|
||||
// Unroll while IdxDims is nested
|
||||
const auto unrolled_shape_via_idx = UnrollShapeViaIdx(shape, idx);
|
||||
const auto aligned_shape = AlignShapeToIdx(shape, idx);
|
||||
// Transform correct form of shape
|
||||
return MakeMerges(unrolled_shape_via_idx, UnrollNestedTuple(idx), descriptor_);
|
||||
return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LayoutShape, typename LayoutStrides>
|
||||
__host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape,
|
||||
const LayoutStrides& strides)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
|
||||
if constexpr(ck::is_same_v<LayoutStrides, Tuple<>>)
|
||||
{
|
||||
// If shape is packed
|
||||
const auto column_major_packed_strides =
|
||||
GenerateColumnMajorPackedStrides(unrolled_shape);
|
||||
return make_naive_tensor_descriptor(unrolled_shape, column_major_packed_strides);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto unrolled_strides = UnrollNestedTuple(strides);
|
||||
static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
|
||||
"Size of strides and shape are not consistent.");
|
||||
return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
|
||||
}
|
||||
}
|
||||
using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
|
||||
Shape{}, DefaultIdxsTupleType{}, FlattenDescriptorType{}))>;
|
||||
|
||||
public:
|
||||
using NaiveDescriptorType = remove_cvref_t<decltype(MakeNaiveDescriptor(Shape{}, Strides{}))>;
|
||||
__host__ __device__ constexpr auto GetElementSpaceSize() const
|
||||
{
|
||||
return flatten_descriptor_.GetElementSpaceSize();
|
||||
}
|
||||
|
||||
__host__ __device__ Layout() = delete;
|
||||
/**
|
||||
* \brief Layout constructor.
|
||||
*
|
||||
* \param shape Shape for layout.
|
||||
* \param strides Strides for layout (optional if tensor is packed).
|
||||
* \return Layout object.
|
||||
*/
|
||||
__host__ __device__ Layout() = delete;
|
||||
__host__ __device__ Layout(const Shape& shape, const Strides& strides) : descriptor_{}
|
||||
__host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides)
|
||||
: flatten_descriptor_{}, shape_(shape), strides_(strides)
|
||||
{
|
||||
// Construct if runtime mode
|
||||
if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
|
||||
if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
|
||||
{
|
||||
// Keep only the shape; strides are not needed for transforms
|
||||
shape_ = shape;
|
||||
descriptor_ = MakeNaiveDescriptor(shape, strides);
|
||||
flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
|
||||
descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_);
|
||||
merged_nests_descriptor_ =
|
||||
TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ Layout(const Shape& shape) : descriptor_{}
|
||||
/**
|
||||
* \brief Layout constructor (with default packed column-major strides).
|
||||
*
|
||||
* \param shape Shape for layout.
|
||||
*/
|
||||
__host__ __device__ constexpr Layout(const Shape& shape)
|
||||
: flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_))
|
||||
{
|
||||
if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime())
|
||||
if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
|
||||
{
|
||||
shape_ = shape;
|
||||
descriptor_ = MakeNaiveDescriptor(shape, Strides{});
|
||||
flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
|
||||
descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_);
|
||||
merged_nests_descriptor_ =
|
||||
TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -292,7 +310,9 @@ struct Layout
|
||||
template <typename Idxs>
|
||||
__host__ __device__ constexpr index_t operator()() const
|
||||
{
|
||||
using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}));
|
||||
static_assert(FlattenDescriptorType::IsKnownAtCompileTime(),
|
||||
"Compiletime operator used on runtime layout.");
|
||||
using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{}));
|
||||
using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{}));
|
||||
return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
|
||||
}
|
||||
@@ -306,9 +326,22 @@ struct Layout
|
||||
template <typename... Ts>
|
||||
__host__ __device__ index_t operator()(const Tuple<Ts...>& Idx) const
|
||||
{
|
||||
// Static to construct transformed_desc only once
|
||||
static const auto transformed_desc = TransformDesc(shape_, Idx);
|
||||
return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
|
||||
if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == 1)
|
||||
{
|
||||
// if 1d access
|
||||
return descriptor_1d_.CalculateOffset(Idx);
|
||||
}
|
||||
else if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == Shape::Size())
|
||||
{
|
||||
// if Shape::Size() access (merged nested shapes)
|
||||
return merged_nests_descriptor_.CalculateOffset(UnrollNestedTuple(Idx));
|
||||
}
|
||||
else
|
||||
{
|
||||
// Custom index, need to transform descriptor
|
||||
const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_);
|
||||
return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -338,7 +371,7 @@ struct Layout
|
||||
*
|
||||
* \return Calculated size.
|
||||
*/
|
||||
__host__ __device__ constexpr index_t GetLength() const
|
||||
__host__ __device__ constexpr index_t GetLengths() const
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape_);
|
||||
return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
|
||||
@@ -346,80 +379,56 @@ struct Layout
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Dimension getter.
|
||||
* \brief Shape getter.
|
||||
*
|
||||
* \tparam IDim Dimension idx.
|
||||
* \return Calculated size.
|
||||
* \return Shape.
|
||||
*/
|
||||
template <index_t IDim>
|
||||
__host__ __device__ constexpr auto Get() const
|
||||
__host__ __device__ constexpr const Shape& GetShape() const { return shape_; }
|
||||
|
||||
/**
|
||||
* \brief Strides getter.
|
||||
*
|
||||
* \return Strides.
|
||||
*/
|
||||
__host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; }
|
||||
|
||||
/**
|
||||
* \brief Get default lengths (tuple filled with Shape length elements).
|
||||
*
|
||||
* \return Default lengths.
|
||||
*/
|
||||
__host__ __device__ constexpr auto GetDefaultLengthsTuple() const
|
||||
{
|
||||
const auto elem = shape_.At(Number<IDim>{});
|
||||
return elem;
|
||||
return generate_tuple([&](auto i) { return GetLength<i>(); }, Number<Shape::Size()>{});
|
||||
}
|
||||
|
||||
/**
 * \brief Get default start idx (tuple filled with 0s of the same size as Shape).
 *
 * Forwards the stored shape_ to GenerateDefaultIdxsTuple and returns its result.
 *
 * \return Default start idx.
 */
|
||||
__host__ __device__ constexpr auto GetDefaultStartIdxs() const
|
||||
{
|
||||
return GenerateDefaultIdxsTuple(shape_);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get default descriptor (with the same size as Shape)
|
||||
*
|
||||
* \return Default descriptor.
|
||||
*/
|
||||
__host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor()
|
||||
{
|
||||
return merged_nests_descriptor_;
|
||||
}
|
||||
|
||||
private:
|
||||
NaiveDescriptorType descriptor_;
|
||||
Shape shape_;
|
||||
FlattenDescriptorType flatten_descriptor_;
|
||||
Descriptor1dType descriptor_1d_;
|
||||
MergedNestsDescriptorType merged_nests_descriptor_;
|
||||
const Shape shape_;
|
||||
const DeducedStrides strides_;
|
||||
};
|
||||
|
||||
// Layout helpers
|
||||
// Length getter (product if tuple)
|
||||
template <index_t idx, typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.template GetLength<idx>();
|
||||
}
|
||||
|
||||
// Get shape size (product of dims if tuple)
|
||||
template <typename... ShapeDims>
|
||||
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
|
||||
{
|
||||
using UnrolledShape = decltype(UnrollNestedTuple(shape));
|
||||
return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; },
|
||||
UnrolledShape{});
|
||||
}
|
||||
|
||||
// Get dim size (could be returned from get function)
|
||||
template <typename T>
|
||||
__host__ __device__ T constexpr size(const T& dim)
|
||||
{
|
||||
return dim;
|
||||
}
|
||||
|
||||
// Get layout size (product of shapes)
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.GetLength();
|
||||
}
|
||||
|
||||
// Get shape element size
|
||||
template <index_t idx, typename... ShapeDims>
|
||||
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
|
||||
{
|
||||
return size(shape.At(Number<idx>{}));
|
||||
}
|
||||
|
||||
// Dim getter (tuple if tuple)
|
||||
template <index_t idx, typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.template Get<idx>();
|
||||
}
|
||||
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
|
||||
const Strides& strides)
|
||||
{
|
||||
return Layout<Shape, Strides>(shape, strides);
|
||||
}
|
||||
|
||||
template <typename Shape>
|
||||
__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape)
|
||||
{
|
||||
return Layout<Shape>(shape);
|
||||
}
|
||||
|
||||
} // namespace tensor_transform_wrapper
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
314
include/ck/wrapper/tensor.hpp
Normal file
314
include/ck/wrapper/tensor.hpp
Normal file
@@ -0,0 +1,314 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/tensor_utils.hpp"
|
||||
#include "utils/layout_utils.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
|
||||
/**
|
||||
* \brief Tensor wrapper that performs static and dynamic buffer logic.
|
||||
*
|
||||
* \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
|
||||
* \tparam ElementType Element data type.
|
||||
* \tparam Shape Tensor shape (layout component).
|
||||
* \tparam Strides Tensor strides (layout component).
|
||||
* \tparam NumVectors Number of vectors (only for VGPR, SGPR).
|
||||
* \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors, // param for Register memory
|
||||
index_t ScalarPerVector // param for Register memory
|
||||
>
|
||||
struct Tensor
|
||||
{
|
||||
private:
|
||||
// Check if Tuple contains Slice object
|
||||
template <typename T>
|
||||
constexpr static bool IsSlicing(T&&)
|
||||
{
|
||||
return is_detected<is_slice, T>::value;
|
||||
}
|
||||
template <typename... Ts>
|
||||
constexpr static bool IsSlicing(Tuple<Ts...>&&)
|
||||
{
|
||||
return (IsSlicing(Ts{}) || ...);
|
||||
}
|
||||
|
||||
// Calculate first index of new tensor after slice
|
||||
// It is needed to calculate offset for new tensor
|
||||
template <typename... Ts>
|
||||
constexpr auto GetStartIdxForSlicedTensor(const Tuple<Ts...>& idx) const
|
||||
{
|
||||
const auto start_idx_for_sliced_tensor = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
// if tuple then recurrence
|
||||
return GetStartIdxForSlicedTensor(idx.At(num_i));
|
||||
}
|
||||
else if constexpr(is_detected<is_slice,
|
||||
tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
// if slice, return the beginning of the interval
|
||||
return idx.At(num_i).from_;
|
||||
}
|
||||
else
|
||||
{
|
||||
// if one dim selected
|
||||
return idx.At(num_i);
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
|
||||
return start_idx_for_sliced_tensor;
|
||||
}
|
||||
|
||||
// Calculate new tensor shape after slice
|
||||
template <typename... Ts, typename ShapeTmpType>
|
||||
constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
|
||||
const ShapeTmpType& shape) const
|
||||
{
|
||||
// Pack each value in tuple to remove empty tuples after generation
|
||||
auto new_shape = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
|
||||
{
|
||||
// if tuple does not have any slice then we can remove dimension
|
||||
return Tuple<>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
// if tuple then recurrence
|
||||
return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
|
||||
}
|
||||
}
|
||||
else if constexpr(is_detected<is_slice,
|
||||
tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
// calculate new dimension
|
||||
const auto& dim = size(shape.At(num_i));
|
||||
const auto val = idx.At(num_i).range(dim);
|
||||
return make_tuple(val);
|
||||
}
|
||||
else
|
||||
{
|
||||
// remove dimension for just value
|
||||
return Tuple<>{};
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
// Remove empty tuples (deleted elements) and return
|
||||
return UnrollNestedTuple<0, 1>(new_shape);
|
||||
}
|
||||
|
||||
template <typename... Ts, typename StridesTmpType>
|
||||
constexpr auto GetStridesFromSlicedTensor(const Tuple<Ts...>& idx,
|
||||
const StridesTmpType& strides) const
|
||||
{
|
||||
// Pack each value in tuple to remove empty tuples after generation
|
||||
auto new_strides = generate_tuple(
|
||||
[&](auto i) {
|
||||
constexpr auto num_i = Number<i>{};
|
||||
if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
|
||||
{
|
||||
// if tuple does not have any slice then we can remove dimension
|
||||
return Tuple<>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
// if tuple then recurrence
|
||||
return make_tuple(
|
||||
GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i)));
|
||||
}
|
||||
}
|
||||
else if constexpr(is_detected<is_slice,
|
||||
tuple_element_t<i.value, Tuple<Ts...>>>::value)
|
||||
{
|
||||
// Stride will be the same
|
||||
return make_tuple(strides.At(num_i));
|
||||
}
|
||||
else
|
||||
{
|
||||
// remove dimension for just value
|
||||
return Tuple<>{};
|
||||
}
|
||||
},
|
||||
Number<Tuple<Ts...>::Size()>{});
|
||||
// Remove empty tuples (deleted elements) and return
|
||||
return UnrollNestedTuple<0, 1>(new_strides);
|
||||
}
|
||||
|
||||
public:
|
||||
using ElementSpaceSize = decltype(Layout<Shape, Strides>{
|
||||
Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer
|
||||
using TensorElementType = ElementType; // DataType
|
||||
|
||||
static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
|
||||
static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr ||
|
||||
BufferAddressSpace == MemoryTypeEnum ::Vgpr);
|
||||
|
||||
__host__ __device__ Tensor() = delete;
|
||||
__host__ __device__ Tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
|
||||
: layout_(layout),
|
||||
buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ Tensor(const Layout<Shape, Strides>& layout) : layout_(layout)
|
||||
{
|
||||
static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr const Layout<Shape, Strides>& GetLayout() const
|
||||
{
|
||||
return layout_;
|
||||
}
|
||||
|
||||
// Getter for new sliced tensor
|
||||
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ auto operator[](const Tuple<Ts...>& idx) const
|
||||
{
|
||||
static_assert(IsDynamicBuffer, "Register slice is not supported");
|
||||
// Calculate offset based on first idx for new tensor
|
||||
const index_t offset = layout_(GetStartIdxForSlicedTensor(idx));
|
||||
|
||||
auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape());
|
||||
if constexpr(is_same_v<Strides, Tuple<>>)
|
||||
{
|
||||
auto new_layout = make_layout(new_shape);
|
||||
return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides());
|
||||
auto new_layout = make_layout(new_shape, new_strides);
|
||||
return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ auto operator()(const Tuple<Ts...>& idx) const
|
||||
{
|
||||
return this->operator[](idx);
|
||||
}
|
||||
|
||||
template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false>
|
||||
__host__ __device__ auto operator()(Idxs... idxs) const
|
||||
{
|
||||
return this->operator[](make_tuple(idxs...));
|
||||
}
|
||||
|
||||
// Getter for the const value
|
||||
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
|
||||
{
|
||||
if constexpr(IsDynamicBuffer)
|
||||
{
|
||||
const index_t offset = layout_(idx);
|
||||
return buffer_[offset];
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(is_same_v<Strides, Tuple<>>)
|
||||
{
|
||||
constexpr index_t offset =
|
||||
Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
|
||||
return buffer_[Number<offset>{}];
|
||||
}
|
||||
else
|
||||
{
|
||||
constexpr index_t offset =
|
||||
Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
|
||||
return buffer_[Number<offset>{}];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
|
||||
{
|
||||
return this->operator[](idx);
|
||||
}
|
||||
|
||||
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
|
||||
__host__ __device__ const ElementType& operator()(Idxs... idxs) const
|
||||
{
|
||||
return this->operator[](make_tuple(idxs...));
|
||||
}
|
||||
|
||||
// Getter for the value reference
|
||||
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
|
||||
{
|
||||
if constexpr(IsDynamicBuffer)
|
||||
{
|
||||
const index_t offset = layout_(idx);
|
||||
return buffer_(offset);
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(is_same_v<Strides, Tuple<>>)
|
||||
{
|
||||
constexpr index_t offset =
|
||||
Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
|
||||
return buffer_(Number<offset>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
constexpr index_t offset =
|
||||
Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
|
||||
return buffer_(Number<offset>{});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
|
||||
__host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
|
||||
{
|
||||
return this->operator[](idx);
|
||||
}
|
||||
|
||||
template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
|
||||
__host__ __device__ ElementType& operator()(Idxs... idxs)
|
||||
{
|
||||
return this->operator[](make_tuple(idxs...));
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr auto GetDefaultDescriptor()
|
||||
{
|
||||
return layout_.GetDefaultDescriptor();
|
||||
}
|
||||
|
||||
private:
|
||||
using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
|
||||
ElementType,
|
||||
ElementSpaceSize,
|
||||
true /*InvalidElementUseNumericalZeroValue*/>;
|
||||
using StaticBufferType =
|
||||
StaticBufferTupleOfVector<BufferAddressSpace,
|
||||
ElementType,
|
||||
NumVectors,
|
||||
ScalarPerVector,
|
||||
true /*InvalidElementUseNumericalZeroValue*/>;
|
||||
// If register use static buffer, else use dynamic buffer
|
||||
using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
|
||||
|
||||
const Layout<Shape, Strides> layout_;
|
||||
Buffer buffer_;
|
||||
};
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
335
include/ck/wrapper/utils/layout_utils.hpp
Normal file
335
include/ck/wrapper/utils/layout_utils.hpp
Normal file
@@ -0,0 +1,335 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/tuple_helper.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/sequence_helper.hpp"
|
||||
#include "ck/utility/is_detected.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond
|
||||
// forward declaration
|
||||
template <typename Shape, typename Strides>
|
||||
struct Layout;
|
||||
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
/// @endcond
|
||||
|
||||
// make_*
|
||||
/**
|
||||
* \brief Make layout function.
|
||||
*
|
||||
* \tparam Shape Shape for layout.
|
||||
* \tparam Strides Strides for layout.
|
||||
* \return Constructed layout.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
|
||||
const Strides& strides)
|
||||
{
|
||||
return Layout<Shape, Strides>(shape, strides);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Make layout function with packed strides
|
||||
* (column-major).
|
||||
*
|
||||
* \tparam Shape Shape for layout.
|
||||
* \return Constructed layout.
|
||||
*/
|
||||
template <typename Shape>
|
||||
__host__ __device__ constexpr Layout<Shape, Tuple<>> make_layout(const Shape& shape)
|
||||
{
|
||||
return Layout<Shape, Tuple<>>(shape);
|
||||
}
|
||||
|
||||
// Layout helpers
|
||||
// get
|
||||
// Get dim (could be returned from get with empty Idxs)
|
||||
/**
|
||||
* \private
|
||||
*/
|
||||
template <typename T>
|
||||
__host__ __device__ T constexpr get(const T& dim)
|
||||
{
|
||||
return dim;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get element from tuple (Shape/Strides/Idxs).
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param tuple Tuple to lookup.
|
||||
* \return Requsted element.
|
||||
*/
|
||||
template <index_t idx, typename... Dims>
|
||||
__host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
|
||||
{
|
||||
return tuple.At(Number<idx>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get sub layout.
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param layout Layout to create sub layout.
|
||||
* \return Requsted sub layout.
|
||||
*/
|
||||
template <index_t idx, typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
const auto& shape = layout.GetShape();
|
||||
const auto& new_shape = get<idx>(shape);
|
||||
static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
|
||||
"Shape of sub layout must be tuple");
|
||||
if constexpr(is_same_v<Strides, Tuple<>>)
|
||||
{
|
||||
// If stride not passed, create without strides
|
||||
return make_layout(new_shape);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto& strides = layout.GetStrides();
|
||||
const auto& new_strides = get<idx>(strides);
|
||||
static_assert(is_detected<is_tuple, decltype(new_strides)>::value,
|
||||
"Strides of sub layout must be tuple");
|
||||
return make_layout(new_shape, new_strides);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Hierarchical get.
|
||||
*
|
||||
* \tparam Idxs Indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted element.
|
||||
*/
|
||||
template <index_t Idx, index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto get(const T& elem)
|
||||
{
|
||||
return get<Idxs...>(get<Idx>(elem));
|
||||
}
|
||||
|
||||
// size
|
||||
// Get dim size (could be returned from get function)
|
||||
/**
|
||||
* \private
|
||||
*/
|
||||
template <typename T>
|
||||
__host__ __device__ T constexpr size(const T& dim)
|
||||
{
|
||||
return dim;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Length get (product if tuple).
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param layout Layout to get Shape of.
|
||||
* \return Requsted length.
|
||||
*/
|
||||
template <index_t idx, typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.template GetLength<idx>();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Shape size (product of dims).
|
||||
*
|
||||
* \param shape Shape to lookup.
|
||||
* \return Requsted size.
|
||||
*/
|
||||
template <typename... ShapeDims>
|
||||
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
|
||||
{
|
||||
const auto unrolled_shape = UnrollNestedTuple(shape);
|
||||
return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
|
||||
unrolled_shape);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Layout size (product of dims).
|
||||
*
|
||||
* \param layout Layout to calculate shape size.
|
||||
* \return Requsted size.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.GetLengths();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Length get from tuple (product if tuple).
|
||||
*
|
||||
* \tparam idx Index to lookup.
|
||||
* \param tuple Tuple to lookup.
|
||||
* \return Requsted length.
|
||||
*/
|
||||
template <index_t idx, typename... Ts>
|
||||
__host__ __device__ constexpr index_t size(const Tuple<Ts...>& tuple)
|
||||
{
|
||||
return size(tuple.At(Number<idx>{}));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Hierarchical size.
|
||||
*
|
||||
* \tparam Idx First index to lookup (to avoid empty Idxs).
|
||||
* \tparam Idxs Next indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted element.
|
||||
*/
|
||||
template <index_t Idx, index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto size(const T& elem)
|
||||
{
|
||||
return size(get<Idx, Idxs...>(elem));
|
||||
}
|
||||
|
||||
// rank
|
||||
/**
|
||||
* \brief Get layout rank (num elements in shape).
|
||||
*
|
||||
* \param layout Layout to calculate rank.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return Shape::Size();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get tuple rank (num elements in tuple).
|
||||
* Return 1 if scalar passed.
|
||||
*
|
||||
* \param tuple Tuple to calculate rank.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <typename... Dims>
|
||||
__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& tuple)
|
||||
{
|
||||
return Tuple<Dims...>::Size();
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
*/
|
||||
template <index_t IDim>
|
||||
__host__ __device__ constexpr index_t rank(const Number<IDim>&)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
*/
|
||||
__host__ __device__ constexpr index_t rank(const index_t&) { return 1; }
|
||||
|
||||
/**
|
||||
* \brief Hierarchical rank.
|
||||
*
|
||||
* \tparam Idxs Indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto rank(const T& elem)
|
||||
{
|
||||
return rank(get<Idxs...>(elem));
|
||||
}
|
||||
|
||||
// depth
|
||||
/**
|
||||
* \brief Get depth of the layout shape (return 0 if scalar).
|
||||
*
|
||||
* \param layout Layout to calculate depth.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr auto depth(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
const auto& shape = layout.GetShape();
|
||||
return TupleDepth(shape);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get depth of the tuple. (return 0 if scalar)
|
||||
*
|
||||
* \param tuple Tuple to calculate depth.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <typename... Dims>
|
||||
__host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
|
||||
{
|
||||
return TupleDepth(tuple);
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
*/
|
||||
template <index_t IDim>
|
||||
__host__ __device__ constexpr index_t depth(const Number<IDim>&)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* \private
|
||||
*/
|
||||
__host__ __device__ constexpr index_t depth(const index_t&) { return 0; }
|
||||
|
||||
/**
|
||||
* \brief Hierarchical depth.
|
||||
*
|
||||
* \tparam Idxs Indexes to lookup.
|
||||
* \param elem Element to lookup.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <index_t... Idxs, typename T>
|
||||
__host__ __device__ constexpr auto depth(const T& elem)
|
||||
{
|
||||
return depth(get<Idxs...>(elem));
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Layout strides.
|
||||
*
|
||||
* \param layout Layout to get strides from.
|
||||
* \return Requsted strides.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr const auto& stride(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.GetStrides();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Layout shape.
|
||||
*
|
||||
* \param layout Layout to get shape from.
|
||||
* \return Requsted shape.
|
||||
*/
|
||||
template <typename Shape, typename Strides>
|
||||
__host__ __device__ constexpr const auto& shape(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return layout.GetShape();
|
||||
}
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
290
include/ck/wrapper/utils/tensor_utils.hpp
Normal file
290
include/ck/wrapper/utils/tensor_utils.hpp
Normal file
@@ -0,0 +1,290 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
|
||||
#include "ck/utility/number.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/tuple_helper.hpp"
|
||||
#include "ck/utility/dynamic_buffer.hpp"
|
||||
#include "ck/utility/amd_address_space.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace wrapper {
|
||||
|
||||
/**
|
||||
* \brief Memory type, allowed members:
|
||||
* - Generic,
|
||||
* - Global,
|
||||
* - LDS,
|
||||
* - SGPR,
|
||||
* - VGPR,
|
||||
*/
|
||||
using MemoryTypeEnum = AddressSpaceEnum;
|
||||
|
||||
// Disable from doxygen docs generation
|
||||
/// @cond
|
||||
// forward declarations
|
||||
template <typename Shape, typename Strides>
|
||||
struct Layout;
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors, // params for Register memory
|
||||
index_t ScalarPerVector // param for Register memory
|
||||
>
|
||||
|
||||
struct Tensor;
|
||||
|
||||
template <typename FromType, typename ToType>
|
||||
struct Slice
|
||||
{
|
||||
__host__ __device__ constexpr Slice() : from_(), to_() {}
|
||||
__host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {}
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr auto range(const T& dim) const
|
||||
{
|
||||
if constexpr(is_same_v<FromType, index_t> || is_same_v<ToType, index_t> ||
|
||||
is_same_v<T, index_t>)
|
||||
{
|
||||
assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range");
|
||||
if(to_ < 0)
|
||||
{
|
||||
return dim - from_ + to_ + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// workaround if one end of the interval is index_t and the second one is Number
|
||||
return static_cast<index_t>(to_) - static_cast<index_t>(from_);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(dim >= to_ && from_ >= Number<0>{} && (to_ < 0 || to_ > from_),
|
||||
"Invalid range");
|
||||
if constexpr(to_ < 0)
|
||||
{
|
||||
return dim - from_ + to_ + Number<1>{};
|
||||
}
|
||||
else
|
||||
{
|
||||
return to_ - from_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ static constexpr bool IsSlice() { return true; }
|
||||
|
||||
const FromType from_;
|
||||
const ToType to_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using is_slice = decltype(std::declval<T&>().IsSlice());
|
||||
|
||||
template <typename T>
|
||||
using is_tuple = decltype(std::declval<T&>().IsTuple());
|
||||
/// @endcond
|
||||
|
||||
/**
|
||||
* \brief Make tensor function.
|
||||
*
|
||||
* \tparam MemoryType Type of memory.
|
||||
* \param pointer Pointer to the memory.
|
||||
* \param layout Tensor layout.
|
||||
* \return Constructed tensor.
|
||||
*/
|
||||
template <MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename Strides>
|
||||
constexpr auto make_tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
return Tensor<MemoryType, ElementType, Shape, Strides, 0 /*NumVectors*/, 0 /*ScalarPerVector*/>(
|
||||
pointer, layout);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Make SGPR or VGPR tensor function.
|
||||
*
|
||||
* \tparam MemoryType Type of memory.
|
||||
* \tparam NumVectors Number of vectors.
|
||||
* \tparam ScalarPerVector Scalars per vector.
|
||||
* \tparam ElementType Memory data type.
|
||||
* \param layout Tensor layout.
|
||||
* \return Constructed tensor.
|
||||
*/
|
||||
template <MemoryTypeEnum MemoryType,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides>
|
||||
constexpr auto make_register_tensor(const Layout<Shape, Strides>& layout)
|
||||
{
|
||||
static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported");
|
||||
return Tensor<MemoryType, ElementType, Shape, Strides, NumVectors, ScalarPerVector>(layout);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Tensor Layout.
|
||||
*
|
||||
* \param tensor Tensor to get layout of.
|
||||
* \return Requsted layout.
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector>
|
||||
__host__ __device__ constexpr const auto&
|
||||
layout(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
|
||||
tensor)
|
||||
{
|
||||
return tensor.GetLayout();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Product of tensor shape dims.
|
||||
*
|
||||
* \tparam Idxs Indexes to access specific shape dim (optional).
|
||||
* \param tensor Tensor to get Shape of.
|
||||
* \return Requsted size.
|
||||
*/
|
||||
template <index_t... Idxs,
|
||||
MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector>
|
||||
__host__ __device__ constexpr index_t
|
||||
size(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
|
||||
tensor)
|
||||
{
|
||||
return size<Idxs...>(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Rank of Shape tuple.
|
||||
*
|
||||
* \tparam Idxs Indexes to access specific shape dim (optional).
|
||||
* \param tensor Tensor to get rank of.
|
||||
* \return Requsted rank.
|
||||
*/
|
||||
template <index_t... Idxs,
|
||||
MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector>
|
||||
__host__ __device__ constexpr index_t
|
||||
rank(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
|
||||
tensor)
|
||||
{
|
||||
return rank<Idxs...>(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Depth of Shape tuple.
|
||||
*
|
||||
* \tparam Idxs Indexes to access specific shape dim (optional).
|
||||
* \param tensor Tensor to get depth of.
|
||||
* \return Requsted depth.
|
||||
*/
|
||||
template <index_t... Idxs,
|
||||
MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector>
|
||||
__host__ __device__ constexpr index_t
|
||||
depth(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
|
||||
tensor)
|
||||
{
|
||||
return depth<Idxs...>(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Tensor strides.
|
||||
*
|
||||
* \param tensor Tensor to get strides from.
|
||||
* \return Requsted strides.
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector>
|
||||
__host__ __device__ constexpr const auto&
|
||||
stride(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
|
||||
tensor)
|
||||
{
|
||||
return stride(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get Tensor shape.
|
||||
*
|
||||
* \param tensor Tensor to get shape from.
|
||||
* \return Requsted shape.
|
||||
*/
|
||||
template <MemoryTypeEnum BufferAddressSpace,
|
||||
typename ElementType,
|
||||
typename Shape,
|
||||
typename Strides,
|
||||
index_t NumVectors,
|
||||
index_t ScalarPerVector>
|
||||
__host__ __device__ constexpr const auto&
|
||||
shape(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
|
||||
tensor)
|
||||
{
|
||||
return shape(tensor.GetLayout());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get dim slice.
|
||||
*
|
||||
* \param from Beginning of the interval.
|
||||
* \param to End of the interval. (could be also negative to index from the end)
|
||||
* \return Requested slice. Could be used to create sliced tensor from other tensor.
|
||||
*/
|
||||
template <typename FromType, typename ToType>
|
||||
constexpr auto slice(const FromType from, const ToType to)
|
||||
{
|
||||
return Slice<FromType, ToType>(from, to);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get dim slice. (Assumed that from is equal to 1)
|
||||
*
|
||||
* \param to End of the interval. (could be also negative to index from the end)
|
||||
* \return Requested slice. Could be used to create sliced tensor from other tensor.
|
||||
*/
|
||||
template <typename ToType>
|
||||
constexpr auto slice(const ToType to)
|
||||
{
|
||||
if constexpr(is_same_v<ToType, index_t>)
|
||||
{
|
||||
return Slice<index_t, ToType>(0, to);
|
||||
}
|
||||
else
|
||||
{
|
||||
return Slice<Number<0>, ToType>(Number<0>{}, to);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Get whole dim slice (from = 0, to = -1).
|
||||
*
|
||||
* \return Requested slice. Could be used to create sliced tensor from other tensor.
|
||||
*/
|
||||
constexpr auto slice() { return Slice<Number<0>, Number<-1>>(Number<0>{}, Number<-1>{}); }
|
||||
|
||||
} // namespace wrapper
|
||||
} // namespace ck
|
||||
@@ -86,9 +86,9 @@ using NHWGK = ck::tensor_layout::convolution::NHWGK;
|
||||
using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
//
|
||||
using GK = ck::tensor_layout::convolution::G_K;
|
||||
using GK_Tuple = ck::Tuple<GK>;
|
||||
using GK_GK_Tuple = ck::Tuple<GK, GK>;
|
||||
using G_K = ck::tensor_layout::convolution::G_K;
|
||||
using GK_Tuple = ck::Tuple<G_K>;
|
||||
using GK_GK_Tuple = ck::Tuple<G_K, G_K>;
|
||||
|
||||
// pointwise functor
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
@@ -61,7 +61,11 @@ using device_contraction_kk_instance = std::tuple<
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>,
|
||||
// Small scalar per vector
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
@@ -96,7 +100,11 @@ using device_contraction_kn_instance = std::tuple<
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
// Small scalar per vector
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
@@ -131,7 +139,11 @@ using device_contraction_mk_instance = std::tuple<
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
// Small scalar per vector
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
@@ -166,7 +178,11 @@ using device_contraction_mn_instance = std::tuple<
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>,
|
||||
// Small scalar per vector
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>,
|
||||
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
|
||||
@@ -25,10 +25,6 @@ using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
|
||||
template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
|
||||
using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
|
||||
@@ -37,7 +33,7 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
|
||||
//#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
|
||||
//#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
|
||||
//#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
// pipeline v1, 1 wave
|
||||
// pipeline v1, 1 wave
|
||||
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>,
|
||||
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>,
|
||||
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 64, 16, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>,
|
||||
@@ -75,7 +71,8 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
|
||||
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Interwave, PipelineVersion::v1>
|
||||
|
||||
#endif
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
|
||||
#if 0
|
||||
//CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
|
||||
// pipeline v2, 1 wave
|
||||
,
|
||||
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v2>,
|
||||
@@ -98,17 +95,6 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<GemmDefault>{});
|
||||
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<MNKPadding>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
@@ -345,7 +345,11 @@ void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemm<Col, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
|
||||
|
||||
void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
|
||||
void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
|
||||
|
||||
void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
|
||||
|
||||
@@ -575,7 +579,8 @@ struct DeviceOperationInstanceFactory<
|
||||
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
|
||||
is_same_v<CLayout, Row>)
|
||||
{
|
||||
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(op_ptrs);
|
||||
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(op_ptrs);
|
||||
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
|
||||
is_same_v<CLayout, Row>)
|
||||
|
||||
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
@@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
F16,
|
||||
F16,
|
||||
@@ -59,7 +59,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
F32,
|
||||
F32,
|
||||
@@ -75,7 +75,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
@@ -130,7 +130,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
|
||||
is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
|
||||
is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK> &&
|
||||
DLayouts::Size() == 2 && is_same_v<tuple_element_t<0, DLayouts>, NDHWGK> &&
|
||||
is_same_v<tuple_element_t<1, DLayouts>, G_K>)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP32
|
||||
if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
|
||||
|
||||
@@ -101,7 +101,8 @@ list(APPEND GEMM_INSTANCES
|
||||
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
|
||||
|
||||
list(APPEND GEMM_INSTANCES
|
||||
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
|
||||
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp
|
||||
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp
|
||||
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
|
||||
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
|
||||
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
|
||||
|
||||
@@ -16,6 +16,7 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F8 = ck::f8_t;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F8 = ck::f8_t;
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp"
|
||||
|
||||
#ifdef CK_ENABLE_FP8
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<GemmDefault>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp"
|
||||
|
||||
#ifdef CK_ENABLE_FP8
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances<MNKPadding>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
@@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
F16,
|
||||
F16,
|
||||
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
@@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
F32,
|
||||
F32,
|
||||
@@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
@@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
@@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
@@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
@@ -35,7 +35,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
@@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw
|
||||
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<NDHWGK, NDHWGK>,
|
||||
ck::Tuple<NDHWGK, G_K>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
@@ -22,13 +22,13 @@ using S = ck::Sequence<Is...>;
|
||||
using NHWGC = ck::tensor_layout::convolution::NHWGC;
|
||||
using GKYXC = ck::tensor_layout::convolution::GKYXC;
|
||||
using NHWGK = ck::tensor_layout::convolution::NHWGK;
|
||||
using GK = ck::tensor_layout::convolution::G_K;
|
||||
using G_K = ck::tensor_layout::convolution::G_K;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Relu = ck::tensor_operation::element_wise::Relu;
|
||||
using TanH = ck::tensor_operation::element_wise::TanH;
|
||||
|
||||
using GK_Tuple = ck::Tuple<GK>;
|
||||
using GK_GK_Tuple = ck::Tuple<GK, GK>;
|
||||
using GK_Tuple = ck::Tuple<G_K>;
|
||||
using GK_GK_Tuple = ck::Tuple<G_K, G_K>;
|
||||
using I32_Tuple = ck::Tuple<int32_t>;
|
||||
using F32_Tuple = ck::Tuple<float>;
|
||||
using I32_F32_Tuple = ck::Tuple<int32_t, float>;
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "profiler/profile_transpose_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
enum struct MatrixLayout
|
||||
{
|
||||
NCDHW, // 0
|
||||
NCHWD, // 1
|
||||
};
|
||||
|
||||
enum struct DataType
|
||||
{
|
||||
F32_F32_F32_F32_F32, // 0
|
||||
F16_F16_F16_F16_F16, // 1
|
||||
};
|
||||
|
||||
#define OP_NAME "transpose"
|
||||
#define OP_DESC "Transpose"
|
||||
|
||||
int profile_transpose(int argc, char* argv[])
|
||||
{
|
||||
if(argc != 15)
|
||||
{
|
||||
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
|
||||
printf("arg2: data type (0: fp32; 1: fp16)\n");
|
||||
// printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
|
||||
printf("arg4: verification (0: no; 1: yes)\n");
|
||||
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
|
||||
printf("arg6: print tensor value (0: no; 1: yes)\n");
|
||||
printf("arg7: time kernel (0=no, 1=yes)\n");
|
||||
printf("arg8 to 13: N, C, D, H, W\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
|
||||
// const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
|
||||
const bool do_verification = std::stoi(argv[3]);
|
||||
const int init_method = std::stoi(argv[4]);
|
||||
const bool do_log = std::stoi(argv[5]);
|
||||
const bool time_kernel = std::stoi(argv[6]);
|
||||
std::vector<index_t> lengths = std::stoi(argv[7]);
|
||||
|
||||
/**const int N = std::stoi(argv[7]);
|
||||
const int C = std::stoi(argv[8]);
|
||||
const int D = std::stoi(argv[9]);
|
||||
const int H = std::stoi(argv[10]);
|
||||
const int W = std::stoi(argv[11]);**/
|
||||
|
||||
using F32 = float;
|
||||
using F16 = ck::half_t;
|
||||
|
||||
auto profile = [&](auto a_type, auto b_type) {
|
||||
using ADataType = decltype(a_type);
|
||||
using BDataType = decltype(b_type);
|
||||
|
||||
bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
|
||||
do_verification, init_method, do_log, time_kernel, lengths);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
};
|
||||
|
||||
if(data_type == GemmDataType::F32_F32_F32_F32_F32)
|
||||
{
|
||||
return profile(F32{}, F32{});
|
||||
}
|
||||
else if(data_type == GemmDataType::F16_F16_F16_F16_F16)
|
||||
{
|
||||
return profile(F16{}, F16{});
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "this data_type & layout is not implemented" << std::endl;
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Register the entry point defined in this file under OP_NAME.
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
|
||||
@@ -149,6 +149,7 @@ add_subdirectory(batched_gemm_multi_d)
|
||||
add_subdirectory(grouped_convnd_bwd_data)
|
||||
add_subdirectory(conv_tensor_rearrange)
|
||||
add_subdirectory(transpose)
|
||||
add_subdirectory(wrapper)
|
||||
if(GPU_TARGETS MATCHES "gfx11")
|
||||
add_subdirectory(wmma_op)
|
||||
endif()
|
||||
|
||||
4
test/wrapper/CMakeLists.txt
Normal file
4
test/wrapper/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
add_gtest_executable(test_layout test_layout.cpp)
|
||||
target_link_libraries(test_layout PRIVATE utility)
|
||||
add_gtest_executable(test_tensor test_tensor.cpp)
|
||||
target_link_libraries(test_tensor PRIVATE utility)
|
||||
481
test/wrapper/test_layout.cpp
Normal file
481
test/wrapper/test_layout.cpp
Normal file
@@ -0,0 +1,481 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
|
||||
class TestWrapperLayout : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
static constexpr auto I0 = ck::Number<0>{};
|
||||
static constexpr auto I1 = ck::Number<1>{};
|
||||
|
||||
template <typename Desc,
|
||||
typename Desc1d,
|
||||
typename LayoutRuntime,
|
||||
typename LayoutCompiletime,
|
||||
typename Idxs>
|
||||
void Run(Desc& desc,
|
||||
Desc1d& desc_1d,
|
||||
LayoutRuntime& layout_runtime,
|
||||
LayoutCompiletime& layout_compiletime,
|
||||
const std::vector<Idxs>& idxs)
|
||||
{
|
||||
// 1d check
|
||||
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
|
||||
// Check layout compiletime and runtime result consistency
|
||||
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
|
||||
|
||||
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
|
||||
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
|
||||
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
|
||||
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
|
||||
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
|
||||
}
|
||||
// size(layout)-d check, don't check if access is hierarchical
|
||||
if constexpr(!IsNestedTuple(Idxs{}))
|
||||
{
|
||||
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
|
||||
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
|
||||
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
|
||||
ck::wrapper::size<d>(layout_compiletime));
|
||||
});
|
||||
}
|
||||
for(const auto idx : idxs)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset = layout_runtime(idx);
|
||||
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
|
||||
const ck::index_t desc_offset =
|
||||
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
|
||||
EXPECT_EQ(layout_runtime_offset, desc_offset);
|
||||
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(TestWrapperLayout, 2d)
{
    // Shape (4, 3) addressed with strides (1, 4).
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s1 = 1;
    constexpr ck::index_t s0 = 4;

    // Coordinates to verify: the complete d1 x d0 index space.
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
    for(ck::index_t row = 0; row < d1; ++row)
    {
        for(ck::index_t col = 0; col < d0; ++col)
        {
            idxs.emplace_back(row, col);
        }
    }

    // Reference descriptor with explicit compile-time lengths and strides.
    const auto desc =
        ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
                                         ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));

    // Flattened reference; dimensions are merged in reverse order (column major).
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));

    // Layouts under test are created from the shape only.
    const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
    const auto layout_compiletime =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));

    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_nested)
{
    // dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s3 = 2;
    constexpr ck::index_t s2 = 4;
    constexpr ck::index_t s1 = 12;
    constexpr ck::index_t s0 = 48;

    // Fully unrolled 4d reference descriptor.
    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));

    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));

    // 3d view: the nested (d3, d2) pair is merged, the other two dimensions pass through.
    // Dimension 3 of `desc` has length d0, so the pass-through must use d0
    // (d2 only worked because both happen to be 3).
    const auto desc_3d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
                       ck::make_pass_through_transform(d1),
                       ck::make_pass_through_transform(d0)),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));

    const auto layout_runtime =
        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
                                 ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(
            ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
                       ck::Number<s1>{},
                       ck::Number<s0>{}));

    // 3d iteration: the first coordinate runs over the merged (d3, d2) pair.
    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
    for(ck::index_t d = 0; d < d2 * d3; d++)
    {
        for(ck::index_t h = 0; h < d1; h++)
        {
            for(ck::index_t w = 0; w < d0; w++)
            {
                idxs_3d.emplace_back(d, h, w);
            }
        }
    }
    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);

    // Check also 4d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
    for(ck::index_t e = 0; e < d3; e++)
    {
        for(ck::index_t d = 0; d < d2; d++)
        {
            for(ck::index_t h = 0; h < d1; h++)
            {
                for(ck::index_t w = 0; w < d0; w++)
                {
                    idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 2d_nested)
{
    // Shape ((2, 3), (4, 3)) with strides ((2, 4), (48, 12)).
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s3 = 2;
    constexpr ck::index_t s2 = 4;
    constexpr ck::index_t s1 = 48;
    constexpr ck::index_t s0 = 12;

    // Flat 2d coordinates: each entry indexes one merged pair of dimensions.
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
    for(ck::index_t outer = 0; outer < d2 * d3; ++outer)
    {
        for(ck::index_t inner = 0; inner < d0 * d1; ++inner)
        {
            idxs_2d.emplace_back(outer, inner);
        }
    }

    // Hierarchical coordinates: one index per innermost dimension.
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
        idxs_4d;
    for(ck::index_t i3 = 0; i3 < d3; ++i3)
    {
        for(ck::index_t i2 = 0; i2 < d2; ++i2)
        {
            for(ck::index_t i1 = 0; i1 < d1; ++i1)
            {
                for(ck::index_t i0 = 0; i0 < d0; ++i0)
                {
                    idxs_4d.emplace_back(ck::make_tuple(i3, i2), ck::make_tuple(i1, i0));
                }
            }
        }
    }

    // Fully unrolled 4d reference descriptor.
    const auto desc = ck::make_naive_tensor_descriptor(
        ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
        ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));

    // Flattened reference; dimensions are merged in reverse order (column major).
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
        ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));

    // 2d reference: each nested pair is merged into one dimension.
    const auto desc_2d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));

    const auto layout_runtime =
        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
                                 ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
                       ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
        ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
                       ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));

    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_double_nested)
{
    // dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s4 = 2;
    constexpr ck::index_t s3 = 4;
    constexpr ck::index_t s2 = 8;
    constexpr ck::index_t s1 = 96;
    constexpr ck::index_t s0 = 24;

    // Fully unrolled 5d reference descriptor.
    const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
                                                                      ck::Number<d3>{},
                                                                      ck::Number<d2>{},
                                                                      ck::Number<d1>{},
                                                                      ck::Number<d0>{}),
                                                       ck::make_tuple(ck::Number<s4>{},
                                                                      ck::Number<s3>{},
                                                                      ck::Number<s2>{},
                                                                      ck::Number<s1>{},
                                                                      ck::Number<s0>{}));

    // Reverse due to column major
    const auto desc_1d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
        ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
        ck::make_tuple(ck::Sequence<0>{}));

    // 3d view: (d4, d3) merged, d2 passed through, (d1, d0) merged.
    const auto desc_3d = transform_tensor_descriptor(
        desc,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
                       ck::make_pass_through_transform(d2),
                       ck::make_merge_transform(ck::make_tuple(d0, d1))),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));

    // 2d view built on top of the 3d view: the first two dimensions are merged again.
    const auto desc_2d = transform_tensor_descriptor(
        desc_3d,
        ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
                       ck::make_pass_through_transform(d1 * d0)),
        ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
        ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));

    // The stride tuples must use s4 (a stride), not d4 (a length); the two only
    // coincide numerically in this test.
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)));
    const auto layout_compiletime = ck::wrapper::make_layout(
        ck::make_tuple(
            ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
            ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
        ck::make_tuple(
            ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
            ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));

    // 2d iteration over the two top-level dimensions.
    std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
    for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
    {
        for(ck::index_t w = 0; w < d0 * d1; w++)
        {
            idxs_2d.emplace_back(h, w);
        }
    }
    this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);

    // Check also 3d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
    for(ck::index_t d = 0; d < d3 * d4; d++)
    {
        for(ck::index_t h = 0; h < d2; h++)
        {
            for(ck::index_t w = 0; w < d1 * d0; w++)
            {
                idxs_3d.emplace_back(ck::make_tuple(d, h), w);
            }
        }
    }
    this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);

    // Check also 5d iteration
    std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
                          ck::Tuple<ck::index_t, ck::index_t>>>
        idxs_5d;
    for(ck::index_t f = 0; f < d4; f++)
    {
        for(ck::index_t e = 0; e < d3; e++)
        {
            for(ck::index_t d = 0; d < d2; d++)
            {
                for(ck::index_t h = 0; h < d1; h++)
                {
                    for(ck::index_t w = 0; w < d0; w++)
                    {
                        idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
                                             ck::make_tuple(h, w));
                    }
                }
            }
        }
    }
    this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
}
|
||||
|
||||
TEST(TestLayoutHelpers, SizeAndGet)
{
    // Shape under test: (((2, 2), 3), (4, 3)).
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));

    // Whole-layout size is the product of every dimension.
    constexpr ck::index_t total_size = d4 * d3 * d2 * d1 * d0;
    EXPECT_EQ(ck::wrapper::size(layout_runtime), total_size);
    EXPECT_EQ(ck::wrapper::size(layout_compiletime), total_size);

    // Sizes of the two top-level dimensions.
    EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
    EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
    EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
    EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);

    // Sub-layouts obtained with get<>; bound by reference so no copy is required.
    const auto& first_runtime      = ck::wrapper::get<0>(layout_runtime);
    const auto& first_compiletime  = ck::wrapper::get<0>(layout_compiletime);
    const auto& second_runtime     = ck::wrapper::get<1>(layout_runtime);
    const auto& second_compiletime = ck::wrapper::get<1>(layout_compiletime);

    // First top-level dimension: ((d4, d3), d2).
    EXPECT_EQ(ck::wrapper::size<0>(first_runtime), d4 * d3);
    EXPECT_EQ(ck::wrapper::size<0>(first_compiletime), d4 * d3);
    EXPECT_EQ(ck::wrapper::size<1>(first_runtime), d2);
    EXPECT_EQ(ck::wrapper::size<1>(first_compiletime), d2);

    // Innermost pair: (d4, d3).
    const auto& innermost_runtime     = ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime));
    const auto& innermost_compiletime = ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime));
    EXPECT_EQ(ck::wrapper::size<0>(innermost_runtime), d4);
    EXPECT_EQ(ck::wrapper::size<0>(innermost_compiletime), d4);
    EXPECT_EQ(ck::wrapper::size<1>(innermost_runtime), d3);
    EXPECT_EQ(ck::wrapper::size<1>(innermost_compiletime), d3);

    // Repeated check of the d2 entry, kept from the original test.
    EXPECT_EQ(ck::wrapper::size<1>(first_runtime), d2);
    EXPECT_EQ(ck::wrapper::size<1>(first_compiletime), d2);

    // Second top-level dimension: (d1, d0).
    EXPECT_EQ(ck::wrapper::size<0>(second_runtime), d1);
    EXPECT_EQ(ck::wrapper::size<0>(second_compiletime), d1);
    EXPECT_EQ(ck::wrapper::size<1>(second_runtime), d0);
    EXPECT_EQ(ck::wrapper::size<1>(second_compiletime), d0);
}
|
||||
|
||||
TEST(TestLayoutHelpers, DepthAndRank)
{
    // Shape under test: (((2, 2), 3), (4, 3)).
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    const auto layout_runtime = ck::wrapper::make_layout(
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));

    // Layout objects: three nesting levels, two top-level dimensions.
    EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
    EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
    EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
    EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);

    // A plain nested tuple is accepted as well.
    EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
    EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);

    // A bare integer has depth 0 and rank 1.
    EXPECT_EQ(ck::wrapper::depth(d0), 0);
    EXPECT_EQ(ck::wrapper::rank(d0), 1);
}
|
||||
|
||||
TEST(TestLayoutHelpers, ShapeAndStrides)
{
    // Shape (((2, 2), 3), (4, 3)) with strides (((2, 4), 8), (96, 24)).
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    constexpr ck::index_t s4 = 2;
    constexpr ck::index_t s3 = 4;
    constexpr ck::index_t s2 = 8;
    constexpr ck::index_t s1 = 96;
    constexpr ck::index_t s0 = 24;

    // Runtime variant: plain index values.
    const auto shape_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto strides_runtime =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
    const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);

    // Compile-time variant: every value wrapped in ck::Number.
    const auto shape_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
    const auto strides_compiletime = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
        ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
    const auto layout_compiletime =
        ck::wrapper::make_layout(shape_compiletime, strides_compiletime);

    // Types handed back by the accessors, with any reference removed.
    using ShapeFromCompiletime =
        std::remove_reference_t<decltype(shape(layout_compiletime))>;
    using StridesFromCompiletime =
        std::remove_reference_t<decltype(stride(layout_compiletime))>;
    using ShapeFromRuntime   = std::remove_reference_t<decltype(shape(layout_runtime))>;
    using StridesFromRuntime = std::remove_reference_t<decltype(stride(layout_runtime))>;

    // The accessors must return exactly the tuple types the layouts were built from.
    EXPECT_TRUE((std::is_same_v<decltype(shape_compiletime), ShapeFromCompiletime>));
    EXPECT_TRUE((std::is_same_v<decltype(strides_compiletime), StridesFromCompiletime>));
    EXPECT_TRUE((std::is_same_v<decltype(shape_runtime), ShapeFromRuntime>));
    EXPECT_TRUE((std::is_same_v<decltype(strides_runtime), StridesFromRuntime>));
}
|
||||
|
||||
TEST(TestLayoutHelpers, Hierarchical)
{
    // Shape under test: (((2, 2), 3), (4, 3)).
    constexpr ck::index_t d4 = 2;
    constexpr ck::index_t d3 = 2;
    constexpr ck::index_t d2 = 3;
    constexpr ck::index_t d1 = 4;
    constexpr ck::index_t d0 = 3;
    const auto runtime_shape =
        ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
    const auto layout_runtime     = ck::wrapper::make_layout(runtime_shape);
    const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
        ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));

    // The index path <0, 0> selects the innermost (d4, d3) group.

    // Queried on the raw shape tuple.
    EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
    EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);

    // Queried on the runtime layout.
    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);

    // Queried on the compile-time layout.
    EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);
    EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);

    // One level deeper, <0, 0, 0> reaches the d4 element itself.
    EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
}
|
||||
205
test/wrapper/test_tensor.cpp
Normal file
205
test/wrapper/test_tensor.cpp
Normal file
@@ -0,0 +1,205 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
// Compare data in tensor with offset from layout.
|
||||
// Data and offset should match if physical memory has been initialized with
|
||||
// sequentially increasing values from 0.
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
|
||||
{
|
||||
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
|
||||
if(tensor(idx) != layout(idx))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
|
||||
{
|
||||
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
bool success = true;
|
||||
ck::static_for<0, nelems, 1>{}([&](auto w) {
|
||||
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
|
||||
{
|
||||
success = false;
|
||||
}
|
||||
});
|
||||
return success;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ void InitTensor(TensorType& tensor)
|
||||
{
|
||||
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
|
||||
{
|
||||
tensor(i) = i;
|
||||
}
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ void StaticInitTensor(TensorType& tensor)
|
||||
{
|
||||
|
||||
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
|
||||
}
|
||||
|
||||
// Tests
|
||||
TEST(TestTensor, ReadWriteHostMemory)
{
    constexpr ck::index_t nelems = 8;

    // Backing storage; InitTensor fills it through the tensor before any read.
    std::array<ck::index_t, nelems> data;

    // ((2, 2), 2) layout wrapped around the host buffer.
    const auto shape  = ck::make_tuple(ck::make_tuple(2, 2), 2);
    const auto layout = ck::wrapper::make_layout(shape);
    auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    InitTensor(tensor);

    // Read the values back through flat and hierarchical indexing.
    EXPECT_TRUE(TestTensorCheck1d(tensor));
    EXPECT_TRUE(TestTensorCheck3d(tensor));
}
|
||||
|
||||
// Kernel that initialises and re-reads tensors placed in global memory, LDS,
// VGPRs and SGPRs, and reports whether every check passed.
//   data    - buffer of at least 8 ck::index_t values used as the global tensor storage
//   success - pointer to a bool that receives the combined result
// The result is written unconditionally, so the caller does not have to
// initialise `*success` beforehand.
// NOTE(review): the body has no per-thread guard; it looks intended for a
// single-thread launch - confirm at the call site.
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
    constexpr ck::index_t nelems            = 8;
    constexpr ck::index_t scalar_per_vector = 1;
    __shared__ ck::index_t p_shared[nelems];

    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
    bool* casted_success_ptr     = static_cast<bool*>(success);

    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}));

    auto tensor_global =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                                         nelems,
                                                         scalar_per_vector,
                                                         ck::index_t>(register_layout);
    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
                                                         nelems,
                                                         scalar_per_vector,
                                                         ck::index_t>(register_layout);

    InitTensor(tensor_global);
    InitTensor(tensor_lds);
    StaticInitTensor<nelems>(tensor_vgpr);
    StaticInitTensor<nelems>(tensor_sgpr);

    // Accumulate locally so the outcome never depends on the previous
    // (possibly uninitialised) contents of the output flag.
    bool all_ok = true;

    all_ok &= TestTensorCheck1d(tensor_global);
    all_ok &= TestTensorCheck3d(tensor_global);

    all_ok &= TestTensorCheck1d(tensor_lds);
    all_ok &= TestTensorCheck3d(tensor_lds);

    all_ok &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);

    all_ok &= StaticTestTensorCheck1d<nelems>(tensor_sgpr);

    *casted_success_ptr = all_ok;
}
|
||||
|
||||
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
    constexpr ck::index_t nelems = 8;
    // Value-initialised so the upload below does not copy indeterminate values.
    std::array<ck::index_t, nelems> host_data{};

    DeviceMem data_buf(nelems * sizeof(ck::index_t));
    data_buf.ToDevice(&host_data[0]);

    // Seed the device-side flag with a defined value (true) before the kernel
    // combines its check results into it.
    DeviceMem success_buf(sizeof(bool));
    bool success = true;
    success_buf.ToDevice(&success);

    // One block with one thread runs all checks.
    launch_and_time_kernel(StreamConfig{},
                           TestTensorReadWriteDevice,
                           dim3(1),
                           dim3(1),
                           nelems * sizeof(ck::index_t),
                           data_buf.GetDeviceBuffer(),
                           success_buf.GetDeviceBuffer());

    // Reset before the read-back so a stale host value cannot hide a failure.
    success = false;
    success_buf.FromDevice(&success);
    EXPECT_TRUE(success);
}
|
||||
|
||||
TEST(TestTensor, Slicing)
{
    constexpr ck::index_t nelems = 8;

    // Host tensor of shape ((2, 2), 2) with strides ((1, 2), 4), filled with 0..7.
    std::array<ck::index_t, nelems> data;
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2),
                                                 ck::make_tuple(ck::make_tuple(1, 2), 4));
    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
    InitTensor(tensor);

    // Slice every dimension in full: the view covers the whole tensor.
    auto tensor2x2x2 =
        tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));

    // Fix the first nested index to 1 and slice the rest.
    auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
    const ck::index_t offset_2x2 = layout(ck::make_tuple(ck::make_tuple(1, 0), 0));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2, offset_2x2));

    // Slices with explicit [1, 2) bounds leave a single element.
    auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
    const ck::index_t offset_1x1 = layout(ck::make_tuple(ck::make_tuple(1, 1), 1));
    EXPECT_EQ(ck::wrapper::rank(tensor1x1), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor1x1), 2);
    EXPECT_EQ(ck::wrapper::size(tensor1x1), 1);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x1, offset_1x1));

    // Both nested indices fixed: only the last dimension remains.
    auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
    const ck::index_t offset_2 = layout(ck::make_tuple(ck::make_tuple(1, 1), 0));
    EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2, offset_2));

    // Negative end bound and default-constructed slice.
    auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
    const ck::index_t offset_1x2 = layout(ck::make_tuple(ck::make_tuple(1, 0), 0));
    EXPECT_EQ(ck::wrapper::rank(tensor1x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor1x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor1x2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x2, offset_1x2));
}
|
||||
Reference in New Issue
Block a user