diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b798e38f3..e5903f3747 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,8 +72,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")
rocm_create_package(
- NAME CK-${CK_BACKEND}
+ NAME composablekernel
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+ MAINTAINER "MIOpen Kernels Dev Team
"
LDCONFIG
)
@@ -226,15 +227,12 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
-configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
-
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
)
-include(googletest)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
@@ -243,7 +241,31 @@ if(BUILD_DEV)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+
add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ VERSION "${version}"
+ COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+ NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+install(FILES
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
diff --git a/Config.cmake.in b/Config.cmake.in
new file mode 100644
index 0000000000..12b5c331ae
--- /dev/null
+++ b/Config.cmake.in
@@ -0,0 +1,11 @@
+@PACKAGE_INIT@
+
+set(_composable_kernel_supported_components device_operations host_tensor)
+
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+ if(NOT _comp IN_LIST _composable_kernel_supported_components)
+ set(composable_kernel_FOUND False)
+ set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+ endif()
+ include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
diff --git a/Dockerfile b/Dockerfile
index c4cf0fac57..79c961144a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,7 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN apt-get update
RUN apt-get install -y wget gnupg
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
-RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
- echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
- sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
- cat /etc/apt/sources.list.d/rocm.list;\
- else \
- sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
- fi
+RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
@@ -25,18 +19,15 @@ RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/ap
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
- sshpass \
build-essential \
cmake-data=3.15.1-0kitware1 \
cmake=3.15.1-0kitware1 \
curl \
- doxygen \
g++ \
gdb \
git \
hip-rocclr \
jq \
- lcov \
libelf-dev \
libncurses5-dev \
libnuma-dev \
@@ -44,7 +35,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
llvm-amdgpu \
pkg-config \
python \
- python3 \
+ python3.8 \
python-dev \
python3-dev \
python-pip \
@@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean && \
rm -rf /var/lib/apt/lists/*
-# RUN pip3 install --default-timeout=100000 -r requirements.txt
-
# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -83,6 +72,13 @@ ARG PREFIX=/opt/rocm
RUN cget install pfultz2/rocm-recipes
# Install rbuild
RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz
+# Install packages for processing the performance results
+RUN pip3 install --upgrade pip
+RUN pip3 install sqlalchemy
+RUN pip3 install pymysql
+RUN pip3 install pandas
+RUN pip3 install setuptools-rust
+RUN pip3 install sshtunnel
# Setup ubsan environment to printstacktrace
ENV UBSAN_OPTIONS=print_stacktrace=1
@@ -92,5 +88,3 @@ ADD rbuild.ini /rbuild.ini
ADD dev-requirements.txt dev-requirements.txt
RUN rbuild prepare -s develop -d $PREFIX
RUN groupadd -f render
-# RUN cget install -f min-requirements.txt
-# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
diff --git a/Jenkinsfile b/Jenkinsfile
index 824437c970..65876ea1c0 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -7,7 +7,6 @@ def show_node_info() {
echo "NODE_NAME = \$NODE_NAME"
lsb_release -sd
uname -r
- cat /sys/module/amdgpu/version
ls /opt/ -la
"""
}
@@ -100,35 +99,45 @@ def buildHipClangJob(Map conf=[:]){
def variant = env.STAGE_NAME
-
def retimage
- gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
- try {
- retimage = docker.build("${image}", dockerArgs + '.')
- withDockerContainer(image: image, args: dockerOpts) {
- timeout(time: 5, unit: 'MINUTES')
- {
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ if (params.USE_DOCKERFILE){
+ try {
+ retimage = docker.build("${image}", dockerArgs + '.')
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES')
+ {
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ }
+ }
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ catch(Exception ex) {
+ retimage = docker.build("${image}", dockerArgs + "--no-cache .")
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES')
+ {
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ }
}
}
}
- catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
- echo "The job was cancelled or aborted"
- throw e
- }
- catch(Exception ex) {
- retimage = docker.build("${image}", dockerArgs + "--no-cache .")
- withDockerContainer(image: image, args: dockerOpts) {
- timeout(time: 5, unit: 'MINUTES')
- {
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
- }
+ else{
+ timeout(time: 3, unit: 'HOURS'){
+ retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
+ image="b56f8ac0d6ea"
+ sh "docker images"
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS')
{
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
cmake_build(conf)
}
}
@@ -140,6 +149,10 @@ def reboot(){
build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
}
+
+
+
+
def buildHipClangJobAndReboot(Map conf=[:]){
try{
buildHipClangJob(conf)
@@ -156,14 +169,157 @@ def buildHipClangJobAndReboot(Map conf=[:]){
}
}
+
+def runCKProfiler(Map conf=[:]){
+ show_node_info()
+
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+
+ def image = "composable_kernels"
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+ def gpu_arch = conf.get("gpu_arch", "gfx908")
+
+ // Jenkins is complaining about the render group
+ // def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
+
+ def variant = env.STAGE_NAME
+
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ if (params.USE_DOCKERFILE){
+ try {
+ retimage = docker.build("${image}", dockerArgs + '.')
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES')
+ {
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ }
+ }
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ catch(Exception ex) {
+ retimage = docker.build("${image}", dockerArgs + "--no-cache .")
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES')
+ {
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ }
+ }
+ }
+ }
+ else{
+ timeout(time: 3, unit: 'HOURS'){
+ retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
+ image="b56f8ac0d6ea"
+ sh "docker images"
+ }
+ }
+
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 5, unit: 'HOURS')
+ {
+ cmake_build(conf)
+ dir("script"){
+ //run gemm performance tests
+ def gemm_log = "perf_gemm_${gpu_arch}.log"
+ sh "rm -f ${gemm_log}"
+ sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
+ sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
+ sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
+ sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
+ sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
+ sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
+ sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
+ sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
+ //results will be parsed, stored, and analyzed within the python script
+ //the script will return 0 if the performance criteria are met
+ //or return 1 if the criteria are not met
+ archiveArtifacts "${gemm_log}"
+ sh "python3 parse_perf_data.py ${gemm_log} "
+ //run resnet50 test
+ def resnet_log = "perf_resnet50_${gpu_arch}.log"
+ sh "rm -f ${resnet_log}"
+ sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
+ sh "echo Node name: ${NODE_NAME} >> ${resnet_log}"
+ sh "echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
+ sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
+ sh "hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
+ sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
+ //first run tests with N=256
+ sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
+ //then run with N=4
+ sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
+ archiveArtifacts "${resnet_log}"
+ //the script will put the results from N=256 and N=4 runs into separate tables
+ sh "python3 parse_perf_data.py ${resnet_log} "
+ }
+ }
+ }
+ }
+ return retimage
+}
+
+
+def runPerfTest(Map conf=[:]){
+ try{
+ runCKProfiler(conf)
+ }
+ catch(e){
+ echo "throwing error exception in performance tests"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ if (!conf.get("no_reboot", false)) {
+ reboot()
+ }
+ }
+}
+
pipeline {
agent none
options {
parallelsAlwaysFailFast()
}
- // environment{
- // variable = value
- // }
+ parameters {
+ booleanParam(
+ name: "USE_DOCKERFILE",
+ defaultValue: true,
+ description: "")
+ }
+ environment{
+ dbuser = "${dbuser}"
+ dbpassword = "${dbpassword}"
+ dbsship = "${dbsship}"
+ dbsshport = "${dbsshport}"
+ dbsshuser = "${dbsshuser}"
+ dbsshpassword = "${dbsshpassword}"
+ status_wrapper_creds = "${status_wrapper_creds}"
+ }
stages{
stage("Static checks") {
parallel{
@@ -178,29 +334,6 @@ pipeline {
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
// }
// }
- stage('Build Profiler: Release, gfx908')
- {
- agent { label rocmnode("nogpu")}
- environment{
- setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
- }
- steps{
- buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
- }
- }
- stage('Build Profiler: Debug, gfx908')
- {
- agent { label rocmnode("nogpu")}
- environment{
- setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
- }
- steps{
- // until we stabilize debug build due to compiler crashes
- catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
- buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug')
- }
- }
- }
stage('Clang Format') {
agent{ label rocmnode("nogpu") }
environment{
@@ -220,7 +353,7 @@ pipeline {
}
}
}
- stage("Tests")
+ stage("Tests")
{
parallel
{
@@ -228,12 +361,11 @@ pipeline {
{
agent{ label rocmnode("gfx908")}
environment{
- setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
+ setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
}
steps{
- buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
+ buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908")
}
-
}
stage("Run Tests: gfx90a")
{
@@ -242,26 +374,68 @@ pipeline {
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
}
steps{
- buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
+ buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
}
-
}
-
}
}
- // enable after the cmake file supports packaging
- // stage("Packages") {
- // when {
- // expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
- // }
- // parallel {
- // stage("Package /opt/rocm") {
- // agent{ label rocmnode("nogpu") }
- // steps{
- // buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
- // }
- // }
- // }
- // }
+ stage("Client App")
+ {
+ parallel
+ {
+ stage("Run Client App")
+ {
+ agent{ label rocmnode("gfx908")}
+ environment{
+ setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
+ execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ }
+ }
+ }
+ }
+ stage("Performance Tests")
+ {
+ parallel
+ {
+ stage("Run ckProfiler: gfx908")
+ {
+ agent{ label rocmnode("gfx908")}
+ environment{
+ setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
+ }
+ steps{
+ runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908")
+ }
+ }
+ stage("Run ckProfiler: gfx90a")
+ {
+ agent{ label rocmnode("gfx90a")}
+ environment{
+ setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
+ }
+ steps{
+ runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
+ }
+ }
+ }
+ }
+ /* enable after the cmake file supports packaging
+ stage("Packages") {
+ when {
+ expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
+ }
+ parallel {
+ stage("Package /opt/rocm") {
+ agent{ label rocmnode("nogpu") }
+ steps{
+ buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
+ }
+ }
+ }
+ }
+ */
}
}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000..2fe9a8455e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
+Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
+Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
+Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
+Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
+Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
+Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
+
+SPDX-License-Identifier: MIT
+Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 4011d34415..f6c933bf5b 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ docker run \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev \
+rocm/tensorflow:rocm5.1-tf2.6-dev \
/bin/bash
```
@@ -20,7 +20,7 @@ mkdir build && cd build
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
--D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 \
+-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
@@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
make -j ckProfiler
```
Instructions for running ckProfiler are under ```profiler/```
+
+
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warn up kernel once, and then run it multiple times
+to get average kernel time. For some kernels that use atomic add, this will cause
+output buffer to be accumulated multiple times, causing verfication failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from command line.
diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake
index 9f193b2090..78133af031 100644
--- a/cmake/EnableCompilerWarnings.cmake
+++ b/cmake/EnableCompilerWarnings.cmake
@@ -66,7 +66,7 @@ else()
-Wunreachable-code
-Wunused
- -Wno-sign-compare
+ -Wsign-compare
-Wno-extra-semi-stmt
)
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index c7e70cc8a9..959bc4f4b0 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -18,6 +18,8 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-switch-enum
-Wno-zero-as-null-pointer-constant
-Wno-unused-member-function
+ -Wno-comma
+ -Wno-old-style-cast
)
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
@@ -33,4 +35,5 @@ FetchContent_MakeAvailable(googletest)
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
-
+target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
+target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index 696d3bac42..c03c454c68 100644
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -1,3 +1,8 @@
+add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
+add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
+add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
+# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
+add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp
new file mode 100644
index 0000000000..9a22628777
--- /dev/null
+++ b/example/01_gemm/gemm_dl_fp16.cpp
@@ -0,0 +1,209 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_dl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+
+template
+using S = ck::Sequence;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType = ck::half_t;
+using BDataType = ck::half_t;
+using CDataType = ck::half_t;
+using AccDataType = float;
+
+using ALayout = Col;
+using BLayout = Row;
+using CLayout = Row;
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::
+ // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
+ // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+ // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
+ // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ DeviceGemmDl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+ ReferenceGemm;
+
+int main(int argc, char* argv[])
+{
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
+
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 1)
+ {
+ // do nothing
+ }
+ else if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 10)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideC = std::stoi(argv[9]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+ exit(1);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same::value)
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({1, stride}));
+ }
+ };
+
+ Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+ Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ break;
+ case 2:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+ b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+ }
+
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+
+ a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+ b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto c_element_op = CElementOp{};
+
+ // do GEMM
+ auto gemm = DeviceGemmInstance{};
+ auto invoker = gemm.MakeInvoker();
+ auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()),
+ static_cast(b_k_n_device_buf.GetDeviceBuffer()),
+ static_cast(c_m_n_device_buf.GetDeviceBuffer()),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ if(!gemm.IsSupportedArgument(argument))
+ {
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << gemm.GetTypeString() << std::endl;
+
+ c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+ bool pass = true;
+
+ if(do_verification)
+ {
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument = ref_gemm.MakeArgument(
+ a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+ ref_invoker.Run(ref_argument);
+
+ pass = ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ }
+
+ return pass ? 0 : 1;
+}
diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp
new file mode 100644
index 0000000000..32b183a3a1
--- /dev/null
+++ b/example/01_gemm/gemm_dl_fp32.cpp
@@ -0,0 +1,208 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_dl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+
+template
+using S = ck::Sequence;
+
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType = float;
+using BDataType = float;
+using CDataType = float;
+using AccDataType = float;
+
+using ALayout = Col;
+using BLayout = Row;
+using CLayout = Row;
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::
+ // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
+ // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+ // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
+ // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+ ReferenceGemm;
+
+int main(int argc, char* argv[])
+{
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
+
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 1)
+ {
+ // do nothing
+ }
+ else if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 10)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideC = std::stoi(argv[9]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+ exit(1);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same::value)
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({1, stride}));
+ }
+ };
+
+ Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+ Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ break;
+ case 2:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+ b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+ }
+
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+
+ a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+ b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto c_element_op = CElementOp{};
+
+ // do GEMM
+ auto gemm = DeviceGemmInstance{};
+ auto invoker = gemm.MakeInvoker();
+ auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()),
+ static_cast(b_k_n_device_buf.GetDeviceBuffer()),
+ static_cast(c_m_n_device_buf.GetDeviceBuffer()),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ if(!gemm.IsSupportedArgument(argument))
+ {
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << gemm.GetTypeString() << std::endl;
+
+ c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+ bool pass = true;
+
+ if(do_verification)
+ {
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument = ref_gemm.MakeArgument(
+ a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+ ref_invoker.Run(ref_argument);
+
+ pass = ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ }
+
+ return pass ? 0 : 1;
+}
diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp
new file mode 100644
index 0000000000..16c9213104
--- /dev/null
+++ b/example/01_gemm/gemm_dl_int8.cpp
@@ -0,0 +1,206 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_dl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+
+template
+using S = ck::Sequence;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType = int8_t;
+using BDataType = int8_t;
+using CDataType = int8_t;
+using AccDataType = int32_t;
+
+using ALayout = Col;
+using BLayout = Row;
+using CLayout = Row;
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::
+ // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
+ // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+ // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
+ // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ DeviceGemmDl< int8_t, int8_t, int8_t, int32_t, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+ ReferenceGemm;
+
+int main(int argc, char* argv[])
+{
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
+
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 1)
+ {
+ // do nothing
+ }
+ else if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 10)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideC = std::stoi(argv[9]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+ exit(1);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same::value)
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({1, stride}));
+ }
+ };
+
+ Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+ Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ break;
+ case 2:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
+ b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+ }
+
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+
+ a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+ b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto c_element_op = CElementOp{};
+
+ // do GEMM
+ auto gemm = DeviceGemmInstance{};
+ auto invoker = gemm.MakeInvoker();
+ auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()),
+ static_cast(b_k_n_device_buf.GetDeviceBuffer()),
+ static_cast(c_m_n_device_buf.GetDeviceBuffer()),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ if(!gemm.IsSupportedArgument(argument))
+ {
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << gemm.GetTypeString() << std::endl;
+
+ c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+ bool pass = true;
+
+ if(do_verification)
+ {
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument = ref_gemm.MakeArgument(
+ a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+ ref_invoker.Run(ref_argument);
+
+ pass = ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ }
+
+ return pass ? 0 : 1;
+}
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index a4567dcd6e..b126736be6 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -84,13 +84,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
- ReferenceGemm;
+ ReferenceGemm;
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -105,13 +105,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -125,7 +125,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -193,12 +193,12 @@ int main(int argc, char* argv[])
if(!gemm.IsSupportedArgument(argument))
{
- throw std::runtime_error(
- "wrong! device_gemm with the specified compilation parameters does "
- "not support this GEMM problem");
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -232,7 +232,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_f32_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index 9cd40c3976..7e1af4bab2 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -4,7 +4,6 @@
#include
#include
#include
-
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
@@ -29,29 +28,30 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using ADataType = ck::half_t;
-using BDataType = ck::half_t;
-using CDataType = ck::half_t;
-using AccDataType = float;
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using CDataType = F16;
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
#if 1
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
-//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
#elif 0
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_ProducerConsumer_CShuffle
//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| ABBlockTransfer| BlockGemm| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
@@ -70,13 +70,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
- ReferenceGemm;
+ ReferenceGemm;
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -87,17 +87,21 @@ int main(int argc, char* argv[])
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
- if(argc == 4)
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -111,7 +115,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -184,12 +188,12 @@ int main(int argc, char* argv[])
if(!gemm.IsSupportedArgument(argument))
{
- throw std::runtime_error(
- "wrong! device_gemm with the specified compilation parameters does "
- "not support this GEMM problem");
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -214,7 +218,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
new file mode 100644
index 0000000000..7cea68c8b0
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -0,0 +1,238 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_xdl.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+
+template
+using S = ck::Sequence;
+
+using F64 = double;
+
+using ADataType = double;
+using BDataType = double;
+using CDataType = double;
+using AccDataType = double;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
+//##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
+//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
+//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+#if 0
+ < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
+#else
+ < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+#endif
+ // clang-format on
+
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
+
+template
+std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix)
+{
+ os << "[" << std::endl;
+ for(int x = 0; x < matrix.mDesc.GetLengths()[0]; x++)
+ {
+ os << "[";
+ for(int y = 0; y < matrix.mDesc.GetLengths()[1]; y++)
+ {
+ os << std::setw(4) << static_cast(matrix(x, y));
+ }
+ os << "]" << std::endl;
+ }
+ os << "]";
+ return os;
+}
+
+int main(int argc, char* argv[])
+{
+ bool do_verification = 0;
+ int init_method = 0;
+ bool time_kernel = false;
+
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 10)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideC = std::stoi(argv[9]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: run kernel # of times (>1)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+ exit(0);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same::value)
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({1, stride}));
+ }
+ };
+
+ Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+ Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+ std::cout << "data type: " << typeid(ADataType{}).name() << std::endl;
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ break;
+ case 2:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_1{1});
+ b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
+ }
+
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+
+ a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+ b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto c_element_op = CElementOp{};
+
+ // do GEMM
+ auto gemm = DeviceGemmInstance{};
+ auto invoker = gemm.MakeInvoker();
+ auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()),
+ static_cast(b_k_n_device_buf.GetDeviceBuffer()),
+ static_cast(c_m_n_device_buf.GetDeviceBuffer()),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ if(!gemm.IsSupportedArgument(argument))
+ {
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << gemm.GetTypeString() << std::endl;
+
+ c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+ if(do_verification)
+ {
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument = ref_gemm.MakeArgument(
+ a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+ ref_invoker.Run(ref_argument);
+
+#if 0
+ {
+ show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl;
+ show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl;
+ show_2d_matrix(std::cout << "c_device: ", c_m_n_device_result) << std::endl;
+ show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
+ }
+#endif
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
+ }
+
+ return 0;
+}
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index ab5869db61..27fcd62a2c 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -78,14 +78,19 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
// clang-format on
-using ReferenceGemmInstance = ck::tensor_operation::host::
- ReferenceGemm;
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -100,13 +105,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -120,7 +125,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
@@ -189,12 +194,12 @@ int main(int argc, char* argv[])
if(!gemm.IsSupportedArgument(argument))
{
- throw std::runtime_error(
- "wrong! device_gemm with the specified compilation parameters does "
- "not support this GEMM problem");
+ std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+ return 0;
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -219,7 +224,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
return 0;
diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
index 2abebbbac4..1a6e1de4dc 100644
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
@@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D1)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
exit(0);
}
@@ -216,7 +216,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -246,6 +246,8 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
index f3ed2bad37..f91f6ccfc7 100644
--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
@@ -3,89 +3,109 @@
#include
#include
#include
-#include
#include "check_err.hpp"
#include "config.hpp"
-#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
-#include "device_gemm_xdl_c_shuffle_bias_activation.hpp"
-#include "reference_gemm_bias_activation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
template
using S = ck::Sequence;
-using ADataType = ck::half_t;
-using BDataType = ck::half_t;
-using CDataType = ck::half_t;
-using AccDataType = float;
+using F16 = ck::half_t;
+using F32 = float;
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::AddRelu;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation<
- ADataType, // ADataType
- BDataType, // BDataType
- CDataType, // CDataType
- AccDataType, // AccDataType
- ALayout, // ALayout
- BLayout, // BLayout
- CLayout, // CLayout
- AElementOp, // AElementwiseOperation
- BElementOp, // BElementwiseOperation
- CElementOp, // CElementwiseOperation
- 256, // BlockSize
- 256, // MPerBlock
- 128, // NPerBlock
- 4, // K0PerBlock
- 8, // K1
- 32, // MPerXDL
- 32, // NPerXDL
- 4, // MXdlPerWave
- 2, // NXdlPerWave
- S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
- S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // ABlockTransferSrcAccessOrder
- 2, // ABlockTransferSrcVectorDim
- 8, // ABlockTransferSrcScalarPerVector
- 8, // ABlockTransferDstScalarPerVector_K1
- true, // ABlockLdsAddExtraM
- S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
- S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // BBlockTransferSrcAccessOrder
- 2, // BBlockTransferSrcVectorDim
- 8, // BBlockTransferSrcScalarPerVector
- 8, // BBlockTransferDstScalarPerVector_K1
- true, // BBlockLdsAddExtraN
- 1, // CShuffleMXdlPerWavePerShuffle
- 1, // CShuffleNXdlPerWavePerShuffle
- S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
- 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
+// C = A * B
+// E = Relu(C + D);
+struct AddRelu
+{
+ __host__ __device__ void
+ operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d) const
+ {
+ const ck::half_t x = c + d;
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActivation;
+ e = x > 0 ? x : 0;
+ }
+};
+
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F16;
+using DDataType = F16;
+using DsDataType = ck::Tuple;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddRelu;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceOpInstance =
+ ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle,
+ S<1, 0, 2>,
+ S<1, 0, 2>,
+ 2,
+ 8,
+ 8,
+ 1,
+ S<4, 64, 1>,
+ S<1, 0, 2>,
+ S<1, 0, 2>,
+ 2,
+ 8,
+ 8,
+ 1,
+ 1,
+ 1,
+ S<1, 32, 1, 8>,
+ 8>;
int main(int argc, char* argv[])
{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
@@ -94,19 +114,23 @@ int main(int argc, char* argv[])
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
- ck::index_t StrideC = 4096;
+ ck::index_t StrideE = 4096;
- if(argc == 4)
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
@@ -114,14 +138,14 @@ int main(int argc, char* argv[])
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
- StrideC = std::stoi(argv[9]);
+ StrideE = std::stoi(argv[9]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
- printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+ printf("arg3: time kernel (0=no, 1=yes)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
exit(0);
}
@@ -141,17 +165,14 @@ int main(int argc, char* argv[])
Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
- Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
- Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
- // c0_n[n]
- Tensor c0_n(HostTensorDescriptor(
- std::vector({static_cast(N)}), std::vector({1})));
+ Tensor d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
+ Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+ Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
- std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
- std::cout << "c0_n: " << c0_n.mDesc << std::endl;
+ std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+ std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
@@ -159,59 +180,59 @@ int main(int argc, char* argv[])
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
- c0_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ d_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
- c0_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ d_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
- DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace());
+ DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
+ DeviceMem e_m_n_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
- c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
- c0_n_device_buf.ToDevice(c0_n.mData.data());
+ d_m_n_device_buf.ToDevice(d_m_n.mData.data());
- auto a_element_op = AElementOp{};
- auto b_element_op = BElementOp{};
- auto c_element_op = CElementOp{};
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto cde_element_op = CDEElementOp{};
// do GEMM
- auto gemm = DeviceGemmInstance{};
+ auto device_op = DeviceOpInstance{};
- auto invoker = gemm.MakeInvoker();
- auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()),
- static_cast(b_k_n_device_buf.GetDeviceBuffer()),
- static_cast(c_m_n_device_buf.GetDeviceBuffer()),
- static_cast(c0_n_device_buf.GetDeviceBuffer()),
- M,
- N,
- K,
- StrideA,
- StrideB,
- StrideC,
- a_element_op,
- b_element_op,
- c_element_op);
+ auto invoker = device_op.MakeInvoker();
- if(!gemm.IsSupportedArgument(argument))
+ auto argument =
+ device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+ b_k_n_device_buf.GetDeviceBuffer(),
+ std::array{d_m_n_device_buf.GetDeviceBuffer()},
+ e_m_n_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array{0},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ if(!device_op.IsSupportedArgument(argument))
{
- throw std::runtime_error(
- "wrong! device_gemm with the specified compilation parameters does "
- "not support this GEMM problem");
+ throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
- float ave_time = invoker.Run(argument, nrepeat);
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
- std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
- sizeof(CDataType) * M * N + sizeof(CDataType) * N;
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ sizeof(EDataType) * M * N + sizeof(EDataType) * N;
float tflops = static_cast(flop) / 1.E9 / ave_time;
@@ -220,18 +241,38 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
- c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
if(do_verification)
{
+ e_m_n_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+ Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
+
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
- auto ref_argument = ref_gemm.MakeArgument(
- a_m_k, b_k_n, c_m_n_host_result, c0_n, a_element_op, b_element_op, c_element_op);
+ auto ref_argument =
+ ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ for(int m = 0; m < M; ++m)
+ {
+ for(int n = 0; n < N; ++n)
+ {
+ cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+ }
+ }
+
+ return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
}
+
+ return 0;
}
diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
new file mode 100644
index 0000000000..754de47c2b
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -0,0 +1 @@
+add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
diff --git a/example/04_gemm_add_add_fastgelu/README.md b/example/04_gemm_add_add_fastgelu/README.md
new file mode 100644
index 0000000000..08a55fb9a3
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/README.md
@@ -0,0 +1,23 @@
+# Instructions for ```example_gemm_add_add_fastgelu_xdl_fp16```
+
+## Run ```example_gemm_add_add_fastgelu_xdl_fp16```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE"
+./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1
+```
+
+Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
+```
+a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
+b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
+d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1}
+d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
+e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
+launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 1.26914 ms, 101.525 TFlops, 100.804 GB/s, DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8>
+```
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
new file mode 100644
index 0000000000..7db5be0c91
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -0,0 +1,245 @@
+#include
+#include
+#include
+#include
+#include
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
+
+template
+using S = ck::Sequence;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using D0DataType = F16;
+using D1DataType = F16;
+using DsDataType = ck::Tuple;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
+//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
+
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideD1 = 4096;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 12)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideD0 = std::stoi(argv[9]);
+ StrideD1 = std::stoi(argv[10]);
+ StrideE = std::stoi(argv[11]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: time kernel (0=no, 1=yes)\n");
+ printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
+ "StrideE\n");
+ exit(0);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same::value)
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector({row, col}),
+ std::vector({1, stride}));
+ }
+ };
+
+ Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
+ Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
+ Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+ Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+ std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+ std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ d0_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ d1_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
+ d0_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ d1_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
+ }
+
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+ DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
+ DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpace());
+ DeviceMem e_m_n_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+
+ a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+ b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+ d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
+ d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto cde_element_op = CDEElementOp{};
+
+ // do GEMM
+ auto device_op = DeviceOpInstance{};
+ auto invoker = device_op.MakeInvoker();
+ auto argument =
+ device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+ b_k_n_device_buf.GetDeviceBuffer(),
+ std::array{d0_m_n_device_buf.GetDeviceBuffer(),
+ d1_m_n_device_buf.GetDeviceBuffer()},
+ e_m_n_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array{StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ if(!device_op.IsSupportedArgument(argument))
+ {
+ throw std::runtime_error("wrong! this device_op instance does not support this problem");
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
+ sizeof(EDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << device_op.GetTypeString() << std::endl;
+
+ if(do_verification)
+ {
+ Tensor c_m_n(HostTensorDescriptor(
+ std::vector{static_cast(M), static_cast(N)}));
+
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
+
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument =
+ ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+ ref_invoker.Run(ref_argument);
+
+ for(int m = 0; m < M; ++m)
+ {
+ for(int n = 0; n < N; ++n)
+ {
+ cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+ }
+ }
+
+ e_m_n_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+ return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+ }
+
+ return 0;
+}
diff --git a/example/04_gemm_bias_relu_add/CMakeLists.txt b/example/04_gemm_bias_relu_add/CMakeLists.txt
deleted file mode 100644
index 4f48db94a8..0000000000
--- a/example/04_gemm_bias_relu_add/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_example_executable(example_gemm_xdl_bias_relu_add gemm_xdl_bias_relu_add.cpp)
diff --git a/example/04_gemm_bias_relu_add/README.md b/example/04_gemm_bias_relu_add/README.md
deleted file mode 100644
index f8d9bd6152..0000000000
--- a/example/04_gemm_bias_relu_add/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# Instructions for ```example_gemm_xdl_bias_relu_add```
-
-## Run ```example_gemm_xdl_bias_relu_add```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
-./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-c1_m_n: dim 2, lengths {3840, 4096}, strides {1, 0}
-arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
-arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-arg.c0_grid_desc_m_n_{ 3840, 4096}
-arg.c1_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.27583 ms, 100.992 TFlops, 73.9688 GB/s
-```
diff --git a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp b/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
deleted file mode 100644
index 9405c36881..0000000000
--- a/example/04_gemm_bias_relu_add/gemm_xdl_bias_relu_add.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "check_err.hpp"
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
-#include "device_gemm_xdl_c_shuffle_bias_activation_add.hpp"
-#include "reference_gemm_bias_activation_add.hpp"
-
-template
-using S = ck::Sequence;
-
-using ADataType = ck::half_t;
-using BDataType = ck::half_t;
-using CDataType = ck::half_t;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::AddReluAdd;
-
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation_Add<
- ADataType, // ADataType
- BDataType, // BDataType
- CDataType, // CDataType
- AccDataType, // AccDataType
- ALayout, // ALayout
- BLayout, // BLayout
- CLayout, // CLayout
- AElementOp, // AElementwiseOperation
- BElementOp, // BElementwiseOperation
- CElementOp, // CElementwiseOperation
- 256, // BlockSize
- 256, // MPerBlock
- 128, // NPerBlock
- 4, // K0PerBlock
- 8, // K1
- 32, // MPerXDL
- 32, // NPerXDL
- 4, // MXdlPerWave
- 2, // NXdlPerWave
- S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
- S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // ABlockTransferSrcAccessOrder
- 2, // ABlockTransferSrcVectorDim
- 8, // ABlockTransferSrcScalarPerVector
- 8, // ABlockTransferDstScalarPerVector_K1
- true, // ABlockLdsAddExtraM
- S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
- S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // BBlockTransferSrcAccessOrder
- 2, // BBlockTransferSrcVectorDim
- 8, // BBlockTransferSrcScalarPerVector
- 8, // BBlockTransferDstScalarPerVector_K1
- true, // BBlockLdsAddExtraN
- 1, // CShuffleMXdlPerWavePerShuffle
- 1, // CShuffleNXdlPerWavePerShuffle
- S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
- 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
-
-using ReferenceGemmInstance =
- ck::tensor_operation::host::ReferenceGemmBiasActivationAdd;
-int main(int argc, char* argv[])
-{
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
-
- // GEMM shape
- ck::index_t M = 3840;
- ck::index_t N = 4096;
- ck::index_t K = 4096;
-
- ck::index_t StrideA = 4096;
- ck::index_t StrideB = 4096;
- ck::index_t StrideC = 4096;
- ck::index_t StrideC1 = 4096;
-
- if(argc == 4)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
- }
- else if(argc == 11)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
-
- M = std::stoi(argv[4]);
- N = std::stoi(argv[5]);
- K = std::stoi(argv[6]);
-
- StrideA = std::stoi(argv[7]);
- StrideB = std::stoi(argv[8]);
- StrideC = std::stoi(argv[9]);
- StrideC1 = std::stoi(argv[10]);
- }
- else
- {
- printf("arg1: verification (0=no, 1=yes)\n");
- printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: run kernel # of times (>1)\n");
- printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, StrideC1\n");
- exit(0);
- }
-
- auto f_host_tensor_descriptor =
- [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
- if(std::is_same::value)
- {
- return HostTensorDescriptor(std::vector({row, col}),
- std::vector({stride, 1}));
- }
- else
- {
- return HostTensorDescriptor(std::vector({row, col}),
- std::vector({1, stride}));
- }
- };
-
- Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
- Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
- Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
- Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
- // c0_n[n]
- Tensor c0_n(HostTensorDescriptor(
- std::vector({static_cast(N)}), std::vector({1})));
-
- // c1_m_n[m ,n]
- Tensor c1_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
- std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
- std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
- std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
- std::cout << "c0_n: " << c0_n.mDesc << std::endl;
- std::cout << "c1_m_n: " << c1_m_n.mDesc << std::endl;
-
- switch(init_method)
- {
- case 0: break;
- case 1:
- a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
- b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
- c0_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
- c1_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
- break;
- default:
- a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
- b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
- c0_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
- c1_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0});
- }
-
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
- DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace());
- DeviceMem c1_m_n_device_buf(sizeof(CDataType) * c1_m_n.mDesc.GetElementSpace());
-
- a_m_k_device_buf.ToDevice(a_m_k.mData.data());
- b_k_n_device_buf.ToDevice(b_k_n.mData.data());
- c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
- c0_n_device_buf.ToDevice(c0_n.mData.data());
- c1_m_n_device_buf.ToDevice(c1_m_n.mData.data());
-
- auto a_element_op = AElementOp{};
- auto b_element_op = BElementOp{};
- auto c_element_op = CElementOp{};
-
- // do GEMM
- auto gemm = DeviceGemmInstance{};
-
- auto invoker = gemm.MakeInvoker();
- auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()),
- static_cast(b_k_n_device_buf.GetDeviceBuffer()),
- static_cast(c_m_n_device_buf.GetDeviceBuffer()),
- static_cast(c0_n_device_buf.GetDeviceBuffer()),
- static_cast(c1_m_n_device_buf.GetDeviceBuffer()),
- M,
- N,
- K,
- StrideA,
- StrideB,
- StrideC,
- StrideC1,
- a_element_op,
- b_element_op,
- c_element_op);
-
- if(!gemm.IsSupportedArgument(argument))
- {
- throw std::runtime_error(
- "wrong! device_gemm with the specified compilation parameters does "
- "not support this GEMM problem");
- }
-
- float ave_time = invoker.Run(argument, nrepeat);
-
- std::size_t flop = std::size_t(2) * M * N * K;
- std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
- sizeof(CDataType) * M * N + sizeof(CDataType) * N +
- sizeof(CDataType) * M * N;
-
- float tflops = static_cast(flop) / 1.E9 / ave_time;
-
- float gb_per_sec = num_btype / 1.E6 / ave_time;
-
- std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
- << std::endl;
-
- c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
- if(do_verification)
- {
- auto ref_gemm = ReferenceGemmInstance{};
- auto ref_invoker = ref_gemm.MakeInvoker();
-
- auto ref_argument = ref_gemm.MakeArgument(a_m_k,
- b_k_n,
- c_m_n_host_result,
- c0_n,
- c1_m_n,
- a_element_op,
- b_element_op,
- c_element_op);
-
- ref_invoker.Run(ref_argument);
-
- ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
- }
-}
diff --git a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
index df8f70606c..4e1dd1f3e6 100644
--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
@@ -1,2 +1,2 @@
add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 751ce16b90..d50afb6854 100644
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -7,7 +7,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
#include "device_tensor.hpp"
@@ -93,7 +93,7 @@ void PrintUseMsg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: run kernel # of times (>1)\n"
+ << "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< " N, K, C, \n"
<< " , (ie Y, X for 2D)\n"
@@ -120,40 +120,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
ck::utils::conv::ConvParams params;
int arg_idx = 4;
- params.num_dim_spatial = num_dim_spatial;
- params.N = std::stoi(argv[arg_idx++]);
- params.K = std::stoi(argv[arg_idx++]);
- params.C = std::stoi(argv[arg_idx++]);
+ params.num_dim_spatial_ = num_dim_spatial;
+ params.N_ = std::stoi(argv[arg_idx++]);
+ params.K_ = std::stoi(argv[arg_idx++]);
+ params.C_ = std::stoi(argv[arg_idx++]);
- params.filter_spatial_lengths.resize(num_dim_spatial);
+ params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_spatial_lengths.resize(num_dim_spatial);
+ params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+ params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_strides.resize(num_dim_spatial);
+ params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
- params.conv_filter_dilations.resize(num_dim_spatial);
+ params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+ params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_left_pads.resize(num_dim_spatial);
+ params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
- params.input_right_pads.resize(num_dim_spatial);
+ params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
- params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+ params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
- bool do_verification = 0;
- int init_method = 0;
- int nrepeat = 5;
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -176,7 +176,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
- nrepeat = std::stoi(argv[3]);
+ time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
@@ -184,21 +184,21 @@ int main(int argc, char* argv[])
params = ParseConvParams(argc, argv);
}
- std::vector input_dims{static_cast(params.N),
- static_cast(params.C)};
+ std::vector input_dims{static_cast(params.N_),
+ static_cast(params.C_)};
input_dims.insert(std::end(input_dims),
- std::begin(params.input_spatial_lengths),
- std::end(params.input_spatial_lengths));
+ std::begin(params.input_spatial_lengths_),
+ std::end(params.input_spatial_lengths_));
- std::vector filter_dims{static_cast(params.K),
- static_cast(params.C)};
+ std::vector filter_dims{static_cast(params.K_),
+ static_cast(params.C_)};
filter_dims.insert(std::end(filter_dims),
- std::begin(params.filter_spatial_lengths),
- std::end(params.filter_spatial_lengths));
+ std::begin(params.filter_spatial_lengths_),
+ std::end(params.filter_spatial_lengths_));
const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths();
- std::vector output_dims{static_cast(params.N),
- static_cast(params.K)};
+ std::vector output_dims{static_cast(params.N_),
+ static_cast(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -211,7 +211,7 @@ int main(int argc, char* argv[])
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
// bias: assume contiguous 1d vector
Tensor bias(
- HostTensorDescriptor(std::vector({static_cast(params.K)})));
+ HostTensorDescriptor(std::vector({static_cast