mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
sync with upstream
This commit is contained in:
12
.github/CODEOWNERS
vendored
12
.github/CODEOWNERS
vendored
@@ -1,7 +1,7 @@
|
||||
* @zjing14 @asroy @junliume @illsilin @carlushuang @aosewski
|
||||
* @zjing14 @junliume @illsilin @carlushuang @aosewski
|
||||
# Documentation files
|
||||
docs/* @saadrahim @LisaDelaney
|
||||
*.md @saadrahim @LisaDelaney
|
||||
*.rst @saadrahim @LisaDelaney
|
||||
# Header directory
|
||||
library/include/* @saadrahim @LisaDelaney
|
||||
docs/* @ROCm/rocm-documentation
|
||||
*.md @ROCm/rocm-documentation
|
||||
*.rst @ROCm/rocm-documentation
|
||||
# Header directory for Doxygen documentation
|
||||
library/include/* @ROCm/rocm-documentation
|
||||
|
||||
27
CHANGELOG.md
27
CHANGELOG.md
@@ -2,20 +2,27 @@
|
||||
|
||||
Full documentation for Composable Kernel is not yet available.
|
||||
|
||||
## (Unreleased) CK
|
||||
|
||||
### Fixes
|
||||
None
|
||||
|
||||
### Optimizations
|
||||
None
|
||||
## CK for ROCm 6.1.0
|
||||
|
||||
### Additions
|
||||
* Introduced wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126, #1139)
|
||||
* Added generic instances for GEMM XDL operations (#1161)
|
||||
* Added gamma and beta parameters for the layernorm and groupnorm bwd operations (#1133)
|
||||
* Introduced wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126)
|
||||
* Added an option to vary the number of warm-up cycles and iterations for ckProfiler (#1124)
|
||||
|
||||
### Optimizations
|
||||
* New performance optimizations for GEMM operations on MI200 and MI300 architectures (#1135)
|
||||
|
||||
### Fixes
|
||||
* Reduced the build time for most GPU architectures (#1084)
|
||||
* Fixed some conversion issues for fp8 data type (#1099)
|
||||
|
||||
### Changes
|
||||
None
|
||||
|
||||
### Known issues
|
||||
None
|
||||
|
||||
## CK for ROCm 6.0.0
|
||||
|
||||
### Fixes
|
||||
@@ -32,7 +39,7 @@ None
|
||||
* Grouped convolution support for small K and C (#822 #879 #897)
|
||||
* Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
|
||||
* Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
|
||||
* Support for Batched Gemm DL (#732)
|
||||
* Support for Batched GEMM DL (#732)
|
||||
|
||||
### Changes
|
||||
* Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
|
||||
@@ -48,7 +55,7 @@ None
|
||||
|
||||
### Additions
|
||||
* New CMake flags:
|
||||
* "DL_KERNELS"-* Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
|
||||
* "DL_KERNELS"-* Must be set to "ON" in order to build the GEMM DL and batched_gemm_multi_d_dl instances
|
||||
* "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
|
||||
* "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
|
||||
* New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
|
||||
|
||||
44
Dockerfile
44
Dockerfile
@@ -3,6 +3,7 @@ ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG ROCMVERSION=6.0
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
ARG CK_SCCACHE=""
|
||||
|
||||
RUN set -xe
|
||||
|
||||
@@ -16,35 +17,37 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
|
||||
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
|
||||
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
|
||||
|
||||
RUN if [ "$ROCMVERSION" != "6.0.1" ]; then \
|
||||
RUN if [ "$ROCMVERSION" != "6.1" ]; then \
|
||||
sh -c "wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb && \
|
||||
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
|
||||
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
|
||||
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
|
||||
elif [ "$ROCMVERSION" = "6.0.1" ] && [ "$compiler_version" = "rc1" ]; then \
|
||||
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.0-20.04-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.0-20.04-1_all.deb && \
|
||||
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.0.1 rel-95 > /etc/apt/sources.list.d/rocm-build.list' && \
|
||||
amdgpu-repo --amdgpu-build=1704947; \
|
||||
elif [ "$ROCMVERSION" = "6.1" ] && [ "$compiler_version" = "rc2" ]; then \
|
||||
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.1-20.04-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.1-20.04-1_all.deb && \
|
||||
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.1 rel-48 > /etc/apt/sources.list.d/rocm-build.list' && \
|
||||
amdgpu-repo --amdgpu-build=1736298; \
|
||||
fi
|
||||
|
||||
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
|
||||
RUN amdgpu-install -y --usecase=rocm --no-dkms
|
||||
|
||||
## Sccache binary built from source for ROCm
|
||||
## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
|
||||
ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
|
||||
ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
|
||||
RUN mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
|
||||
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
|
||||
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache
|
||||
ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
|
||||
ENV CK_SCCACHE=$CK_SCCACHE
|
||||
RUN if [ "$CK_SCCACHE" != "" ]; then \
|
||||
mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
|
||||
curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
|
||||
chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \
|
||||
fi
|
||||
|
||||
# Install dependencies
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
|
||||
build-essential \
|
||||
cmake \
|
||||
ccache \
|
||||
git \
|
||||
hip-rocclr \
|
||||
iputils-ping \
|
||||
@@ -74,13 +77,22 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# hipTensor requires rocm-llvm-dev for rocm versions > 6.0.1
|
||||
RUN if [ "$ROCMVERSION" = "6.1" ]; then \
|
||||
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev"; \
|
||||
fi
|
||||
# Update the cmake to version 3.27.5
|
||||
RUN pip install --upgrade cmake==3.27.5
|
||||
|
||||
#Install latest ccache
|
||||
RUN git clone https://github.com/ccache/ccache.git && \
|
||||
cd ccache && mkdir build && cd build && cmake .. && make install
|
||||
|
||||
#Install ninja build tracing tools
|
||||
RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
|
||||
RUN gunzip /usr/local/bin/ninja.gz
|
||||
RUN chmod a+x /usr/local/bin/ninja
|
||||
RUN git clone https://github.com/nico/ninjatracing.git
|
||||
# Update the cmake to the latest version
|
||||
RUN pip install --upgrade cmake==3.27.5
|
||||
|
||||
#Install latest cppcheck
|
||||
RUN git clone https://github.com/danmar/cppcheck.git && \
|
||||
@@ -111,7 +123,7 @@ ENV LANG=C.UTF-8
|
||||
RUN groupadd -f render
|
||||
|
||||
# Install the new rocm-cmake version
|
||||
RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \
|
||||
RUN git clone -b master https://github.com/ROCm/rocm-cmake.git && \
|
||||
cd rocm-cmake && mkdir build && cd build && \
|
||||
cmake .. && cmake --build . && cmake --build . --target install
|
||||
|
||||
@@ -123,7 +135,7 @@ RUN sh -c "echo compiler version = '$compiler_version'"
|
||||
RUN sh -c "echo compiler commit = '$compiler_commit'"
|
||||
|
||||
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" = "" ]; then \
|
||||
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
|
||||
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
|
||||
cd llvm-project && mkdir build && cd build && \
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
|
||||
make -j 8 ; \
|
||||
@@ -131,7 +143,7 @@ RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd
|
||||
fi
|
||||
|
||||
RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline-open" ] ) && [ "$compiler_commit" != "" ]; then \
|
||||
git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
|
||||
git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
|
||||
cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
|
||||
make -j 8 ; \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -92,7 +92,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -88,7 +88,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -89,7 +89,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -84,7 +84,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -85,7 +85,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -17,6 +17,8 @@
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
|
||||
using ADataType = F16;
|
||||
using BDataType = F16;
|
||||
using BiasDataType = F32;
|
||||
@@ -191,7 +193,7 @@ int main()
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
@@ -78,7 +78,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
@@ -7,6 +7,14 @@ endif()
|
||||
if((DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES)
|
||||
add_executable(client_conv3d_fwd_fp16_comp_fp8 conv3d_fwd_fp16_comp_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_fp16_comp_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_conv3d_fwd_fp8 conv3d_fwd_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
|
||||
if((DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
|
||||
add_executable(client_conv3d_fwd_bf8 conv3d_fwd_bf8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
endif()
|
||||
|
||||
if((DTYPES MATCHES "fp32") OR NOT DEFINED DTYPES)
|
||||
|
||||
46
client_example/16_convnd_fwd/conv3d_fwd_bf8.cpp
Normal file
46
client_example/16_convnd_fwd/conv3d_fwd_bf8.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::bf8_t;
|
||||
using WeiDataType = ck::bf8_t;
|
||||
using OutDataType = ck::f8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
ck::bf8_t>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
|
||||
46
client_example/16_convnd_fwd/conv3d_fwd_fp8.cpp
Normal file
46
client_example/16_convnd_fwd/conv3d_fwd_fp8.cpp
Normal file
@@ -0,0 +1,46 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::f8_t;
|
||||
using WeiDataType = ck::f8_t;
|
||||
using OutDataType = ck::f8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
ck::f8_t>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
@@ -88,7 +88,7 @@ int main(int argc, char* argv[])
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
@@ -79,7 +79,7 @@ int main()
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -6,3 +6,6 @@ target_link_libraries(client_grouped_gemm_fixed_nk_fp8 PRIVATE composable_kernel
|
||||
|
||||
add_executable(client_grouped_gemm_fixed_nk_i8 grouped_gemm_fixed_nk_i8.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_i8 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
add_executable(client_grouped_gemm_fixed_nk_bf16 grouped_gemm_fixed_nk_bf16.cpp)
|
||||
target_link_libraries(client_grouped_gemm_fixed_nk_bf16 PRIVATE composable_kernel::device_gemm_operations)
|
||||
|
||||
237
client_example/22_grouped_gemm/grouped_gemm_fixed_nk_bf16.cpp
Normal file
237
client_example/22_grouped_gemm/grouped_gemm_fixed_nk_bf16.cpp
Normal file
@@ -0,0 +1,237 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp"
|
||||
|
||||
using I8 = int8_t;
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F32 = float;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using ADataType = BF16;
|
||||
using BDataType = I8;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = BF16;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Row;
|
||||
using DsLayout = ck::Tuple<>;
|
||||
using ELayout = Row;
|
||||
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using CDEElementOp = PassThrough;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
|
||||
|
||||
int sum_of_m = 0;
|
||||
|
||||
const int group_count = 16;
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
Ms.push_back(256 + 256 * i);
|
||||
Ns.push_back(128 + 128 * i);
|
||||
Ks.push_back(128 + 64 * i);
|
||||
|
||||
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
|
||||
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
|
||||
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
|
||||
|
||||
sum_of_m += Ms[i];
|
||||
}
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, e_dev_bufs;
|
||||
|
||||
a_dev_bufs.reserve(group_count);
|
||||
b_dev_bufs.reserve(group_count);
|
||||
e_dev_bufs.reserve(group_count);
|
||||
|
||||
std::vector<void*> p_e;
|
||||
|
||||
p_e.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
|
||||
|
||||
gemm_descs.reserve(group_count);
|
||||
|
||||
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
|
||||
grouped_gemm_kernel_args_;
|
||||
grouped_gemm_kernel_args_.reserve(group_count);
|
||||
|
||||
for(int i = 0; i < group_count; ++i)
|
||||
{
|
||||
a_dev_bufs.emplace_back(sizeof(ADataType) *
|
||||
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
|
||||
b_dev_bufs.emplace_back(sizeof(BDataType) *
|
||||
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
|
||||
e_dev_bufs.emplace_back(sizeof(EDataType) *
|
||||
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
|
||||
|
||||
gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
|
||||
|
||||
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
|
||||
|
||||
grouped_gemm_kernel_args_.push_back({a_dev_bufs[i].GetDeviceBuffer(),
|
||||
b_dev_bufs[i].GetDeviceBuffer(),
|
||||
{},
|
||||
e_dev_bufs[i].GetDeviceBuffer(),
|
||||
Ms[i],
|
||||
Ns[i],
|
||||
Ks[i],
|
||||
StrideAs[i],
|
||||
StrideBs[i],
|
||||
{},
|
||||
StrideEs[i]});
|
||||
}
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = 0;
|
||||
float best_tflops = 0;
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::vector<const void*> p_a = {}, p_b = {};
|
||||
std::vector<std::array<const void*, 0>> p_ds = {};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
SimpleDeviceMem grouped_gemm_kernel_args_dev(
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
|
||||
|
||||
SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
|
||||
grouped_gemm_kernel_args_.data(),
|
||||
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
op_ptr->SetWorkSpacePointer(argument_ptr.get(),
|
||||
grouped_gemm_workspace_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
|
||||
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
|
||||
|
||||
op_ptr->SetKBatch(argument_ptr.get(), 1);
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = 0, num_btype = 0;
|
||||
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
|
||||
{
|
||||
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
|
||||
|
||||
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
|
||||
sizeof(EDataType) * Ms[j] * Ns[j];
|
||||
}
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
|
||||
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
@@ -76,7 +76,7 @@ int main()
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
@@ -77,7 +77,7 @@ int main()
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
@@ -77,7 +77,7 @@ int main()
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
if constexpr(std::is_same<Layout, Row>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
48
client_example/24_grouped_conv_activation/CMakeLists.txt
Normal file
48
client_example/24_grouped_conv_activation/CMakeLists.txt
Normal file
@@ -0,0 +1,48 @@
|
||||
# Fwd scaleadd scaleadd relu
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
|
||||
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16
|
||||
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16
|
||||
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8
|
||||
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd scaleadd AB
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp32
|
||||
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp16
|
||||
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_ab_bf16
|
||||
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_bf16 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_ab_int8
|
||||
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd bilinear
|
||||
add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
|
||||
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Bwd data bilinear
|
||||
add_executable(client_grouped_convnd_bwd_data_bilinear_residual_fp16
|
||||
grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_bwd_data_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd scale
|
||||
add_executable(client_grouped_convnd_fwd_scale_fp16
|
||||
grouped_convnd_fwd_scale/grouped_conv_fwd_scale_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scale_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Bwd data scale
|
||||
add_executable(client_grouped_convnd_bwd_data_scale_fp16
|
||||
grouped_convnd_bwd_data_scale/grouped_conv_bwd_data_scale_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_bwd_data_scale_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
@@ -0,0 +1,217 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_bilinear.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::half_t>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Z = 3; // filter D
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Di = 14; // input D
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Do = 14; // output D
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_bwd_data_bilinear()
|
||||
{
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Di * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Do * Ho * Wo * K);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<InLayout>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<InDataType>,
|
||||
InDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{in.GetDeviceBuffer()},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{in_lengths},
|
||||
{in_strides},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Bilinear{2.f, 2.f});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Y * X +
|
||||
3 * G * N * Di * Hi * Wi * C;
|
||||
std::size_t num_bytes = 2 * sizeof(InDataType) * G * N * Di * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Z * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Do * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{in.GetDeviceBuffer()},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{in_lengths},
|
||||
{in_strides},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Bilinear{2.f, 2.f});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main() { return execute_conv_bwd_data_bilinear(); }
|
||||
@@ -0,0 +1,216 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_scale.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::half_t>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Z = 3; // filter D
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Di = 14; // input D
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Do = 14; // output D
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_bwd_data_scale()
|
||||
{
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Di * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Do * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
InDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Scale{2.f});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Y * X +
|
||||
3 * G * N * Di * Hi * Wi * C;
|
||||
std::size_t num_bytes = 2 * sizeof(InDataType) * G * N * Di * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Z * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Do * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
in.GetDeviceBuffer(),
|
||||
out_lengths,
|
||||
out_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
in_lengths,
|
||||
in_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Scale{2.f});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Program entry point: forward the example's exit status to the shell.
int main()
{
    const int exit_code = execute_conv_bwd_data_scale();
    return exit_code;
}
|
||||
@@ -0,0 +1,221 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::half_t>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Z = 3; // filter D
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Di = 14; // input D
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Do = 14; // output D
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_fwd_bilinear()
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
|
||||
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, 6> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, 6> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, 6> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
|
||||
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
|
||||
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<OutDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Bilinear>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{out.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{out_lengths},
|
||||
{out_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Bilinear{2.f, 2.f});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop =
|
||||
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 3 * N * Ho * Wo * G * K;
|
||||
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
|
||||
sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
sizeof(OutDataType) * 2 * N * Ho * Wo * G * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{out.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{out_lengths},
|
||||
{out_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Bilinear{2.f, 2.f});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Program entry point: forward the example's exit status to the shell.
int main()
{
    const int exit_code = execute_conv_fwd_bilinear();
    return exit_code;
}
|
||||
@@ -0,0 +1,220 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::half_t>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Z = 3; // filter D
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Di = 14; // input D
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Do = 14; // output D
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_fwd_scale()
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
|
||||
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, 6> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, 6> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, 6> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
|
||||
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
|
||||
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Scale>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Scale{2.f});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop =
|
||||
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 3 * N * Ho * Wo * G * K;
|
||||
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
|
||||
sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
sizeof(OutDataType) * 2 * N * Ho * Wo * G * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
Scale{2.f});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Program entry point: forward the example's exit status to the shell.
int main()
{
    const int exit_code = execute_conv_fwd_scale();
    return exit_code;
}
|
||||
@@ -0,0 +1,221 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Z = 3; // filter D
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Di = 14; // input D
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Do = 14; // output D
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_fwd_scaleadd_ab()
|
||||
{
|
||||
constexpr ck::index_t NumAs = 2;
|
||||
constexpr ck::index_t NumBs = 2;
|
||||
|
||||
constexpr float scale = 1.5f;
|
||||
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
|
||||
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, 6> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, 6> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, 6> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
using InputDtype = ck::tuple_element_t<0, InDataType>;
|
||||
using InputBiasDtype = ck::tuple_element_t<1, InDataType>;
|
||||
using WeightDtype = ck::tuple_element_t<0, WeiDataType>;
|
||||
using WeightBiasDtype = ck::tuple_element_t<1, WeiDataType>;
|
||||
|
||||
SimpleDeviceMem in(sizeof(InputDtype) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem in_bias(sizeof(InputBiasDtype) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeightDtype) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem wei_bias(sizeof(WeightBiasDtype) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
ScaleAdd,
|
||||
ScaleAdd,
|
||||
PassThrough>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::array<const void*, NumAs> as = {in.GetDeviceBuffer(), in_bias.GetDeviceBuffer()};
|
||||
std::array<const void*, NumBs> bs = {wei.GetDeviceBuffer(), wei_bias.GetDeviceBuffer()};
|
||||
std::array<const void*, 0> ds{};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(as,
|
||||
bs,
|
||||
ds,
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
ScaleAdd{scale},
|
||||
ScaleAdd{scale},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X +
|
||||
N * Di * Hi * Wi * G * C + G * K * Z * Y * X * C;
|
||||
std::size_t num_bytes = 2 * sizeof(InDataType) * N * Di * Hi * Wi * G * C +
|
||||
2 * sizeof(WeiDataType) * G * K * Z * Y * X * C +
|
||||
sizeof(OutDataType) * N * Do * Ho * Wo * G * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(as,
|
||||
bs,
|
||||
ds,
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
ScaleAdd{scale},
|
||||
ScaleAdd{scale},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
|
||||
using WeiDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
|
||||
using OutDataType = ck::bhalf_t;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_ab.inc"
|
||||
|
||||
// Program entry point (bf16 driver): forward the example's exit status.
int main()
{
    const int exit_code = execute_conv_fwd_scaleadd_ab();
    return exit_code;
}
|
||||
@@ -0,0 +1,13 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = ck::Tuple<ck::half_t, ck::half_t>;
|
||||
using WeiDataType = ck::Tuple<ck::half_t, ck::half_t>;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_ab.inc"
|
||||
|
||||
// Program entry point (fp16 driver): forward the example's exit status.
int main()
{
    const int exit_code = execute_conv_fwd_scaleadd_ab();
    return exit_code;
}
|
||||
@@ -0,0 +1,13 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = ck::Tuple<float, float>;
|
||||
using WeiDataType = ck::Tuple<float, float>;
|
||||
using OutDataType = float;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_ab.inc"
|
||||
|
||||
// Program entry point (fp32 driver): forward the example's exit status.
int main()
{
    const int exit_code = execute_conv_fwd_scaleadd_ab();
    return exit_code;
}
|
||||
@@ -0,0 +1,13 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = ck::Tuple<int8_t, int8_t>;
|
||||
using WeiDataType = ck::Tuple<int8_t, int8_t>;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_ab.inc"
|
||||
|
||||
// Program entry point (int8 driver): forward the example's exit status.
int main()
{
    const int exit_code = execute_conv_fwd_scaleadd_ab();
    return exit_code;
}
|
||||
@@ -0,0 +1,216 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 64; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 32; // input channel (per group)
|
||||
static constexpr ck::index_t Z = 3; // filter D
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Di = 14; // input D
|
||||
static constexpr ck::index_t Hi = 14; // input H
|
||||
static constexpr ck::index_t Wi = 14; // input W
|
||||
static constexpr ck::index_t Do = 14; // output D
|
||||
static constexpr ck::index_t Ho = 14; // output H
|
||||
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_fwd_scaleadd_scaleadd_relu()
|
||||
{
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
|
||||
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, 6> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, 6> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, 6> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
|
||||
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
|
||||
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
|
||||
SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
|
||||
SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout, BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<std::tuple_element_t<0, DDataTypes>, std::tuple_element_t<1, DDataTypes>>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
ScaleAddScaleAddRelu>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{out_lengths, bias_lengths},
|
||||
{out_strides, bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
ScaleAddScaleAddRelu{2.f, 2.f});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop =
|
||||
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 2 * N * Ho * Wo * G * K;
|
||||
std::size_t num_bytes =
|
||||
sizeof(InDataType) * N * Hi * Wi * G * C + sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
(sizeof(OutDataType) + sizeof(std::tuple_element_t<0, DDataTypes>) +
|
||||
sizeof(std::tuple_element_t<1, DDataTypes>)) *
|
||||
N * Ho * Wo * G * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{out_lengths, bias_lengths},
|
||||
{out_strides, bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
ScaleAddScaleAddRelu{2.f, 2.f});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = ck::bhalf_t;
|
||||
using WeiDataType = ck::bhalf_t;
|
||||
using OutDataType = ck::bhalf_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::bhalf_t, ck::bhalf_t>;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
|
||||
|
||||
// Entry point: runs the grouped conv fwd + ScaleAdd/ScaleAdd/Relu example for bf16.
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<ck::half_t, ck::half_t>;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
|
||||
|
||||
// Entry point: runs the grouped conv fwd + ScaleAdd/ScaleAdd/Relu example for fp16.
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = float;
|
||||
using WeiDataType = float;
|
||||
using OutDataType = float;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<float, float>;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
|
||||
|
||||
// Entry point: runs the grouped conv fwd + ScaleAdd/ScaleAdd/Relu example for fp32.
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using OutDataType = int8_t;
|
||||
// Use std tuple instead of ck tuple to avoid clang
|
||||
// implicit instantiation of undefined template error.
|
||||
using DDataTypes = std::tuple<float, float>;
|
||||
|
||||
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
|
||||
|
||||
// Entry point: runs the grouped conv fwd + ScaleAdd/ScaleAdd/Relu example for int8.
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
|
||||
@@ -2,3 +2,11 @@ add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrap
|
||||
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
|
||||
add_executable(client_wrapper_img2col wrapper_img2col.cpp)
|
||||
target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations)
|
||||
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
|
||||
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
|
||||
GPU_TARGETS MATCHES "gfx942")
|
||||
add_executable(client_wrapper_basic_gemm wrapper_basic_gemm.cpp)
|
||||
target_link_libraries(client_wrapper_basic_gemm PRIVATE composable_kernel::device_other_operations)
|
||||
add_executable(client_wrapper_optimized_gemm wrapper_optimized_gemm.cpp)
|
||||
target_link_libraries(client_wrapper_optimized_gemm PRIVATE composable_kernel::device_other_operations)
|
||||
endif()
|
||||
|
||||
177
client_example/25_wrapper/README.md
Normal file
177
client_example/25_wrapper/README.md
Normal file
@@ -0,0 +1,177 @@
|
||||
# Composable Kernel wrapper GEMM tutorial
|
||||
|
||||
This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
|
||||
wrapper. We present the base version of GEMM without most of the available optimizations; however,
|
||||
it's worth noting that CK has kernels with different optimizations.
|
||||
|
||||
To implement these optimizations, you can use the CK wrapper or directly use available instances in
|
||||
CK. You can also refer to the
|
||||
[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
|
||||
that uses CK wrapper based on the
|
||||
[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
|
||||
|
||||
The kernel definition should look similar to:
|
||||
|
||||
```cpp
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout>
|
||||
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
|
||||
const void* p_b,
|
||||
void* p_c,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayout thread_layout)
|
||||
```
|
||||
|
||||
We pass pointers to global memory and matrix dimensions via arguments. Additionally, we pass
|
||||
selected lengths of processed data through each block (`tile_shape`) and thread layout
|
||||
(`thread_layout`). For compilation time parameters, we define the data type,
|
||||
[traits for the GEMM operation](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp)
|
||||
and scalar per vector value during copy.
|
||||
|
||||
Step 1: Create layouts for global and LDS memory.
|
||||
|
||||
```cpp
|
||||
// Specify layouts for global memory.
|
||||
const auto a_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
|
||||
const auto b_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
|
||||
const auto c_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
|
||||
|
||||
// Specify layouts for tiles.
|
||||
constexpr auto a_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
|
||||
constexpr auto b_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
|
||||
constexpr auto c_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
|
||||
|
||||
// Apply padding for global memory.
|
||||
auto a_global_layout_padded = ck::wrapper::pad(a_global_layout, shape(a_tile_layout));
|
||||
auto b_global_layout_padded = ck::wrapper::pad(b_global_layout, shape(b_tile_layout));
|
||||
auto c_global_layout_padded = ck::wrapper::pad(c_global_layout, shape(c_tile_layout));
|
||||
```
|
||||
|
||||
We pad layouts for global tensors in case M, N, and K are not divisible by `MPerBlock`, `NPerBlock`, or
|
||||
`KPerBlock`.
|
||||
|
||||
Step 2: Create tensors for global and LDS memory.
|
||||
|
||||
```cpp
|
||||
// Make tensors for global memory.
|
||||
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_a), a_global_layout_padded);
|
||||
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_b), b_global_layout_padded);
|
||||
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<DataType*>(p_c), c_global_layout_padded);
|
||||
|
||||
// Allocate LDS memory.
|
||||
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
|
||||
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
|
||||
|
||||
// Make tensors for lds memory.
|
||||
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_a), a_tile_layout);
|
||||
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_b), b_tile_layout);
|
||||
```
|
||||
|
||||
We must specify the access parameters for the copy and convert the block indexes to a tuple:
|
||||
|
||||
```cpp
|
||||
// Specify block index as tuple.
|
||||
const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.x),
|
||||
static_cast<ck::index_t>(blockIdx.y),
|
||||
ck::wrapper::slice());
|
||||
// Specify access parameters for copy.
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>;
|
||||
constexpr ck::index_t vector_dim = 1;
|
||||
```
|
||||
|
||||
We create a local tile (per block) and local partitions (per thread) for the global memory (`C`). We also
|
||||
define and clear an output register (`c_vgpr_reg`) for the accumulation.
|
||||
|
||||
```cpp
|
||||
auto c_global_local_tile = ck::wrapper::make_local_tile(
|
||||
c_global_tensor,
|
||||
tile_shape,
|
||||
block_idxs,
|
||||
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
|
||||
auto c_global_local_partition =
|
||||
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>(c_global_local_tile);
|
||||
// Create C vgpr to accumulate results.
|
||||
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>();
|
||||
// Clear C vgpr.
|
||||
ck::wrapper::clear(c_vgpr_reg);
|
||||
```
|
||||
|
||||
We use two specific functions for `blockwise_gemm`: `make_blockwise_gemm_xdl_c_local_partition` and
|
||||
`make_blockwise_gemm_xdl_c_vgpr`. This helps to choose the appropriate partition for the `C` output
|
||||
and define tensors with specific layouts for `blockwise_gemm`. In the following step, we use only
|
||||
generic functions for the CK wrapper.
|
||||
|
||||
Step 3: Create the compute loop.
|
||||
|
||||
```cpp
|
||||
const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
|
||||
ck::index_t i = 0;
|
||||
do
|
||||
{
|
||||
// Get KPerBlock slice.
|
||||
const auto k_slice = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
|
||||
auto a_global_tensor_k_slice = a_global_tensor(ck::wrapper::slice(), k_slice);
|
||||
auto b_global_tensor_k_slice = b_global_tensor(ck::wrapper::slice(), k_slice);
|
||||
// Create local tiles for A and B.
|
||||
auto a_global_local_tile = ck::wrapper::make_local_tile(
|
||||
a_global_tensor_k_slice,
|
||||
tile_shape,
|
||||
block_idxs,
|
||||
make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
|
||||
auto b_global_local_tile = ck::wrapper::make_local_tile(
|
||||
b_global_tensor_k_slice,
|
||||
tile_shape,
|
||||
block_idxs,
|
||||
make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
|
||||
// Copy from global to LDS.
|
||||
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_global_local_tile, a_lds_tensor, thread_layout);
|
||||
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_global_local_tile, b_lds_tensor, thread_layout);
|
||||
// Synchronize lds.
|
||||
ck::block_sync_lds();
|
||||
// Execute blockwise GEMM.
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
|
||||
++i;
|
||||
} while(i < num_loop);
|
||||
```
|
||||
|
||||
The loop iterates over `K / KPerBlock` steps. On each iteration, a local tile (tensor per block) is created for the A and B tensors and its data is copied from global memory to LDS. The `blockwise_gemm` function then performs the GEMM operation on `a_lds_tensor` and `b_lds_tensor`, accumulating the results in `c_vgpr_reg`.
|
||||
|
||||
The end result from `c_vgpr_reg` is stored in the `C` local partition (tensor per thread):
|
||||
|
||||
```cpp
|
||||
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
|
||||
```
|
||||
|
||||
If you want to dive deep into the details, you can find the entire example
|
||||
[here](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_basic_gemm.cpp).
|
||||
215
client_example/25_wrapper/wrapper_basic_gemm.cpp
Normal file
215
client_example/25_wrapper/wrapper_basic_gemm.cpp
Normal file
@@ -0,0 +1,215 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
#include "ck/wrapper/operations/gemm.hpp"
|
||||
#include "ck/wrapper/utils/kernel_utils.hpp"
|
||||
|
||||
// RAII owner of a single raw device (HIP) allocation: allocates in the
// constructor and frees in the destructor.
// NOTE(review): the implicitly-defaulted copy operations would duplicate
// p_mem_ and cause a double hipFree — confirm instances are never copied.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Allocates mem_size bytes on the device; p_mem_ stays nullptr on failure
    // (the hipMalloc status is intentionally discarded).
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // Raw (non-owning) pointer to the device buffer.
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
|
||||
|
||||
// Basic CK-wrapper GEMM kernel: C = A * B with row-major A (M x K),
// column-major-like B stored as (N x K) with stride (K, 1), and row-major C
// (M x N). Each block computes one MPerBlock x NPerBlock tile of C, staging
// A/B tiles through LDS and accumulating in VGPRs via blockwise_gemm_xdl.
//
// p_a, p_b, p_c    - raw global-memory buffers of DataType elements
// M, N, K          - GEMM dimensions
// tile_shape       - compile-time (MPerBlock, NPerBlock, KPerBlock) tuple
// thread_layout    - compile-time layout of threads within a block
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                        const void* p_b,
                                                        void* p_c,
                                                        const ck::index_t M,
                                                        const ck::index_t N,
                                                        const ck::index_t K,
                                                        const BlockShape tile_shape,
                                                        const ThreadLayout thread_layout)
{
    constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
    constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
    constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);

    // Specify layouts for global memory.
    const auto a_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
    const auto b_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
    const auto c_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
    // Specify layouts for tiles.
    constexpr auto a_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
    constexpr auto b_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
    constexpr auto c_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
    // Apply padding for global memory so M/N/K need not divide the tile sizes.
    auto a_global_layout_padded = ck::wrapper::pad(a_global_layout, shape(a_tile_layout));
    auto b_global_layout_padded = ck::wrapper::pad(b_global_layout, shape(b_tile_layout));
    auto c_global_layout_padded = ck::wrapper::pad(c_global_layout, shape(c_tile_layout));
    // Make tensors for global memory.
    auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_a), a_global_layout_padded);
    auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_b), b_global_layout_padded);
    auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(p_c), c_global_layout_padded);
    // Allocate lds memory (one staging tile each for A and B).
    __shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
    __shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
    // Make tensors for lds memory.
    auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_a), a_tile_layout);
    auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_b), b_tile_layout);
    // Specify block index as tuple; the K dimension is sliced per iteration.
    const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.x),
                                           static_cast<ck::index_t>(blockIdx.y),
                                           ck::wrapper::slice());
    // Specify access parameters for copy.
    using DimAccessOrder             = ck::Tuple<ck::Number<0>, ck::Number<1>>;
    constexpr ck::index_t vector_dim = 1;
    // Create tile and partition for C. Use specific function for blockwise_gemm to assign the
    // appropriate partitions.
    auto c_global_local_tile = ck::wrapper::make_local_tile(
        c_global_tensor,
        tile_shape,
        block_idxs,
        make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
    auto c_global_local_partition =
        ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
                                                               decltype(a_tile_layout),
                                                               decltype(b_tile_layout),
                                                               ck::wrapper::size(thread_layout),
                                                               GemmTraits>(c_global_local_tile);
    // Create C vgpr to accumulate results.
    auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
                                                                  decltype(a_tile_layout),
                                                                  decltype(b_tile_layout),
                                                                  ck::wrapper::size(thread_layout),
                                                                  GemmTraits>();
    // Clear C vgpr.
    ck::wrapper::clear(c_vgpr_reg);

    // Iterate over K with KPerBlock step.
    const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
    ck::index_t i              = 0;
    do
    {
        // Get KPerBlock slice.
        const auto k_slice           = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
        auto a_global_tensor_k_slice = a_global_tensor(ck::wrapper::slice(), k_slice);
        auto b_global_tensor_k_slice = b_global_tensor(ck::wrapper::slice(), k_slice);
        // Create local tiles for A and B.
        auto a_global_local_tile = ck::wrapper::make_local_tile(
            a_global_tensor_k_slice,
            tile_shape,
            block_idxs,
            make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
        auto b_global_local_tile = ck::wrapper::make_local_tile(
            b_global_tensor_k_slice,
            tile_shape,
            block_idxs,
            make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
        // Copy from global to lds.
        // NOTE(review): there is a sync before the gemm but none after it;
        // presumably blockwise_gemm_xdl drains LDS reads before the next
        // iteration's blockwise_copy overwrites lds_a/lds_b — confirm.
        ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
            a_global_local_tile, a_lds_tensor, thread_layout);
        ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
            b_global_local_tile, b_lds_tensor, thread_layout);
        // Synchronize lds.
        ck::block_sync_lds();
        // Execute blockwise gemm.
        ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
            a_lds_tensor, b_lds_tensor, c_vgpr_reg);

        ++i;
    } while(i < num_loop);
    // Copy vgpr results to C global memory.
    ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}
|
||||
|
||||
// Host-side harness: allocates device buffers for A (M x K), B (K x N) and
// C (M x N), launches the DeviceGemm kernel on a grid covering the padded
// problem, and prints the measured time, TFlops and effective bandwidth.
// Note: the buffers are left uninitialized — this harness measures
// performance only and does not verify numerical results.
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Global memory buffers
    SimpleDeviceMem a_mem(M * K * sizeof(DataType));
    SimpleDeviceMem b_mem(K * N * sizeof(DataType));
    SimpleDeviceMem c_mem(M * N * sizeof(DataType));

    // One block per output tile, rounded up to cover padded edges.
    const ck::index_t grid_size_x =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));

    const auto kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout>;
    // StreamConfig{nullptr, true} enables timing on the default stream.
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  a_mem.GetDeviceBuffer(),
                                                  b_mem.GetDeviceBuffer(),
                                                  c_mem.GetDeviceBuffer(),
                                                  M,
                                                  N,
                                                  K,
                                                  tile_shape,
                                                  thread_layout);

    // 2*M*N*K flops (multiply + add per MAC); bytes = read A + read B + write C.
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;

    // avg_time is in ms, so GFlop/ms == TFlops and MB/ms == GB/s.
    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << std::endl;
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using DataType = ck::half_t;
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
|
||||
ck::make_tuple(ck::Number<4>{}, ck::Number<1>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8>(
|
||||
3840, 4096, 4096, tile_shape, thread_layout);
|
||||
return 0;
|
||||
}
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
#include "ck/wrapper/utils/kernel_utils.hpp"
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
using DataType = float;
|
||||
@@ -36,21 +37,20 @@ struct SimpleDeviceMem
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
// Test copy from Global to Global through LDS and VGPR
|
||||
template <typename InputTensor,
|
||||
typename OutputTensor,
|
||||
typename BlockShape,
|
||||
typename ThreadLayoutShape>
|
||||
__global__ void DeviceImageToColumnPad0(InputTensor input_tensor,
|
||||
OutputTensor output_tensor,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayoutShape thread_layout)
|
||||
template <typename InputTensor, typename OutputTensor, typename BlockShape, typename ThreadLayout>
|
||||
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__
|
||||
DeviceImageToColumnPad0(InputTensor input_tensor,
|
||||
OutputTensor output_tensor,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayout thread_layout)
|
||||
{
|
||||
const ck::index_t block_idx = static_cast<ck::index_t>(blockIdx.x);
|
||||
// grid layout (dim1, dim0)
|
||||
const auto block_idxs =
|
||||
ck::make_tuple(static_cast<ck::index_t>(blockIdx.y), static_cast<ck::index_t>(blockIdx.x));
|
||||
|
||||
// Get local tiles for global memory
|
||||
auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
|
||||
auto output_local_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
|
||||
auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs);
|
||||
auto output_local_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs);
|
||||
|
||||
// Get partition per thread
|
||||
const auto input_local_partition =
|
||||
@@ -112,9 +112,11 @@ void PerformImageToColumnPad0(const ck::index_t G,
|
||||
SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType));
|
||||
|
||||
// User can choose appropriate number of threads and sizes per block
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{});
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}, ck::Number<16>{}),
|
||||
ck::make_tuple(ck::Number<16>{}, ck::Number<1>{}));
|
||||
// This example doesn't support padding, user should select tile sizes
|
||||
// which divides the shape completely
|
||||
// which are divisible by the shape.
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{});
|
||||
|
||||
// Create buffers for global memory
|
||||
@@ -123,10 +125,11 @@ void PerformImageToColumnPad0(const ck::index_t G,
|
||||
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<DataType*>(out_buf.GetDeviceBuffer()), out_layout);
|
||||
|
||||
const ck::index_t grid_size = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
|
||||
ck::wrapper::size<0>(tile_shape)) *
|
||||
ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
|
||||
ck::wrapper::size<1>(tile_shape));
|
||||
// grid layout (dim1, dim0)
|
||||
const ck::index_t grid_size_x = ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
|
||||
ck::wrapper::size<1>(tile_shape));
|
||||
const ck::index_t grid_size_y = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
|
||||
ck::wrapper::size<0>(tile_shape));
|
||||
|
||||
const auto kernel = DeviceImageToColumnPad0<decltype(input_tensor_global),
|
||||
decltype(output_tensor_global),
|
||||
@@ -134,7 +137,7 @@ void PerformImageToColumnPad0(const ck::index_t G,
|
||||
decltype(thread_layout)>;
|
||||
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(grid_size_x, grid_size_y, 1),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
input_tensor_global,
|
||||
@@ -178,3 +181,4 @@ int main(int argc, char* argv[])
|
||||
{1, 1, 1} /*filter_dilations*/);
|
||||
return 0;
|
||||
}
|
||||
// MI100 Perf: 0.255178 ms, 1698.9 GB/s,
|
||||
|
||||
307
client_example/25_wrapper/wrapper_optimized_gemm.cpp
Normal file
307
client_example/25_wrapper/wrapper_optimized_gemm.cpp
Normal file
@@ -0,0 +1,307 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
#include "ck/wrapper/operations/gemm.hpp"
|
||||
#include "ck/wrapper/utils/kernel_utils.hpp"
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
template <bool DoPad, typename Layout, typename PaddingDims>
|
||||
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
|
||||
{
|
||||
if constexpr(DoPad)
|
||||
{
|
||||
return ck::wrapper::pad(layout, padding_dims);
|
||||
}
|
||||
else
|
||||
{
|
||||
return layout;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout,
|
||||
bool DoPadding>
|
||||
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
|
||||
const void* p_b,
|
||||
void* p_c,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayout thread_layout)
|
||||
{
|
||||
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
|
||||
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
|
||||
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
|
||||
constexpr auto K1 = GemmTraits::K1;
|
||||
constexpr auto K0PerBlock = KPerBlock / K1;
|
||||
const auto K0 = ck::math::integer_divide_ceil(K, K1);
|
||||
|
||||
const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);
|
||||
// Create layouts for global memory
|
||||
const auto a_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
|
||||
const auto b_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
|
||||
const auto c_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
|
||||
// Apply padding
|
||||
auto a_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
|
||||
auto b_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
|
||||
auto c_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));
|
||||
// Reshape from M,K to K0,M,K1
|
||||
const auto reshaped_dims_idxs =
|
||||
ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
|
||||
auto a_padded_unmerged_global_layout =
|
||||
ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
|
||||
auto b_padded_unmerged_global_layout =
|
||||
ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
|
||||
// Create tensors for global memory
|
||||
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
|
||||
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
|
||||
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<DataType*>(p_c), c_padded_global_layout);
|
||||
// Create layouts and tensors for lds memory.
|
||||
constexpr auto a_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(K0PerBlock, MPerBlock, K1),
|
||||
ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
|
||||
constexpr auto b_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(K0PerBlock, NPerBlock, K1),
|
||||
ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
|
||||
|
||||
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + K0PerBlock];
|
||||
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + K0PerBlock];
|
||||
|
||||
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_a), a_tile_layout);
|
||||
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_b), b_tile_layout);
|
||||
|
||||
const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
|
||||
static_cast<ck::index_t>(blockIdx.x),
|
||||
static_cast<ck::index_t>(blockIdx.y),
|
||||
ck::wrapper::slice());
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
|
||||
constexpr ck::index_t vector_dim = 2;
|
||||
|
||||
// Create tile and partition for C global memory. Use specific gemm
|
||||
// functions to get appropriate layouts.
|
||||
auto c_global_local_tile =
|
||||
ck::wrapper::make_local_tile(c_global_tensor,
|
||||
tile_shape_k0_m_n_k1,
|
||||
block_idxs,
|
||||
make_tuple(ck::wrapper::slice(K0PerBlock),
|
||||
ck::Number<1>{},
|
||||
ck::Number<1>{},
|
||||
ck::wrapper::slice(K1)));
|
||||
auto c_global_local_partition =
|
||||
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>(c_global_local_tile);
|
||||
// Define and clear c vgpr register
|
||||
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>();
|
||||
ck::wrapper::clear(c_vgpr_reg);
|
||||
// Local partitions for lds memory
|
||||
auto a_lds_tensor_local_partition =
|
||||
ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
|
||||
auto b_lds_tensor_local_partition =
|
||||
ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);
|
||||
// Lamda to slice tensor, then create local tile and partition
|
||||
auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
|
||||
const auto k_slice =
|
||||
ck::make_tuple(ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
|
||||
ck::wrapper::slice(),
|
||||
ck::wrapper::slice());
|
||||
auto local_tile = ck::wrapper::make_local_tile(
|
||||
tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
|
||||
return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
|
||||
};
|
||||
|
||||
auto a_global_local_partition = make_global_partition(
|
||||
a_global_tensor,
|
||||
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
|
||||
0);
|
||||
auto b_global_local_partition = make_global_partition(
|
||||
b_global_tensor,
|
||||
make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
|
||||
0);
|
||||
|
||||
// (row-major vgpr layout)
|
||||
auto a_vgpr_tensor =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
|
||||
ck::wrapper::make_layout(
|
||||
shape(a_global_local_partition),
|
||||
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::Number<1>{})));
|
||||
auto b_vgpr_tensor =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
|
||||
ck::wrapper::make_layout(
|
||||
shape(b_global_local_partition),
|
||||
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::Number<1>{})));
|
||||
// Copy first values to lds
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
|
||||
a_vgpr_tensor);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
|
||||
b_vgpr_tensor);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
|
||||
a_lds_tensor_local_partition);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
|
||||
b_lds_tensor_local_partition);
|
||||
// Pipeline loop
|
||||
const ck::index_t num_loop =
|
||||
__builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
|
||||
// Skip if only tile should be processed
|
||||
if(num_loop > 1)
|
||||
{
|
||||
ck::index_t i = 0;
|
||||
do
|
||||
{
|
||||
auto a_global_local_partition_i = make_global_partition(
|
||||
a_global_tensor,
|
||||
make_tuple(
|
||||
ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
|
||||
i + 1);
|
||||
auto b_global_local_partition_i = make_global_partition(
|
||||
b_global_tensor,
|
||||
make_tuple(
|
||||
ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
|
||||
i + 1);
|
||||
// Copy data to A vgpr.
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_global_local_partition_i, a_vgpr_tensor);
|
||||
// Synchronize.
|
||||
ck::block_sync_lds();
|
||||
// Copy data to B vgpr.
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_global_local_partition_i, b_vgpr_tensor);
|
||||
// Perform gemm.
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
// Synchronize
|
||||
ck::block_sync_lds();
|
||||
// Copy data to A and B lds tiles.
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_vgpr_tensor, a_lds_tensor_local_partition);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_vgpr_tensor, b_lds_tensor_local_partition);
|
||||
|
||||
++i;
|
||||
} while(i < (num_loop - 1));
|
||||
}
|
||||
// Handle tail.
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
// Store data from C vgpr to C global memory.
|
||||
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
bool DoPadding,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout>
|
||||
void PerformGemm(const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape& tile_shape,
|
||||
const ThreadLayout& thread_layout)
|
||||
{
|
||||
// Global memory buffers
|
||||
SimpleDeviceMem a_mem(M * K * sizeof(DataType));
|
||||
SimpleDeviceMem b_mem(K * N * sizeof(DataType));
|
||||
SimpleDeviceMem c_mem(M * N * sizeof(DataType));
|
||||
|
||||
const ck::index_t grid_size_x =
|
||||
ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
|
||||
const ck::index_t grid_size_y =
|
||||
ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
|
||||
|
||||
const auto kernel =
|
||||
DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
|
||||
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
|
||||
kernel,
|
||||
dim3(grid_size_x, grid_size_y, 1),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
a_mem.GetDeviceBuffer(),
|
||||
b_mem.GetDeviceBuffer(),
|
||||
c_mem.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
tile_shape,
|
||||
thread_layout);
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using DataType = ck::half_t;
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
|
||||
ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8, false>(
|
||||
3840, 4096, 4096, tile_shape, thread_layout);
|
||||
return 0;
|
||||
}
|
||||
238
cmake/Embed.cmake
Normal file
238
cmake/Embed.cmake
Normal file
@@ -0,0 +1,238 @@
|
||||
#####################################################################################
|
||||
# The MIT License (MIT)
|
||||
#
|
||||
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#####################################################################################
|
||||
|
||||
if(WIN32)
|
||||
set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
|
||||
set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
|
||||
else()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set(EMBED_USE LD CACHE STRING "Use LD or CArrays to embed data files")
|
||||
else()
|
||||
set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
|
||||
endif()
|
||||
set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
|
||||
endif()
|
||||
|
||||
if(EMBED_USE STREQUAL "LD")
|
||||
find_program(EMBED_LD ld REQUIRED)
|
||||
find_program(EMBED_OBJCOPY objcopy REQUIRED)
|
||||
endif()
|
||||
|
||||
function(embed_wrap_string)
|
||||
set(options)
|
||||
set(oneValueArgs VARIABLE AT_COLUMN)
|
||||
set(multiValueArgs)
|
||||
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
|
||||
string(LENGTH ${${PARSE_VARIABLE}} string_length)
|
||||
math(EXPR offset "0")
|
||||
|
||||
while(string_length GREATER 0)
|
||||
|
||||
if(string_length GREATER ${PARSE_AT_COLUMN})
|
||||
math(EXPR length "${PARSE_AT_COLUMN}")
|
||||
else()
|
||||
math(EXPR length "${string_length}")
|
||||
endif()
|
||||
|
||||
string(SUBSTRING ${${PARSE_VARIABLE}} ${offset} ${length} line)
|
||||
set(lines "${lines}\n${line}")
|
||||
|
||||
math(EXPR string_length "${string_length} - ${length}")
|
||||
math(EXPR offset "${offset} + ${length}")
|
||||
endwhile()
|
||||
|
||||
set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SYMBOLS FILES)
|
||||
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
|
||||
set(RESOURCE_ID 100)
|
||||
|
||||
list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
|
||||
list(LENGTH PARSE_FILES FILES_LEN)
|
||||
if(NOT ${SYMBOLS_LEN} EQUAL ${FILES_LEN})
|
||||
message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${FILES_LEN}")
|
||||
endif()
|
||||
math(EXPR LEN "${SYMBOLS_LEN} - 1")
|
||||
|
||||
foreach(idx RANGE ${LEN})
|
||||
list(GET PARSE_SYMBOLS ${idx} SYMBOL)
|
||||
list(GET PARSE_FILES ${idx} FILE)
|
||||
file(RELATIVE_PATH BASE_NAME "${BASE_DIRECTORY}" ${FILE})
|
||||
if(EMBED_USE STREQUAL "RC")
|
||||
string(TOUPPER "${SYMBOL}" SYMBOL)
|
||||
string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
|
||||
file(TO_NATIVE_PATH "${FILE}" NATIVE_FILE)
|
||||
string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
|
||||
string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
|
||||
string(APPEND INIT_KERNELS "\n {\"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
|
||||
math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
|
||||
else()
|
||||
set(START_SYMBOL "_binary_${SYMBOL}_start")
|
||||
set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
|
||||
if(EMBED_USE STREQUAL "LD")
|
||||
string(APPEND EXTERNS "
|
||||
extern const char ${START_SYMBOL}[];
|
||||
extern const size_t _binary_${SYMBOL}_size;
|
||||
const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
|
||||
")
|
||||
else()
|
||||
string(APPEND EXTERNS "
|
||||
extern const char ${START_SYMBOL}[];
|
||||
extern const size_t ${LENGTH_SYMBOL};
|
||||
")
|
||||
endif()
|
||||
string(APPEND INIT_KERNELS "
|
||||
{ \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
|
||||
endif()
|
||||
endforeach()
|
||||
if(EMBED_USE STREQUAL "RC")
|
||||
file(WRITE "${EMBED_DIR}/include/resource.h" "
|
||||
#define TEXTFILE 256
|
||||
|
||||
${FILE_IDS}
|
||||
")
|
||||
file(WRITE "${EMBED_DIR}/resource.rc" "
|
||||
#include \"resource.h\"
|
||||
|
||||
${RC_FILE_MAPPING}
|
||||
")
|
||||
set(EXTERNS "
|
||||
#include <Windows.h>
|
||||
#include \"resource.h\"
|
||||
|
||||
namespace resource {
|
||||
std::string_view read(int id)
|
||||
{
|
||||
HMODULE handle = GetModuleHandle(nullptr);
|
||||
HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
|
||||
HGLOBAL data = LoadResource(handle, rc);
|
||||
return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
|
||||
}
|
||||
}
|
||||
")
|
||||
set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
|
||||
endif()
|
||||
file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
|
||||
")
|
||||
|
||||
file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
|
||||
#include <${EMBED_NAME}.hpp>
|
||||
${EXTERNS}
|
||||
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
|
||||
{
|
||||
static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}
|
||||
};
|
||||
return result;
|
||||
}
|
||||
")
|
||||
list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
|
||||
set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
function(embed_file FILE BASE_DIRECTORY)
|
||||
message(STATUS " ${FILE}")
|
||||
file(RELATIVE_PATH REL_FILE "${BASE_DIRECTORY}" ${FILE})
|
||||
string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
|
||||
get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
|
||||
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
|
||||
if(EMBED_USE STREQUAL "LD")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
|
||||
add_custom_command(
|
||||
OUTPUT "${OUTPUT_FILE}"
|
||||
COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
|
||||
COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
|
||||
WORKING_DIRECTORY "${BASE_DIRECTORY}"
|
||||
DEPENDS "${FILE}"
|
||||
VERBATIM)
|
||||
set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
|
||||
elseif(EMBED_USE STREQUAL "CArrays")
|
||||
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
|
||||
# reads source file contents as hex string
|
||||
file(READ ${FILE} HEX_STRING HEX)
|
||||
# wraps the hex string into multiple lines
|
||||
embed_wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
|
||||
# adds '0x' prefix and comma suffix before and after every byte respectively
|
||||
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
|
||||
# removes trailing comma
|
||||
string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
|
||||
file(WRITE "${OUTPUT_FILE}" "
|
||||
#include <cstddef>
|
||||
extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
|
||||
extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
|
||||
")
|
||||
set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
|
||||
endif()
|
||||
set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
function(add_embed_library EMBED_NAME)
|
||||
set(options)
|
||||
set(oneValueArgs RELATIVE)
|
||||
set(multiValueArgs)
|
||||
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
|
||||
set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
|
||||
file(MAKE_DIRECTORY ${EMBED_DIR})
|
||||
message(STATUS "Embedding kernel files:")
|
||||
foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
|
||||
embed_file(${FILE} ${PARSE_RELATIVE})
|
||||
list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
|
||||
list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
|
||||
endforeach()
|
||||
message(STATUS "Generating embedding library '${EMBED_NAME}'")
|
||||
generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}" SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
|
||||
set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
|
||||
if(EMBED_USE STREQUAL "LD")
|
||||
add_library(${INTERNAL_EMBED_LIB} STATIC ${EMBED_FILES} ${OUTPUT_FILES})
|
||||
else()
|
||||
add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
|
||||
endif()
|
||||
if(EMBED_USE STREQUAL "CArrays")
|
||||
target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
|
||||
endif()
|
||||
target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
|
||||
target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
|
||||
set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
|
||||
add_library(${EMBED_NAME} INTERFACE)
|
||||
if(EMBED_USE STREQUAL "RC")
|
||||
target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
|
||||
elseif(EMBED_USE STREQUAL "LD")
|
||||
target_link_libraries(${EMBED_NAME} INTERFACE ${INTERNAL_EMBED_LIB})
|
||||
else()
|
||||
target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
|
||||
endif()
|
||||
target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
|
||||
endfunction()
|
||||
|
||||
49
codegen/CMakeLists.txt
Normal file
49
codegen/CMakeLists.txt
Normal file
@@ -0,0 +1,49 @@
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
project(composable_kernel_host)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
|
||||
find_package(ROCM)
|
||||
include(ROCMInstallTargets)
|
||||
include(ROCMTest)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
|
||||
include(Embed)
|
||||
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
|
||||
${CK_ROOT}/include/ck/*.hpp)
|
||||
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
|
||||
message(STATUS "RELATIVE: ${CK_ROOT}/include")
|
||||
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
|
||||
|
||||
add_definitions(-std=c++17)
|
||||
|
||||
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
|
||||
# TODO: Use object library
|
||||
add_library(ck_host STATIC ${SOURCES})
|
||||
target_link_libraries(ck_host PRIVATE ck_headers)
|
||||
|
||||
set_target_properties(ck_host PROPERTIES
|
||||
LINKER_LANGUAGE CXX
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
target_include_directories(ck_host PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
)
|
||||
|
||||
add_executable(ck-template-driver driver/main.cpp)
|
||||
target_link_libraries(ck-template-driver ck_host)
|
||||
|
||||
rocm_install(
|
||||
TARGETS ck_host ck_headers
|
||||
EXPORT ck_hostTargets
|
||||
)
|
||||
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
if(BUILD_TESTING)
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
71
codegen/driver/main.cpp
Normal file
71
codegen/driver/main.cpp
Normal file
@@ -0,0 +1,71 @@
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "ck/host/device_gemm_multiple_d/operation.hpp"
|
||||
#include "ck/host/stringutils.hpp"
|
||||
|
||||
using ck::host::Transform;
|
||||
|
||||
struct Emitters
|
||||
{
|
||||
std::unordered_map<std::string, std::function<std::vector<std::string>()>> m;
|
||||
|
||||
template <class T>
|
||||
void Register(const std::string& name)
|
||||
{
|
||||
m[name] = [] {
|
||||
auto configs = T::CreateOperations();
|
||||
|
||||
return Transform(configs, [](const auto& ops) { return ToTuple(ops); });
|
||||
};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static std::string ToTuple(const T& ops)
|
||||
{
|
||||
auto templates = Transform(
|
||||
ops, [](const auto& op) { return " " + op.ToSolution().ToTemplateString(); });
|
||||
return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">";
|
||||
}
|
||||
|
||||
std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); }
|
||||
|
||||
std::vector<std::string> List() const
|
||||
{
|
||||
return Transform(m, [](auto&& p) { return p.first; });
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
std::string prog = argv[0];
|
||||
std::vector<std::string> args(argv + 1, argv + argc);
|
||||
Emitters e;
|
||||
e.Register<ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle>(
|
||||
"DeviceGemmMultipleD_Xdl_CShuffle");
|
||||
|
||||
if(args.empty() or std::any_of(args.begin(), args.end(), [](auto arg) {
|
||||
return arg == "-h" or arg == "--help";
|
||||
}))
|
||||
{
|
||||
std::cout << "USAGE:" << std::endl;
|
||||
std::cout << " " << prog << " [TEMPLATE]" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "FLAGS:" << std::endl;
|
||||
std::cout << " -h, --help Show help" << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "TEMPLATES:" << std::endl;
|
||||
for(auto x : e.List())
|
||||
std::cout << " " << x << std::endl;
|
||||
std::cout << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for(auto name : args)
|
||||
std::cout << e.Emit(name) << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
42
codegen/include/ck/host/device_gemm_multiple_d.hpp
Normal file
42
codegen/include/ck/host/device_gemm_multiple_d.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include "ck/host/types.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
struct Problem
|
||||
{
|
||||
std::size_t M = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t K = 0;
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransE = false;
|
||||
std::vector<bool> DsTrans = {};
|
||||
DataType ADataType = DataType::Half;
|
||||
DataType BDataType = DataType::Half;
|
||||
DataType EDataType = DataType::Half;
|
||||
std::vector<DataType> DsDataType = {};
|
||||
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
|
||||
std::string CDEElementOp = "ck::Tuple<>";
|
||||
|
||||
std::string GetIncludeHeader() const;
|
||||
|
||||
std::vector<Solution> GetSolutions(const std::string& arch) const;
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
42
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
Normal file
42
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
#include "ck/host/operation/gemm.hpp"
|
||||
#include "ck/host/device_gemm_multiple_d/problem.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
struct Operation_Xdl_CShuffle
|
||||
{
|
||||
static std::vector<std::vector<Operation_Xdl_CShuffle>> CreateOperations();
|
||||
static std::vector<Operation_Xdl_CShuffle> CreateOperations(const Problem& prob);
|
||||
TensorDesc A{};
|
||||
TensorDesc B{};
|
||||
DataType acc = DataType::Float;
|
||||
DataType cs_type = DataType::Half;
|
||||
std::vector<TensorDesc> Ds = {};
|
||||
TensorDesc E{};
|
||||
std::string a_elem_op = PassThrough;
|
||||
std::string b_elem_op = PassThrough;
|
||||
std::string cde_elem_op = Bilinear;
|
||||
std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default";
|
||||
operation::TileDesc tile_desc{};
|
||||
operation::BlockTransferDesc a_block_transfer{};
|
||||
operation::BlockTransferDesc b_block_transfer{};
|
||||
operation::CShuffleDesc cshuffle{};
|
||||
operation::CBlockTransferDesc c_block_transfer{};
|
||||
|
||||
Solution ToSolution() const;
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
39
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
Normal file
39
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
Normal file
@@ -0,0 +1,39 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "ck/host/types.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
namespace device_gemm_multiple_d {
|
||||
|
||||
struct Problem
|
||||
{
|
||||
std::size_t M = 0;
|
||||
std::size_t N = 0;
|
||||
std::size_t K = 0;
|
||||
bool TransA = false;
|
||||
bool TransB = false;
|
||||
bool TransE = false;
|
||||
std::vector<bool> DsTrans = {};
|
||||
DataType ADataType = DataType::Half;
|
||||
DataType BDataType = DataType::Half;
|
||||
DataType EDataType = DataType::Half;
|
||||
std::vector<DataType> DsDataType = {};
|
||||
std::string AElementOp = PassThrough;
|
||||
std::string BElementOp = PassThrough;
|
||||
std::string CDEElementOp = PassThrough;
|
||||
|
||||
std::string GetIncludeHeader() const;
|
||||
|
||||
std::vector<Solution> GetSolutions(const std::string& arch) const;
|
||||
};
|
||||
|
||||
} // namespace device_gemm_multiple_d
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
18
codegen/include/ck/host/headers.hpp
Normal file
18
codegen/include/ck/host/headers.hpp
Normal file
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace ck {
|
||||
namespace host {
|
||||
|
||||
std::unordered_map<std::string_view, std::string_view> GetHeaders();
|
||||
|
||||
} // namespace host
|
||||
} // namespace ck
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user