commit 1eb2e57380bf2a80dde88eb5bc9b7895e2d79f68
Author: Joseph Macaranas <145489236+amd-jmacaran@users.noreply.github.com>
Date: Wed Apr 30 13:46:39 2025 -0400
Reorganize project folders (#6)
diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
new file mode 100644
index 0000000000..b37b8cc27f
--- /dev/null
+++ b/.azuredevops/rocm-ci.yml
@@ -0,0 +1,30 @@
+resources:
+ repositories:
+ - repository: pipelines_repo
+ type: github
+ endpoint: ROCm
+ name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger:
+ batch: true
+ branches:
+ include:
+ - develop
+ - amd-develop
+ paths:
+ exclude:
+ - .github
+ - docs
+ - '.*.y*ml'
+ - '*.md'
+ - Jenkinsfile
+ - LICENSE
+
+pr: none
+
+jobs:
+ - template: ${{ variables.CI_COMPONENT_PATH }}/composable_kernel.yml@pipelines_repo
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..22f2674966
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,90 @@
+---
+Language: Cpp
+AccessModifierOffset: 0
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+ AfterClass: true
+ AfterControlStatement: true
+ AfterEnum: true
+ AfterFunction: true
+ AfterNamespace: false
+ AfterObjCDeclaration: true
+ AfterStruct: true
+ AfterUnion: true
+ BeforeCatch: true
+ BeforeElse: true
+ IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories:
+ - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+ Priority: 2
+ - Regex: '^(<|"(gtest|isl|json)/)'
+ Priority: 3
+ - Regex: '.*'
+ Priority: 1
+IndentCaseLabels: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: false
+SpaceAfterCStyleCast: false
+# SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: Never
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
+...
+
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000000..3815c654fe
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,3 @@
+CheckOptions:
+ - key: bugprone-reserved-identifier.AllowedIdentifiers
+ value: '__HIP_PLATFORM_HCC__;__HIP_PLATFORM_AMD__;__HIP_ROCclr__'
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000000..ccdfb0f6fb
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,8 @@
+* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli
+# Documentation files
+docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
+# Header directory for Doxygen documentation
+library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000000..56f2acee71
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,10 @@
+We'd love for you to contribute to our source code!
+
+Some helpful links:
+
+- [Code of Conduct guidelines](https://www.contributor-covenant.org/version/2/1/code_of_conduct/code_of_conduct.txt)
+- [New issue guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/ISSUE_TEMPLATE.md)
+- [Submitting a pull request guidelines](https://github.com/rocm/composable_kernel/blob/develop/.github/PULL_REQUEST_TEMPLATE.md)
+- [Maintainers](https://github.com/rocm/composable_kernel/blob/develop/CONTRIBUTORS.md)
+- [General information](https://github.com/rocm/composable_kernel/blob/develop/README.md)
+- [ROCm documentation](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.html)
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000000..263cc3480d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,14 @@
+When creating an issue, please check if a similar issue already exists.
+
+### When reporting a bug, please include:
+- [ ] A descriptive title
+- [ ] An isolated way to reproduce the behavior (preferably a docker container with a repro)
+- [ ] ROCm version, clang version, Composable Kernel commit pin
+- [ ] Environment variables
+- [ ] The behavior you expect to see, and the behavior you actually see
+
+### When requesting a feature, please include:
+- [ ] A descriptive title
+- [ ] A detailed description of the problem you are trying to solve
+- [ ] An overview of the suggested solution
+- [ ] Explanation why the solution is an improvement
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000..0086358db1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: true
diff --git a/.github/ISSUE_TEMPLATE/issue_report.yml b/.github/ISSUE_TEMPLATE/issue_report.yml
new file mode 100644
index 0000000000..ef6e6faa1b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/issue_report.yml
@@ -0,0 +1,221 @@
+name: Issue Report
+description: File a report for ROCm related issues on Linux and Windows. For issues pertaining to documentation or non-bug related, please open a blank issue located below.
+title: "[Issue]: "
+
+body:
+- type: markdown
+ attributes:
+ value: |
+ Thank you for taking the time to fill out this report!
+
+ You can acquire your OS, CPU, GPU (for filling out this report) with the following commands:
+
+ Linux:
+ echo "OS:" && cat /etc/os-release | grep -E "^(NAME=|VERSION=)";
+ echo "CPU: " && cat /proc/cpuinfo | grep "model name" | sort --unique;
+ echo "GPU:" && /opt/rocm/bin/rocminfo | grep -E "^\s*(Name|Marketing Name)";
+
+ Windows:
+ (Get-WmiObject Win32_OperatingSystem).Version
+ (Get-WmiObject win32_Processor).Name
+ (Get-WmiObject win32_VideoController).Name
+- type: textarea
+ attributes:
+ label: Problem Description
+ description: Describe the issue you encountered.
+ validations:
+ required: true
+- type: input
+ attributes:
+ label: Operating System
+ description: What is the name and version number of the OS?
+ placeholder: "e.g. Ubuntu 22.04.3 LTS (Jammy Jellyfish)"
+ validations:
+ required: true
+- type: input
+ attributes:
+ label: CPU
+ description: What CPU did you encounter the issue on?
+ placeholder: "e.g. AMD Ryzen 9 5900HX with Radeon Graphics"
+ validations:
+ required: true
+- type: dropdown
+ attributes:
+ label: GPU
+ description: What GPU(s) did you encounter the issue on (you can select multiple GPUs from the list)
+ multiple: true
+ options:
+ - AMD Instinct MI300X
+ - AMD Instinct MI300A
+ - AMD Instinct MI300
+ - AMD Instinct MI250X
+ - AMD Instinct MI250
+ - AMD Instinct MI210
+ - AMD Instinct MI100
+ - AMD Instinct MI50
+ - AMD Instinct MI25
+ - AMD Radeon Pro V620
+ - AMD Radeon Pro VII
+ - AMD Radeon RX 7900 XTX
+ - AMD Radeon VII
+ - AMD Radeon Pro W7900
+ - AMD Radeon Pro W7800
+ - AMD Radeon Pro W6800
+ - AMD Radeon Pro W6600
+ - AMD Radeon Pro W5500
+ - AMD Radeon RX 7900 XT
+ - AMD Radeon RX 7600
+ - AMD Radeon RX 6950 XT
+ - AMD Radeon RX 6900 XT
+ - AMD Radeon RX 6800 XT
+ - AMD Radeon RX 6800
+ - AMD Radeon RX 6750
+ - AMD Radeon RX 6700 XT
+ - AMD Radeon RX 6700
+ - AMD Radeon RX 6650 XT
+ - AMD Radeon RX 6600 XT
+ - AMD Radeon RX 6600
+ - Other
+ validations:
+ required: true
+- type: input
+ attributes:
+ label: Other
+ description: If you selected Other, please specify
+- type: dropdown
+ attributes:
+ label: ROCm Version
+ description: What version(s) of ROCm did you encounter the issue on?
+ multiple: true
+ options:
+ - ROCm 6.0.0
+ - ROCm 5.7.1
+ - ROCm 5.7.0
+ - ROCm 5.6.1
+ - ROCm 5.6.0
+ - ROCm 5.5.1
+ - ROCm 5.5.0
+ validations:
+ required: true
+- type: dropdown
+ attributes:
+ label: ROCm Component
+ description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here.
+ multiple: true
+ options:
+ - Other
+ - AMD Common Language Runtime
+ - AMD MIGraphX
+ - AMD System Management Interface
+ - amdgpu KCL/autoconf
+ - amdgpu Kernel-mode GPU Driver
+ - amdgpu-install
+ - AOMP
+ - AOMP Extras
+ - AqlProfile
+ - build-infra
+ - chelsio
+ - clang-ocl
+ - Composable Kernel
+ - dkms
+ - docker / ROCm-docker
+ - flang
+ - gpuburn
+ - half
+ - HIP
+ - HIP Examples
+ - hipBLAS
+ - hipBLASLt
+ - HIPCC
+ - hipCUB
+ - hip-examples-private
+ - hipFFT
+ - hipfort
+ - HIPIFY
+ - hipRAND
+ - hipSOLVER
+ - hipSPARSE
+ - hipSPARSELt
+ - hipTensor
+ - hip-tests
+ - HSA Runtime
+ - infrastructure
+ - jenkins-utils
+ - libdrm
+ - Linux BPI packaging framework
+ - llvm-project
+ - Mesa
+ - meta
+ - MIOpen
+ - MIVisionX
+ - ml-framework-ci
+ - MLSEQA_TestRepo
+ - OpenCL API C++ Bindings
+ - OpenCL API Headers
+ - OpenCL Conformance Test Suite
+ - OpenCL ICD Loader
+ - perftest-p2p
+ - prototype
+ - RCCL
+ - rccl-rdma-sharp-plugins
+ - rocALUTION
+ - rocBLAS
+ - ROCdbgapi
+ - ROCdebug-agent
+ - rocFFT
+ - ROCgdb
+ - ROCK
+ - ROCm Documentation/Website
+ - ROCm Data Center Tool
+ - ROCm Examples
+ - ROCm for Windows
+ - ROCm Performance Primitives
+ - ROCm System Management Interface Library
+ - ROCm Thrust
+ - ROCm Validation Suite
+ - rocm_bandwidth_test
+ - rocm-cmake
+ - rocm-core
+ - rocm-docs-core
+ - rocminfo
+ - rocMLIR
+ - rocmtools
+ - rocPRIM
+ - rocprofiler
+ - rocRAND
+ - ROCR-Runtime
+ - rocSOLVER
+ - rocSPARSE
+ - roctracer
+ - ROCT-Thunk-Interface
+ - rocWMMA
+ - Tensile
+ - umr
+ - ibv_rc_pingpong-amd
+ - mellanox
+ - mpitest
+ - Pytorch
+ - Tensorflow
+ - APEX
+ - torchvision
+ - Magma
+- type: textarea
+ attributes:
+ label: Steps to Reproduce
+ description: (Optional) Detailed steps to reproduce the issue.
+ validations:
+ required: false
+
+- type: textarea
+ attributes:
+ label: (Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
+ description: The output of rocminfo --support could help to better address the problem.
+ validations:
+ required: false
+
+- type: textarea
+ attributes:
+ label: Additional Information
+ description: (Optional) Any additional information that is relevant, e.g. relevant environment variables, dockerfiles, log files, dmesg output (on Linux), etc.
+ validations:
+ required: false
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000000..8a988ad1c9
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,20 @@
+## Proposed changes
+
+Please describe the motivation behind the pull request, whether it enables a new feature or fixes a bug. If there are associated pull requests or issues, please link them to the pull request.
+
+## Checklist
+
+Please put an `x` into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.
+
+- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally
+- [ ] I have added the test to REGRESSION_TESTS list defined at the top of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more than 30 seconds to run.
+- [ ] I have added inline documentation which helps the maintainers understand the motivation
+- [ ] I have removed the stale documentation which is no longer relevant after this pull request
+- [ ] (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
+- [ ] I have run `clang-format` on all changed files
+- [ ] Any dependent changes have been merged
+
+## Discussion
+
+If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered
+
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000000..0e0a252eb6
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,18 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+ - package-ecosystem: "pip" # See documentation for possible values
+ directory: "/docs/sphinx" # Location of package manifests
+ open-pull-requests-limit: 10
+ schedule:
+ interval: "daily"
+ labels:
+ - "documentation"
+ - "dependencies"
+ - "ci:docs-only"
+ reviewers:
+ - "samjwu"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..599ef99e35
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,70 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+*.ipch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# vim tags
+tags
+.tags
+.*.swp
+
+# Editors
+.vscode
+
+# build-in-source directory
+build*
+
+# emacs temporary/backup files
+.\#*
+\#*\#
+*~
+
+# GDB temporary files
+.gdb_history
+install.dir*
+
+# documentation artifacts
+_build/
+_images/
+_static/
+_templates/
+_toc.yml
+_doxygen/
+docs/doxygen/html
+docs/doxygen/xml
+
+# JetBrains IDE
+.idea/
+cmake-build*/
+build*/
+
+# Python virtualenv
+.venv/
+
+# Python cache
+__pycache__/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100755
index 0000000000..d6700ae05b
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,14 @@
+repos:
+- repo: local
+ hooks:
+ - id: clang-format
+ name: clang-format
+ entry: clang-format-12 -i --style=file
+ language: system
+ types_or: [c++, inc]
+ - id: copyright-year-checker
+ name: copyright-year-checker
+ entry: script/check_copyright_year.sh
+ verbose: false
+ language: script
+ types: [c++]
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000000..b3299fa4e8
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,18 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+sphinx:
+ configuration: docs/conf.py
+
+formats: [htmlzip, pdf, epub]
+
+python:
+ install:
+ - requirements: docs/sphinx/requirements.txt
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.10"
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000000..e0ec214c69
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,123 @@
+# Changelog for Composable Kernel
+
+Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
+
+## Composable Kernel 1.1.0 for ROCm 6.5.0
+
+### Added
+
+* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
+* Added a fully asynchronous HOST (CPU) arguments copy flow for CK grouped GEMM kernels.
+* Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW; the number of instances in the instance factory for NGCHW/GKYXC/NGKHW has been reduced).
+* Added support for GKCYX layout for grouped convolution forward (NGCHW/GKCYX/NGKHW).
+* Added support for GKCYX layout for grouped convolution backward weight (NGCHW/GKCYX/NGKHW).
+* Added support for GKCYX layout for grouped convolution backward data (NGCHW/GKCYX/NGKHW).
+* Added support for Stream-K version of mixed fp8/bf16 GEMM
+* Added GEMM pipeline for microscaling (MX) data types
+* Added support for FP16 2:4 structured sparsity to universal GEMM.
+* Added support for Split K for grouped convolution backward data.
+
+### Optimized
+
+None
+
+### Fixes
+
+None
+
+### Changes
+
+* Removed support for gfx940 and gfx941 targets (#1944)
+* Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876)
+* DL and DPP kernels are now enabled by default.
+* Number of instances in instance factory for grouped convolution forward NGCHW/GKYXC/NGKHW has been reduced.
+* Number of instances in instance factory for grouped convolution backward weight NGCHW/GKYXC/NGKHW has been reduced.
+* Number of instances in instance factory for grouped convolution backward data NGCHW/GKYXC/NGKHW has been reduced.
+
+### Known issues
+
+None
+
+## Composable Kernel 1.1.0 for ROCm 6.1.0
+
+### Additions
+
+* Added generic instances for GEMM XDL operations (#1161)
+* Added gamma and beta parameters for the layernorm and groupnorm bwd operations (#1133)
+* Introduced wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126)
+* Added an option to vary the number of warm-up cycles and iterations for ckProfiler (#1124)
+
+### Optimizations
+
+* New performance optimizations for GEMM operations on MI200 and MI300 architectures (#1135)
+
+### Fixes
+
+* Reduced the build time for most GPU architectures (#1084)
+* Fixed some conversion issues for fp8 data type (#1099)
+
+### Changes
+
+None
+
+### Known issues
+
+None
+
+## Composable Kernel 1.1.0 for ROCm 6.0.0
+
+### Fixes
+
+* Fixed a hazard associated with inline v_dot (#808)
+* Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
+
+### Optimizations
+
+None
+
+### Additions
+
+* Added an image to a column kernel (#867)
+* Added a column to an image kernel (#930)
+* Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
+* Grouped convolution support for small K and C (#822 #879 #897)
+* Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
+* Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
+* Support for Batched GEMM DL (#732)
+
+### Changes
+
+* Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
+
+## Composable Kernel 0.2.0 for ROCm 5.7.0
+
+### Fixes
+
+* Fixed a bug in 6-dimensional kernels (#555)
+* Fixed a test case failure with grouped convolution backward weight (#524)
+
+### Optimizations
+
+* Improved the performance of the normalization kernel
+
+### Additions
+
+* New CMake flags:
+ * "DL_KERNELS"-* Must be set to "ON" in order to build the GEMM DL and batched_gemm_multi_d_dl instances
+ * "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
+ * "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
+* New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
+* Support for MI300A/MI300X
+* Support for AMD RDNA 3
+* New user tutorial (#563)
+* Additional instances for irregular GEMM sizes (#560)
+* New inter-wave consumer-producer programming model for GEMM kernels (#310)
+* GEMM with support multiple elementwise fusions (multi-D) (#534)
+* Multi-embeddings support (#542)
+* AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
+* AMD RDNA grouped convolution backward weight support (#505)
+* MaxPool and AvgPool forward (#815); MaxPool backward (#750)
+
+### Changes
+
+None
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000..3813d63812
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,67 @@
+cff-version: 1.2.0
+title: Composable Kernel
+message: If you use this software, please cite using the following metadata.
+type: software
+authors:
+ - given-names: Chao
+ family-names: Liu
+ email: chao.liu2@amd.com
+ affiliation: AMD
+ - given-names: Jing
+ family-names: Zhang
+ email: jing.zhang3@amd.com
+ affiliation: AMD
+ - given-names: Letao
+ family-names: Qin
+ email: letao.qin@amd.com
+ affiliation: AMD
+ - given-names: Qianfeng
+ family-names: Zhang
+ email: qianfeng.zhang@amd.com
+ affiliation: AMD
+ - given-names: Liang
+ family-names: Huang
+ email: carlus.huang@amd.com
+ affiliation: AMD
+ - given-names: Shaojie
+ family-names: Wang
+ email: shaojie.wang@amd.com
+ affiliation: AMD
+ - given-names: Anthony
+ family-names: Chang
+ email: antc@amd.com
+ affiliation: AMD
+ - given-names: Chunyu
+ family-names: Lai
+ email: chunyu.lai@amd.com
+ affiliation: AMD
+ - given-names: Illia
+ family-names: Silin
+ email: illia.silin@amd.com
+ affiliation: AMD
+ - given-names: Adam
+ family-names: Osewski
+ email: adam.osewski@amd.com
+ affiliation: AMD
+ - given-names: Poyen
+ family-names: Chen
+ email: poyen.chen@amd.com
+ affiliation: AMD
+ - given-names: Rosty
+ family-names: Geyyer
+ email: rosty.geyyer@amd.com
+ affiliation: AMD
+ - given-names: Hanwen
+ family-names: Chen
+ - given-names: Tejash
+ family-names: Shah
+ - given-names: Xiaoyan
+ family-names: Zhou
+ - given-names: Jianfeng
+ family-names: Yan
+repository-code: 'https://github.com/ROCm/composable_kernel'
+abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel programming languages, like HIP C++.
+keywords:
+ - 'CK, Composable Kernel, Tensor Coordinate Transformation'
+license: MIT
+license-url: https://github.com/ROCm/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000..4e12462a41
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,671 @@
+cmake_minimum_required(VERSION 3.14)
+if(POLICY CMP0140)
+ # policy CMP0140 is not known to CMake until 3.25
+ cmake_policy(SET CMP0140 NEW)
+endif()
+
+get_property(_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+
+# This has to be initialized before the project() command appears
+# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
+if(_GENERATOR_IS_MULTI_CONFIG)
+ set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo;MinSizeRel" CACHE STRING
+ "Available build types (configurations) on multi-config generators")
+else()
+ set(CMAKE_BUILD_TYPE Release CACHE STRING
+ "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
+endif()
+
+# Default installation path
+if(NOT WIN32)
+ set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+endif()
+
+set(version 1.1.0)
+# Check support for CUDA/HIP in CMake
+project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
+include(CTest)
+
+# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
+# CK Codegen requires dataclass which is added in Python 3.7
+# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
+if(NOT CK_USE_ALTERNATIVE_PYTHON)
+ find_package(Python3 3.8 COMPONENTS Interpreter REQUIRED)
+else()
+ message("Using alternative python version")
+ set(EXTRA_PYTHON_PATH)
+ # this is overly restrictive, we may need to be more flexible on the following
+ string(REPLACE "/bin/python3.8" "" EXTRA_PYTHON_PATH "${CK_USE_ALTERNATIVE_PYTHON}")
+ message("alternative python path is: ${EXTRA_PYTHON_PATH}")
+ find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
+ add_definitions(-DPython3_EXECUTABLE="${CK_USE_ALTERNATIVE_PYTHON}")
+ set(Python3_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}")
+ set(PYTHON_EXECUTABLE "${CK_USE_ALTERNATIVE_PYTHON}")
+ set(ENV{LD_LIBRARY_PATH} "${EXTRA_PYTHON_PATH}/lib:$ENV{LD_LIBRARY_PATH}")
+endif()
+
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+
+if (DTYPES)
+ add_definitions(-DDTYPES)
+ if (DTYPES MATCHES "int8")
+ add_definitions(-DCK_ENABLE_INT8)
+ set(CK_ENABLE_INT8 "ON")
+ endif()
+ if (DTYPES MATCHES "fp8")
+ add_definitions(-DCK_ENABLE_FP8)
+ set(CK_ENABLE_FP8 "ON")
+ endif()
+ if (DTYPES MATCHES "bf8")
+ add_definitions(-DCK_ENABLE_BF8)
+ set(CK_ENABLE_BF8 "ON")
+ endif()
+ if (DTYPES MATCHES "fp16")
+ add_definitions(-DCK_ENABLE_FP16)
+ set(CK_ENABLE_FP16 "ON")
+ endif()
+ if (DTYPES MATCHES "fp32")
+ add_definitions(-DCK_ENABLE_FP32)
+ set(CK_ENABLE_FP32 "ON")
+ endif()
+ if (DTYPES MATCHES "fp64")
+ add_definitions(-DCK_ENABLE_FP64)
+ set(CK_ENABLE_FP64 "ON")
+ endif()
+ if (DTYPES MATCHES "bf16")
+ add_definitions(-DCK_ENABLE_BF16)
+ set(CK_ENABLE_BF16 "ON")
+ endif()
+ message("DTYPES macro set to ${DTYPES}")
+else()
+ add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8)
+ set(CK_ENABLE_INT8 "ON")
+ set(CK_ENABLE_FP16 "ON")
+ set(CK_ENABLE_FP32 "ON")
+ set(CK_ENABLE_FP64 "ON")
+ set(CK_ENABLE_BF16 "ON")
+ set(CK_ENABLE_FP8 "ON")
+ set(CK_ENABLE_BF8 "ON")
+endif()
+
+#for f8/bf8_t type
+add_compile_options(-Wno-bit-int-extension)
+add_compile_options(-Wno-pass-failed)
+add_compile_options(-Wno-switch-default)
+add_compile_options(-Wno-unique-object-duplication)
+
+if(NOT DISABLE_DL_KERNELS)
+ add_definitions(-DDL_KERNELS)
+ set(DL_KERNELS "ON")
+ set(CK_ENABLE_DL_KERNELS "ON")
+endif()
+if(NOT DISABLE_DPP_KERNELS)
+ add_definitions(-DDPP_KERNELS)
+ set(DPP_KERNELS "ON")
+ set(CK_ENABLE_DPP_KERNELS "ON")
+endif()
+option(CK_USE_CODEGEN "Enable codegen library" OFF)
+if(CK_USE_CODEGEN)
+ add_definitions(-DCK_USE_CODEGEN)
+endif()
+
+option(CK_TIME_KERNEL "Enable kernel time tracking" ON)
+if(CK_TIME_KERNEL)
+ add_definitions(-DCK_TIME_KERNEL=1)
+else()
+ add_definitions(-DCK_TIME_KERNEL=0)
+endif()
+
+include(getopt)
+
+# CK version file to record release version as well as git commit hash
+find_package(Git REQUIRED)
+execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
+configure_file(include/ck/version.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/version.h)
+
+set(ROCM_SYMLINK_LIBS OFF)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
+
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
+include(CheckCXXCompilerFlag)
+include(ROCMCheckTargetIds)
+include(TargetFlags)
+
+rocm_setup_version(VERSION ${version})
+
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip "$ENV{ROCM_PATH}" "$ENV{HIP_PATH}")
+
+message("GPU_TARGETS= ${GPU_TARGETS}")
+message("GPU_ARCHS= ${GPU_ARCHS}")
+if(GPU_ARCHS)
+ #disable GPU_TARGETS to avoid conflicts, this needs to happen before we call hip package
+ unset(GPU_TARGETS CACHE)
+ unset(AMDGPU_TARGETS CACHE)
+endif()
+if(GPU_TARGETS)
+ set(USER_GPU_TARGETS 1)
+else()
+ set(USER_GPU_TARGETS 0)
+endif()
+find_package(hip REQUIRED)
+# No assumption that HIP kernels are launched with uniform block size for backward compatibility
+# SWDEV-413293 and https://reviews.llvm.org/D155213
+math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
+message("hip_version_flat=${hip_VERSION_FLAT}")
+
+message("checking which targets are supported")
+#In order to build just the CK library (without tests and examples) for all supported GPU targets
+#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+#the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
+#
+#In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
+if(NOT ENABLE_ASAN_PACKAGING)
+ if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
+ # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
+ set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+ elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600300000 AND ${hip_VERSION_FLAT} LESS 600400000)
+ set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
+ elseif(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER_EQUAL 600400000)
+ set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950")
+ endif()
+else()
+ #build CK only for xnack-supported targets when using ASAN
+ set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
+endif()
+
+#if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
+#otherwise, if user set GPU_TARGETS, use that set of targets
+if(GPU_ARCHS)
+ set(CK_GPU_TARGETS ${GPU_ARCHS})
+else()
+ if(USER_GPU_TARGETS)
+ set(CK_GPU_TARGETS ${GPU_TARGETS})
+ endif()
+endif()
+#if the user did not set GPU_TARGETS, delete whatever was set by HIP package
+if(NOT USER_GPU_TARGETS)
+ set(GPU_TARGETS "")
+endif()
+#make sure all the targets on the list are actually supported by the current compiler
+rocm_check_target_ids(SUPPORTED_GPU_TARGETS
+ TARGETS ${CK_GPU_TARGETS})
+
+message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
+
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+ message("Enabling XDL instances")
+ add_definitions(-DCK_USE_XDL)
+ set(CK_USE_XDL "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx94" OR SUPPORTED_GPU_TARGETS MATCHES "gfx95")
+ message("Enabling XDL FP8 gemms on native architectures")
+ add_definitions(-DCK_USE_GFX94)
+ set(CK_USE_GFX94 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+ message("Enabling WMMA instances")
+ add_definitions(-DCK_USE_WMMA)
+ set(CK_USE_WMMA "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+ message("Enabling WMMA FP8 gemms on native architectures")
+ add_definitions(-DCK_USE_WMMA_FP8)
+ set(CK_USE_WMMA_FP8 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx950")
+ add_definitions(-DCK_USE_OCP_FP8)
+ set(CK_USE_OCP_FP8 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+ add_definitions(-DCK_USE_FNUZ_FP8)
+ set(CK_USE_FNUZ_FP8 "ON")
+endif()
+if (SUPPORTED_GPU_TARGETS MATCHES "gfx950")
+ add_definitions(-DCK_USE_NATIVE_MX_SUPPORT)
+ set(CK_USE_NATIVE_MX_SUPPORT "ON")
+endif()
+
+option(CK_USE_FP8_ON_UNSUPPORTED_ARCH "Enable FP8 GEMM instances on older architectures" OFF)
+if(CK_USE_FP8_ON_UNSUPPORTED_ARCH AND (SUPPORTED_GPU_TARGETS MATCHES "gfx90a" OR SUPPORTED_GPU_TARGETS MATCHES "gfx908"))
+ add_definitions(-DCK_USE_FP8_ON_UNSUPPORTED_ARCH)
+ set(CK_USE_FP8_ON_UNSUPPORTED_ARCH "ON")
+endif()
+
+# CK config file to record supported datatypes, etc.
+configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
+
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
+ check_cxx_compiler_flag("-fno-offload-uniform-block" HAS_NO_OFFLOAD_UNIFORM_BLOCK)
+ if(HAS_NO_OFFLOAD_UNIFORM_BLOCK)
+ message("Adding the fno-offload-uniform-block compiler flag")
+ add_compile_options(-fno-offload-uniform-block)
+ endif()
+endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
+ check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
+ if(HAS_LSR_DROP_SOLUTION)
+ message("Adding the lsr-drop-solution=1 compiler flag")
+ add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
+ endif()
+endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
+ check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
+ if(HAS_ENABLE_POST_MISCHED)
+ message("Adding the enable-post-misched=0 compiler flag")
+ add_compile_options("SHELL: -mllvm -enable-post-misched=0")
+ endif()
+endif()
+set(check-coerce)
+check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
+if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132)
+ message("Adding the amdgpu-coerce-illegal-types=1")
+ add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
+endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
+ message("Adding -amdgpu-early-inline-all=true and -amdgpu-function-calls=false")
+ add_compile_options("SHELL: -mllvm -amdgpu-early-inline-all=true")
+ add_compile_options("SHELL: -mllvm -amdgpu-function-calls=false")
+endif()
+#
+# Separate linking jobs from compiling
+# Too many concurrent linking jobs can break the build
+# Copied from LLVM
+set(CK_PARALLEL_LINK_JOBS "" CACHE STRING
+ "Define the maximum number of concurrent link jobs (Ninja only).")
+if(CMAKE_GENERATOR MATCHES "Ninja")
+ if(CK_PARALLEL_LINK_JOBS)
+ set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${CK_PARALLEL_LINK_JOBS})
+ set(CMAKE_JOB_POOL_LINK link_job_pool)
+ endif()
+elseif(CK_PARALLEL_LINK_JOBS)
+ message(WARNING "Job pooling is only available with Ninja generators.")
+endif()
+# Similar for compiling
+set(CK_PARALLEL_COMPILE_JOBS "" CACHE STRING
+ "Define the maximum number of concurrent compile jobs (Ninja only).")
+if(CMAKE_GENERATOR MATCHES "Ninja")
+ if(CK_PARALLEL_COMPILE_JOBS)
+ set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${CK_PARALLEL_COMPILE_JOBS})
+ set(CMAKE_JOB_POOL_COMPILE compile_job_pool)
+ endif()
+elseif(CK_PARALLEL_COMPILE_JOBS)
+ message(WARNING "Job pooling is only available with Ninja generators.")
+endif()
+
+
+option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
+option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
+
+if(USE_BITINT_EXTENSION_INT4)
+ add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+ add_compile_options(-Wno-bit-int-extension)
+ message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
+endif()
+
+if(USE_OPT_GFX11)
+ add_compile_options(-mcumode)
+ add_compile_options(-mno-wavefrontsize64)
+ message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}")
+endif()
+
+## Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+link_libraries(Threads::Threads)
+
+## C++
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+message("CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
+
+# https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_macros.html
+# _GLIBCXX_ASSERTIONS
+# Undefined by default. When defined, enables extra error checking in the form of
+# precondition assertions, such as bounds checking in strings and null pointer
+# checks when dereferencing smart pointers
+option(USE_GLIBCXX_ASSERTIONS "Turn on additional c++ library checks." OFF)
+if(USE_GLIBCXX_ASSERTIONS)
+ add_compile_options(-Wp,-D_GLIBCXX_ASSERTIONS)
+endif()
+
+## HIP
+set(CMAKE_HIP_PLATFORM amd)
+set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER})
+set(CMAKE_HIP_EXTENSIONS ON)
+message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
+
+## OpenMP
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ # workaround issue hipcc in rocm3.5 cannot find openmp
+ set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+ set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+ set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+ set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+else()
+ find_package(OpenMP REQUIRED)
+endif()
+
+message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
+message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
+message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
+message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
+
+link_libraries(${OpenMP_gomp_LIBRARY})
+link_libraries(${OpenMP_pthread_LIBRARY})
+
+## HIP
+# Override HIP version in config.h, if necessary.
+# The variables set by find_package() can't be overwritten,
+# therefore let's use intermediate variables.
+set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
+set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
+set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
+ set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+ message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
+ set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
+ message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
+ set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
+ message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
+endif()
+message(STATUS "Build with HIP ${HIP_VERSION}")
+link_libraries(hip::device)
+if(CK_hip_VERSION VERSION_GREATER_EQUAL 6.0.23494)
+ add_compile_definitions(__HIP_PLATFORM_AMD__=1)
+else()
+ add_compile_definitions(__HIP_PLATFORM_HCC__=1)
+endif()
+
+## tidy
+include(EnableCompilerWarnings)
+set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
+if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+ set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
+# Enable tidy on hip
+elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
+ set(CK_TIDY_ERRORS ALL)
+endif()
+
+
+include(ClangTidy)
+enable_clang_tidy(
+ CHECKS
+ *
+ -abseil-*
+ -android-cloexec-fopen
+ # Yea we shouldn't be using rand()
+ -cert-msc30-c
+ -bugprone-exception-escape
+ -bugprone-macro-parentheses
+ -cert-env33-c
+ -cert-msc32-c
+ -cert-msc50-cpp
+ -cert-msc51-cpp
+ -cert-dcl37-c
+ -cert-dcl51-cpp
+ -clang-analyzer-alpha.core.CastToStruct
+ -clang-analyzer-optin.performance.Padding
+ -clang-diagnostic-deprecated-declarations
+ -clang-diagnostic-extern-c-compat
+ -clang-diagnostic-unused-command-line-argument
+ -cppcoreguidelines-avoid-c-arrays
+ -cppcoreguidelines-avoid-magic-numbers
+ -cppcoreguidelines-explicit-virtual-functions
+ -cppcoreguidelines-init-variables
+ -cppcoreguidelines-macro-usage
+ -cppcoreguidelines-non-private-member-variables-in-classes
+ -cppcoreguidelines-pro-bounds-array-to-pointer-decay
+ -cppcoreguidelines-pro-bounds-constant-array-index
+ -cppcoreguidelines-pro-bounds-pointer-arithmetic
+ -cppcoreguidelines-pro-type-member-init
+ -cppcoreguidelines-pro-type-reinterpret-cast
+ -cppcoreguidelines-pro-type-union-access
+ -cppcoreguidelines-pro-type-vararg
+ -cppcoreguidelines-special-member-functions
+ -fuchsia-*
+ -google-explicit-constructor
+ -google-readability-braces-around-statements
+ -google-readability-todo
+ -google-runtime-int
+ -google-runtime-references
+ -hicpp-vararg
+ -hicpp-braces-around-statements
+ -hicpp-explicit-conversions
+ -hicpp-named-parameter
+ -hicpp-no-array-decay
+ # We really shouldn't use bitwise operators with signed integers, but
+ # opencl leaves us no choice
+ -hicpp-avoid-c-arrays
+ -hicpp-signed-bitwise
+ -hicpp-special-member-functions
+ -hicpp-uppercase-literal-suffix
+ -hicpp-use-auto
+ -hicpp-use-equals-default
+ -hicpp-use-override
+ -llvm-header-guard
+ -llvm-include-order
+ #-llvmlibc-*
+ -llvmlibc-restrict-system-libc-headers
+ -llvmlibc-callee-namespace
+ -llvmlibc-implementation-in-namespace
+ -llvm-else-after-return
+ -llvm-qualified-auto
+ -misc-misplaced-const
+ -misc-non-private-member-variables-in-classes
+ -misc-no-recursion
+ -modernize-avoid-bind
+ -modernize-avoid-c-arrays
+ -modernize-pass-by-value
+ -modernize-use-auto
+ -modernize-use-default-member-init
+ -modernize-use-equals-default
+ -modernize-use-trailing-return-type
+ -modernize-use-transparent-functors
+ -performance-unnecessary-value-param
+ -readability-braces-around-statements
+ -readability-else-after-return
+ # we are not ready to use it, but very useful
+ -readability-function-cognitive-complexity
+ -readability-isolate-declaration
+ -readability-magic-numbers
+ -readability-named-parameter
+ -readability-uppercase-literal-suffix
+ -readability-convert-member-functions-to-static
+ -readability-qualified-auto
+ -readability-redundant-string-init
+ # too many narrowing conversions in our code
+ -bugprone-narrowing-conversions
+ -cppcoreguidelines-narrowing-conversions
+ -altera-struct-pack-align
+ -cppcoreguidelines-prefer-member-initializer
+ ${CK_TIDY_CHECKS}
+ ${CK_TIDY_ERRORS}
+ HEADER_FILTER
+ "\.hpp$"
+ EXTRA_ARGS
+ -DCK_USE_CLANG_TIDY
+)
+
+include(CppCheck)
+enable_cppcheck(
+ CHECKS
+ warning
+ style
+ performance
+ portability
+ SUPPRESS
+ ConfigurationNotChecked
+ constStatement
+ duplicateCondition
+ noExplicitConstructor
+ passedByValue
+ preprocessorErrorDirective
+ shadowVariable
+ unusedFunction
+ unusedPrivateFunction
+ unusedStructMember
+ unmatchedSuppression
+ FORCE
+ SOURCES
+ library/src
+ INCLUDE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_BINARY_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/library/include
+ DEFINE
+ CPPCHECK=1
+ __linux__=1
+)
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
+
+# set CK project include directories
+include_directories(BEFORE
+ ${PROJECT_BINARY_DIR}/include
+ ${PROJECT_SOURCE_DIR}/include
+ ${PROJECT_SOURCE_DIR}/library/include
+ ${HIP_INCLUDE_DIRS}
+)
+
+SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+if(BUILD_DEV)
+ add_compile_options(-Werror)
+ add_compile_options(-Weverything)
+endif()
+message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+ add_compile_options(-fcolor-diagnostics)
+endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
+ add_compile_options(-fdiagnostics-color=always)
+endif()
+
+# make check runs the entire set of examples and tests
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+# make smoke runs the tests and examples that run within 30 seconds on gfx90a
+add_custom_target(smoke COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR} -L "SMOKE_TEST")
+# make regression runs the tests and examples that run for more than 30 seconds on gfx90a
+add_custom_target(regression COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR} -L "REGRESSION_TEST")
+
+
+file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
+file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
+set(CK_DEVICE_INSTANCES)
+FOREACH(subdir_path ${dir_list})
+set(target_dir)
+IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
+ set(cmake_instance)
+ file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
+ set(add_inst 0)
+ if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
+ set(add_inst 1)
+ endif()
+ if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
+ set(add_inst 1)
+ endif()
+ if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
+ set(add_inst 1)
+ endif()
+ if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
+ set(add_inst 1)
+ endif()
+ if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
+ set(add_inst 1)
+ endif()
+ if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
+ set(add_inst 1)
+ endif()
+ if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
+ set(add_inst 1)
+ endif()
+ if(NOT "${cmake_instance}" MATCHES "DTYPES")
+ set(add_inst 1)
+ endif()
+ if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
+ list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
+ endif()
+ENDIF()
+ENDFOREACH()
+
+add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
+add_subdirectory(library)
+
+if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
+ rocm_package_setup_component(tests
+ LIBRARY_NAME composablekernel
+ PACKAGE_NAME tests # Prevent -static suffix on package name
+ )
+
+ rocm_package_setup_component(examples
+ LIBRARY_NAME composablekernel
+ PACKAGE_NAME examples
+ )
+ add_subdirectory(example)
+ add_subdirectory(tile_engine)
+ if(BUILD_TESTING)
+ add_subdirectory(test)
+ endif()
+endif()
+
+rocm_package_setup_component(profiler
+ LIBRARY_NAME composablekernel
+ PACKAGE_NAME ckprofiler
+)
+add_subdirectory(profiler)
+
+if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
+ add_subdirectory(codegen)
+endif()
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+write_basic_package_version_file(
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ VERSION "${version}"
+ COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+ NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+rocm_install(FILES
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
+
+# Install CK version and configuration files
+rocm_install(FILES
+ ${PROJECT_BINARY_DIR}/include/ck/version.h
+ ${PROJECT_BINARY_DIR}/include/ck/config.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
+)
+
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+
+rocm_create_package(
+ NAME composablekernel
+ DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+ MAINTAINER "MIOpen Kernels Dev Team
"
+ LDCONFIG
+ HEADER_ONLY
+)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
new file mode 100644
index 0000000000..0900b7a1f8
--- /dev/null
+++ b/CONTRIBUTORS.md
@@ -0,0 +1,35 @@
+[Back to the main page](./README.md)
+# Composable Kernel Developers and Contributors
+
+This is the list of developers and contributors to the Composable Kernel library
+
+
+## Developers
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
+
+[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2023
+
+[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), [Astha Rai](https://github.com/arai713), [Shi YanXing](https://github.com/Yanxing-Shi), 2022-2023
+
+[Hari Sadasivan](https://github.com/hsadasiv), [Bartlomiej Kocot](https://github.com/bartekxk), [Bartlomiej Wroblewski](https://github.com/bwroblew), 2023
+
+Hanwen Chang, 2019-2021,
+
+Tejash Shah, 2019-2020
+
+Xiaoyan Zhou, 2020
+
+[Jianfeng Yan](https://github.com/j4yan), 2021-2022
+[Jun Liu](https://github.com/junliume), 2021-2024
+
+## Product Manager
+[John Afaganis](https://github.com/afagaj)
+
+
+
+## Contributors
+[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung)
+
+
+## Acknowledgement
+CK team works closely with Meta [AITemplate](https://github.com/facebookincubator/AITemplate) team ([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc). Most of the lucrative graph optimization opportunities in ML models were identified by AITemplate team, and we also co-designed many high performance fused kernels for AMD GPUs. Without this collaboration, CK would not reach its current potential.
diff --git a/Config.cmake.in b/Config.cmake.in
new file mode 100644
index 0000000000..2861a28f49
--- /dev/null
+++ b/Config.cmake.in
@@ -0,0 +1,11 @@
+@PACKAGE_INIT@
+
+set(_composable_kernel_supported_components device_other_operations device_gemm_operations device_conv_operations device_mha_operations device_contraction_operations device_reduction_operations utility)
+
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+ if(NOT _comp IN_LIST _composable_kernel_supported_components)
+ set(composable_kernel_FOUND False)
+ set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+ endif()
+ include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000..c629bd034c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,123 @@
+FROM ubuntu:24.04
+ARG DEBIAN_FRONTEND=noninteractive
+ARG ROCMVERSION=6.4
+ARG compiler_version=""
+ARG compiler_commit=""
+ARG CK_SCCACHE=""
+ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
+ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
+
+# Add rocm repository
+RUN set -xe && \
+ apt-get update && apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl && \
+ curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+
+RUN if [ "$ROCMVERSION" != "6.5" ]; then \
+ sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/jammy/amdgpu-install_6.4.60400-1_all.deb --no-check-certificate" && \
+ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.4.60400-1_all.deb && \
+ wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO jammy main > /etc/apt/sources.list.d/rocm.list" && \
+ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list'; \
+ fi
+
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu jammy main universe | tee -a /etc/apt/sources.list" && \
+ amdgpu-install -y --usecase=rocm --no-dkms
+
+## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined
+ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache
+ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin
+ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION}
+ENV CK_SCCACHE=$CK_SCCACHE
+RUN if [ "$CK_SCCACHE" != "" ]; then \
+ mkdir -p ${SCCACHE_INSTALL_LOCATION} && \
+ curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \
+ chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \
+ fi
+
+# Install dependencies
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+ build-essential \
+ cmake \
+ git \
+ hip-rocclr \
+ iputils-ping \
+ jq \
+ libelf-dev \
+ libnuma-dev \
+ libpthread-stubs0-dev \
+ llvm-amdgpu \
+ mpich \
+ net-tools \
+ pkg-config \
+ python3-full \
+ redis \
+ rocm-llvm-dev \
+ sshpass \
+ stunnel \
+ software-properties-common \
+ vim \
+ nano \
+ zlib1g-dev \
+ zip \
+ libzstd-dev \
+ openssh-server \
+ clang-format-12 \
+ kmod && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* && \
+ rm -rf amdgpu-install* && \
+# Remove unnecessary rocm components that take a lot of space
+ apt-get remove -y rocblas rocfft rocsparse composablekernel-dev hipblaslt
+
+#Install latest ccache
+RUN git clone https://github.com/ccache/ccache.git && \
+ cd ccache && mkdir build && cd build && cmake .. && make install && \
+#Install ninja build tracing tools
+ cd / && \
+ wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip && \
+ gunzip /usr/local/bin/ninja.gz && \
+ chmod a+x /usr/local/bin/ninja && \
+ git clone https://github.com/nico/ninjatracing.git && \
+#Install ClangBuildAnalyzer
+ git clone https://github.com/aras-p/ClangBuildAnalyzer.git && \
+ cd ClangBuildAnalyzer/ && \
+ make -f projects/make/Makefile && \
+ cd / && \
+#Install latest cppcheck
+ git clone https://github.com/danmar/cppcheck.git && \
+ cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \
+ cd / && \
+# Install an init system
+ wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb && \
+ dpkg -i dumb-init_*.deb && rm dumb-init_*.deb && \
+# Install packages for processing the performance results
+ pip3 install --break-system-packages --upgrade pytest pymysql pandas==2.2.3 sqlalchemy==2.0.3 setuptools-rust setuptools sshtunnel==0.4.0 && \
+# Add render group
+ groupadd -f render && \
+# Install the new rocm-cmake version
+ git clone -b master https://github.com/ROCm/rocm-cmake.git && \
+ cd rocm-cmake && mkdir build && cd build && \
+ cmake .. && cmake --build . && cmake --build . --target install
+
+WORKDIR /
+# Add alternative compilers, if necessary
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+ sh -c "echo compiler commit = '$compiler_commit'"
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
+ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+ cd llvm-project && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+ make -j 8 ; \
+ else echo "using the release compiler"; \
+ fi
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
+ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+ make -j 8 ; \
+ else echo "using the release compiler"; \
+ fi
diff --git a/Dockerfile.compiler b/Dockerfile.compiler
new file mode 100644
index 0000000000..7534910681
--- /dev/null
+++ b/Dockerfile.compiler
@@ -0,0 +1,26 @@
+ARG BASE_DOCKER="rocm/composable_kernel:ck_ub24.04_rocm6.4"
+FROM $BASE_DOCKER
+ARG compiler_version=""
+ARG compiler_commit=""
+
+# Add alternative compilers, if necessary
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'" && \
+ sh -c "echo compiler commit = '$compiler_commit'"
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \
+ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+ cd llvm-project && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+ make -j 16 ; \
+ else echo "using the release compiler"; \
+ fi
+
+RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \
+ git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \
+ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \
+ make -j 16 ; \
+ else echo "using the release compiler"; \
+ fi
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000000..3e22eb2f01
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,1268 @@
+def rocmnode(name) {
+ return '(rocmtest || miopen) && (' + name + ')'
+}
+
+def show_node_info() {
+ sh """
+ echo "NODE_NAME = \$NODE_NAME"
+ lsb_release -sd
+ uname -r
+ cat /sys/module/amdgpu/version
+ ls /opt/ -la
+ """
+}
+
+def nthreads() {
+ def nproc = sh(returnStdout: true, script: 'nproc')
+ echo "Number of cores: ${nproc}"
+ def n = nproc.toInteger()
+ if (n > 32){
+ n /= 2
+ }
+ if (n > 64){
+ n = 64
+ }
+ echo "Number of threads used for building: ${n}"
+ return n
+}
+
+def runShell(String command){
+ def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
+ def output = readFile(file: "tmp.txt")
+ return (output != "")
+}
+
+def getBaseDockerImageName(){
+ def img
+ if (params.USE_CUSTOM_DOCKER != ""){
+ img = "${params.USE_CUSTOM_DOCKER}"
+ }
+ else{
+ def ROCM_numeric = "${params.ROCMVERSION}" as float
+ if ( ROCM_numeric < 6.5 ){
+ img = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm${params.ROCMVERSION}"
+ }
+ else{
+ img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm${params.ROCMVERSION}"
+ }
+ }
+ return img
+}
+
+def getDockerImageName(){
+ def img
+ def base_name = getBaseDockerImageName()
+ if (params.USE_CUSTOM_DOCKER != ""){
+ img = "${params.USE_CUSTOM_DOCKER}"
+ }
+ else{
+ if (params.COMPILER_VERSION == "") {
+ img = "${base_name}"
+ }
+ else{
+ if (params.COMPILER_COMMIT == ""){
+ img = "${base_name}_${params.COMPILER_VERSION}"
+ }
+ else{
+ def commit = "${params.COMPILER_COMMIT}"[0..6]
+ img = "${base_name}_${params.COMPILER_VERSION}_${commit}"
+ }
+ }
+ }
+ return img
+}
+
+def check_host() {
+ if ("${env.CK_SCCACHE}" != "null"){
+ def SCCACHE_SERVER="${env.CK_SCCACHE.split(':')[0]}"
+ echo "sccache server: ${SCCACHE_SERVER}"
+ sh '''ping -c 1 -p 6379 "${SCCACHE_SERVER}" | echo $? > tmp.txt'''
+ def output = readFile(file: "tmp.txt")
+ echo "tmp.txt contents: \$output"
+ return (output != "0")
+ }
+ else{
+ return 1
+ }
+}
+
+def build_compiler(){
+ def compiler
+ compiler = "${params.BUILD_COMPILER}"
+ return compiler
+}
+
+def getDockerImage(Map conf=[:]){
+ env.DOCKER_BUILDKIT=1
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+ def no_cache = conf.get("no_cache", false)
+ def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' --build-arg DISABLE_CACHE='git rev-parse ${params.COMPILER_VERSION}' "
+ if(no_cache)
+ {
+ dockerArgs = dockerArgs + " --no-cache "
+ }
+ echo "Docker Args: ${dockerArgs}"
+ def image
+ if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
+ image = conf.get("docker_name", "")
+ echo "Using legacy docker: ${image}"
+ }
+ else{
+ image = getDockerImageName()
+ echo "Using default docker: ${image}"
+ }
+ //Check if image exists
+ def retimage
+ try
+ {
+ echo "Pulling down image: ${image}"
+ retimage = docker.image("${image}")
+ withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
+ retimage.pull()
+ }
+ }
+ catch(Exception ex)
+ {
+ error "Unable to locate image: ${image}"
+ }
+ return [retimage, image]
+}
+
+def buildDocker(install_prefix){
+ show_node_info()
+ env.DOCKER_BUILDKIT=1
+ checkout scm
+ def image_name = getDockerImageName()
+ def base_image_name = getBaseDockerImageName()
+ echo "Building Docker for ${image_name}"
+ def dockerArgs = "--build-arg PREFIX=${install_prefix} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ if(params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
+ dockerArgs = dockerArgs + " --no-cache --build-arg BASE_DOCKER='${base_image_name}' -f Dockerfile.compiler . "
+ }
+ else{
+ dockerArgs = dockerArgs + " -f Dockerfile . "
+ }
+ echo "Build Args: ${dockerArgs}"
+ try{
+ if(params.BUILD_DOCKER){
+ //force building the new docker if that parameter is true
+ echo "Building image: ${image_name}"
+ retimage = docker.build("${image_name}", dockerArgs)
+ withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
+ retimage.push()
+ }
+ sh 'docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi'
+ }
+ else{
+ echo "Checking for image: ${image_name}"
+ sh "docker manifest inspect --insecure ${image_name}"
+ echo "Image: ${image_name} found! Skipping building image"
+ }
+ }
+ catch(Exception ex){
+ echo "Unable to locate image: ${image_name}. Building image now"
+ retimage = docker.build("${image_name}", dockerArgs + ' .')
+ withDockerRegistry([ credentialsId: "ck_docker_cred", url: "" ]) {
+ retimage.push()
+ }
+ }
+}
+
+def cmake_build(Map conf=[:]){
+
+ def compiler = build_compiler()
+ def config_targets = conf.get("config_targets","check")
+ def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "")
+ def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","")
+ def prefixpath = conf.get("prefixpath","/opt/rocm")
+ def setup_args = conf.get("setup_args","")
+
+ if (prefixpath != "/usr/local"){
+ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
+ }
+
+ def build_type_debug = (conf.get("build_type",'release') == 'debug')
+
+ //cmake_env can overwrite default CXX variables.
+ def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","")
+
+ def package_build = (conf.get("package_build","") == "true")
+
+ if (package_build == true) {
+ config_targets = "package"
+ }
+
+ if(conf.get("build_install","") == "true")
+ {
+ config_targets = 'install ' + config_targets
+ setup_args = ' -DBUILD_DEV=On -DCMAKE_INSTALL_PREFIX=../install' + setup_args
+ } else{
+ setup_args = ' -DBUILD_DEV=On' + setup_args
+ }
+ if (params.DISABLE_DL_KERNELS){
+ setup_args = setup_args + " -DDISABLE_DL_KERNELS=ON "
+ }
+
+ if(build_type_debug){
+ setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args
+ }else{
+ setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
+ }
+
+ def pre_setup_cmd = """
+ #!/bin/bash
+ echo \$HSA_ENABLE_SDMA
+ ulimit -c unlimited
+ rm -rf build
+ mkdir build
+ rm -rf install
+ mkdir install
+ cd build
+ """
+ def invocation_tag=""
+ if (setup_args.contains("gfx12")){
+ invocation_tag="gfx12"
+ }
+ if (setup_args.contains("gfx11")){
+ invocation_tag="gfx11"
+ }
+ if (setup_args.contains("gfx10")){
+ invocation_tag="gfx10"
+ }
+ if (setup_args.contains("gfx908")){
+ invocation_tag="gfx908"
+ }
+ if (setup_args.contains("gfx90a")){
+ invocation_tag="gfx90a"
+ }
+ if (setup_args.contains("gfx94")){
+ invocation_tag="gfx94"
+ }
+ echo "invocation tag: ${invocation_tag}"
+ def redis_pre_setup_cmd = pre_setup_cmd
+ if(check_host() && params.USE_SCCACHE && "${env.CK_SCCACHE}" != "null" && "${invocation_tag}" != "") {
+ redis_pre_setup_cmd = pre_setup_cmd + """
+ #!/bin/bash
+ export ROCM_PATH=/opt/rocm
+ export SCCACHE_ENABLED=true
+ export SCCACHE_LOG_LEVEL=debug
+ export SCCACHE_IDLE_TIMEOUT=14400
+ export COMPILERS_HASH_DIR=/tmp/.sccache
+ export SCCACHE_BIN=/usr/local/.cargo/bin/sccache
+ export SCCACHE_EXTRAFILES=/tmp/.sccache/rocm_compilers_hash_file
+ export SCCACHE_REDIS="redis://${env.CK_SCCACHE}"
+ echo "connect = ${env.CK_SCCACHE}" >> ../script/redis-cli.conf
+ export SCCACHE_C_CUSTOM_CACHE_BUSTER="${invocation_tag}"
+ echo \$SCCACHE_C_CUSTOM_CACHE_BUSTER
+ stunnel ../script/redis-cli.conf
+ ../script/sccache_wrapper.sh --enforce_redis
+ """
+ try {
+ def cmd1 = conf.get("cmd1", """
+ ${redis_pre_setup_cmd}
+ """)
+ sh cmd1
+ setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache " + setup_args
+ }
+ catch(Exception err){
+ echo "could not connect to redis server: ${err.getMessage()}. will not use sccache."
+ def cmd2 = conf.get("cmd2", """
+ ${pre_setup_cmd}
+ """)
+ sh cmd2
+ }
+ }
+ else{
+ def cmd3 = conf.get("cmd3", """
+ ${pre_setup_cmd}
+ """)
+ sh cmd3
+ }
+
+ // reduce parallelism when compiling, clang uses too much memory
+ def nt = nthreads()
+ def cmd
+ def setup_cmd
+ def build_cmd
+ def execute_cmd = conf.get("execute_cmd", "")
+ if(!setup_args.contains("NO_CK_BUILD")){
+ if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
+ echo "running ninja build trace"
+ setup_cmd = conf.get("setup_cmd", """${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_CXX_FLAGS=" -O3 -ftime-trace " .. """)
+ build_cmd = conf.get("build_cmd", "${build_envs} ninja -j${nt} ${config_targets}")
+ }
+ else{
+ setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
+ build_cmd = conf.get("build_cmd", "${build_envs} make -j${nt} ${config_targets}")
+ }
+ cmd = conf.get("cmd", """
+ ${setup_cmd}
+ ${build_cmd}
+ ${execute_cmd}
+ """)
+ }
+ else{
+ cmd = conf.get("cmd", """
+ ${execute_cmd}
+ """)
+ }
+
+ echo cmd
+
+ dir("build"){
+ //build CK
+ sh cmd
+ //run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set
+ if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
+ if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
+ sh "/ninjatracing/ninjatracing .ninja_log > ck_build_trace.json"
+ sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --all . clang_build.log"
+ sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --analyze clang_build.log > clang_build_analysis.log"
+ archiveArtifacts "ck_build_trace.json"
+ archiveArtifacts "clang_build_analysis.log"
+ // do not run unit tests when building instances only
+ if(!params.BUILD_INSTANCES_ONLY){
+ sh "ninja test"
+ }
+ }
+ else{
+ // run unit tests unless building library for all targets
+ if (!params.BUILD_INSTANCES_ONLY){
+ sh "make check"
+ }
+ }
+ }
+ }
+
+ // Only archive packages from the develop or amd-master branches
+ if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
+ archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
+ }
+ //check the node gpu architecture
+ def arch_type = 0
+ sh 'rocminfo | tee rocminfo.log'
+ if ( runShell('grep -n "gfx90a" rocminfo.log') ){
+ arch_type = 1
+ }
+ else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
+ arch_type = 2
+ }
+ if (params.RUN_CK_TILE_FMHA_TESTS){
+ try{
+ archiveArtifacts "perf_fmha_*.log"
+ if (arch_type == 1){
+ stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a"
+ }
+ else if (arch_type == 2){
+ stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942"
+ }
+ }
+ catch(Exception err){
+ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+ }
+ }
+ if (params.RUN_CK_TILE_GEMM_TESTS){
+ try{
+ archiveArtifacts "perf_tile_gemm_**.log"
+ if (arch_type == 1){
+ stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a"
+ }
+ else if (arch_type == 2){
+ stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942"
+ }
+ }
+ catch(Exception err){
+ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+ }
+ }
+}
+
+def buildHipClangJob(Map conf=[:]){
+ show_node_info()
+
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+
+ def image
+ if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
+ image = conf.get("docker_name", "")
+ echo "Using legacy docker: ${image}"
+ }
+ else{
+ image = getDockerImageName()
+ echo "Using default docker: ${image}"
+ }
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg CK_SCCACHE='${env.CK_SCCACHE}' --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+ }
+ def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
+ def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
+ dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
+ echo "Docker flags: ${dockerOpts}"
+
+ def variant = env.STAGE_NAME
+
+ def retimage
+ (retimage, image) = getDockerImage(conf)
+
+ gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 20, unit: 'HOURS')
+ {
+ cmake_build(conf)
+ }
+ }
+ }
+ return retimage
+}
+
+def reboot(){
+ build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
+}
+
+def buildHipClangJobAndReboot(Map conf=[:]){
+ try{
+ buildHipClangJob(conf)
+ }
+ catch(e){
+ echo "throwing error exception for the stage"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ if (!conf.get("no_reboot", false)) {
+ reboot()
+ }
+ }
+}
+
+def Build_CK(Map conf=[:]){
+ show_node_info()
+
+ env.HSA_ENABLE_SDMA=0
+ env.DOCKER_BUILDKIT=1
+ checkout scm
+
+ def image
+ if ( params.BUILD_LEGACY_OS && conf.get("docker_name", "") != "" ){
+ image = conf.get("docker_name", "")
+ echo "Using legacy docker: ${image}"
+ }
+ else{
+ image = getDockerImageName()
+ echo "Using default docker: ${image}"
+ }
+
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline" || params.COMPILER_COMMIT != ""){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+ }
+ if(params.BUILD_LEGACY_OS){
+ dockerOpts = dockerOpts + " --env LD_LIBRARY_PATH='/opt/Python-3.8.13/lib' "
+ }
+ def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
+ def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
+ dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
+ echo "Docker flags: ${dockerOpts}"
+
+ def variant = env.STAGE_NAME
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+ try {
+ (retimage, image) = getDockerImage(conf)
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 2, unit: 'MINUTES'){
+ sh 'rocminfo | tee rocminfo.log'
+ if ( !runShell('grep -n "gfx" rocminfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
+ }
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 20, unit: 'HOURS')
+ {
+ //check whether to run performance tests on this node
+ def arch_type = 0
+ sh 'rocminfo | tee rocminfo.log'
+ if ( runShell('grep -n "gfx90a" rocminfo.log') ){
+ arch_type = 1
+ }
+ else if ( runShell('grep -n "gfx942" rocminfo.log') ) {
+ arch_type = 2
+ }
+ else if ( runShell('grep -n "gfx10" rocminfo.log') ) {
+ arch_type = 3
+ }
+ else if ( runShell('grep -n "gfx11" rocminfo.log') ) {
+ arch_type = 4
+ }
+ else if ( runShell('grep -n "gfx12" rocminfo.log') ) {
+ arch_type = 5
+ }
+ else if ( runShell('grep -n "gfx908" rocminfo.log') ) {
+ arch_type = 6
+ }
+ cmake_build(conf)
+ if ( params.RUN_INDUCTOR_TESTS && !params.BUILD_LEGACY_OS && arch_type == 1 ){
+ echo "Run inductor codegen tests"
+ sh """
+ python3 -m venv ${env.WORKSPACE}
+ . ${env.WORKSPACE}/bin/activate
+ python3 -m pip install pytest build setuptools setuptools_scm
+ python3 -m pip install .
+ python3 -m pytest python/test/test_gen_instances.py
+ """
+ }
+ dir("build"){
+ if (params.RUN_FULL_QA && arch_type == 1 ){
+ // build deb packages for all gfx9 targets on gfx90a system and prepare to export
+ echo "Build ckProfiler package"
+ sh 'make -j package'
+ archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb'
+ sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb'
+ stash includes: "ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb"
+ }
+ }
+ // run performance tests, stash the logs, results will be processed on the master node
+ dir("script"){
+ if (params.RUN_PERFORMANCE_TESTS){
+ if (params.RUN_FULL_QA && arch_type == 1){
+ // run full tests on gfx90a
+ echo "Run full performance tests"
+ sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+ archiveArtifacts "perf_gemm.log"
+ archiveArtifacts "perf_resnet50_N256.log"
+ archiveArtifacts "perf_resnet50_N4.log"
+ archiveArtifacts "perf_batched_gemm.log"
+ archiveArtifacts "perf_grouped_gemm.log"
+ archiveArtifacts "perf_grouped_conv_fwd.log"
+ archiveArtifacts "perf_grouped_conv_bwd_data.log"
+ archiveArtifacts "perf_grouped_conv_bwd_weight.log"
+ archiveArtifacts "perf_gemm_bilinear.log"
+ archiveArtifacts "perf_reduction.log"
+ archiveArtifacts "perf_splitK_gemm.log"
+ archiveArtifacts "perf_onnx_gemm.log"
+ archiveArtifacts "perf_mixed_gemm.log"
+ stash includes: "perf_**.log", name: "perf_log"
+ }
+ else if ( arch_type == 1 ){
+ // run standard tests on gfx90a
+ echo "Run performance tests"
+ sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+ archiveArtifacts "perf_gemm.log"
+ archiveArtifacts "perf_onnx_gemm.log"
+ archiveArtifacts "perf_resnet50_N256.log"
+ archiveArtifacts "perf_resnet50_N4.log"
+ stash includes: "perf_**.log", name: "perf_log"
+ }
+ // disable performance tests on gfx1030 for now.
+ //else if ( arch_type == 3){
+ // run basic tests on gfx1030
+ // echo "Run gemm performance tests"
+ // sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10"
+ // archiveArtifacts "perf_onnx_gemm_gfx10.log"
+ // stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10"
+ //}
+ else if ( arch_type == 4){
+ // run basic tests on gfx11
+ echo "Run gemm performance tests"
+ sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11"
+ archiveArtifacts "perf_onnx_gemm_gfx11.log"
+ stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11"
+ }
+ else if ( arch_type == 5 ){
+ // run basic tests on gfx12
+ echo "Run gemm performance tests"
+ sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
+ archiveArtifacts "perf_onnx_gemm_gfx12.log"
+ stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
+ }
+ else if ( arch_type == 6 ){
+ // run basic tests on gfx908
+ echo "Run performance tests"
+ sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908"
+ archiveArtifacts "perf_onnx_gemm_gfx908.log"
+ stash includes: "perf_onnx_gemm_gfx908.log", name: "perf_log_gfx908"
+ }
+ }
+ }
+ if (params.hipTensor_test && arch_type == 1 ){
+ // build and test hipTensor on gfx90a node
+ sh """#!/bin/bash
+ rm -rf "${params.hipTensor_branch}".zip
+ rm -rf hipTensor-"${params.hipTensor_branch}"
+ wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip
+ unzip -o "${params.hipTensor_branch}".zip
+ """
+ dir("hipTensor-${params.hipTensor_branch}"){
+ sh """#!/bin/bash
+ mkdir -p build
+ ls -ltr
+ CC=hipcc CXX=hipcc cmake -Bbuild . -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install"
+ cmake --build build -- -j
+ ctest --test-dir build
+ """
+ }
+ }
+ }
+ }
+ }
+ return retimage
+}
+
+def Build_CK_and_Reboot(Map conf=[:]){
+ try{
+ Build_CK(conf)
+ }
+ catch(e){
+ echo "throwing error exception while building CK"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ if (!conf.get("no_reboot", false)) {
+ reboot()
+ }
+ }
+}
+
+def process_results(Map conf=[:]){
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+ def image = getDockerImageName()
+ def prefixpath = "/opt/rocm"
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+
+ def variant = env.STAGE_NAME
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+ try {
+ (retimage, image) = getDockerImage(conf)
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ }
+
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 15, unit: 'MINUTES'){
+ try{
+ dir("script"){
+ if (params.RUN_CK_TILE_FMHA_TESTS){
+ try{
+ unstash "perf_fmha_log_gfx942"
+ unstash "perf_fmha_log_gfx90a"
+ }
+ catch(Exception err){
+ echo "could not locate the FMHA performance logs: ${err.getMessage()}."
+ }
+ }
+ if (params.RUN_CK_TILE_GEMM_TESTS){
+ try{
+ unstash "perf_tile_gemm_log_gfx942"
+ unstash "perf_tile_gemm_log_gfx90a"
+ }
+ catch(Exception err){
+ echo "could not locate the GEMM performance logs: ${err.getMessage()}."
+ }
+ }
+ if (params.RUN_FULL_QA){
+ // unstash perf files to master
+ unstash "ckprofiler_0.2.0_amd64.deb"
+ sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/"
+ unstash "perf_log"
+ try{
+ unstash "perf_log_gfx11"
+ unstash "perf_log_gfx12"
+ }
+ catch(Exception err){
+ echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
+ }
+ sh "./process_qa_data.sh"
+ }
+ else{
+ // unstash perf files to master
+ unstash "perf_log"
+ try{
+ unstash "perf_log_gfx11"
+ unstash "perf_log_gfx12"
+ }
+ catch(Exception err){
+ echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}."
+ }
+ sh "./process_perf_data.sh"
+ }
+ }
+ }
+ catch(e){
+ echo "Throwing error exception while processing performance test results"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ echo "Finished processing performance test results"
+ }
+ }
+ }
+}
+
+//scheduled develop-branch runs (UT): 23:00 full QA, 21:00 hipTensor/codegen/gfx908, 19:00 amd-staging compiler, 17:00 amd-mainline compiler, 15:00 instances-only, 13:00 legacy OS
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.4;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
+ 0 21 * * * % ROCMVERSION=6.4;hipTensor_test=true;RUN_CODEGEN_TESTS=true;BUILD_GFX908=true
+ 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+ 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+ 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
+ 0 13 * * * % BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false''' : ""
+
+pipeline {
+ agent none
+ triggers {
+ parameterizedCron(CRON_SETTINGS)
+ }
+ options {
+ parallelsAlwaysFailFast()
+ }
+ parameters {
+ booleanParam(
+ name: "BUILD_DOCKER",
+ defaultValue: false,
+ description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
+ string(
+ name: 'USE_CUSTOM_DOCKER',
+ defaultValue: '',
+ description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
+ string(
+ name: 'ROCMVERSION',
+ defaultValue: '6.4',
+ description: 'Specify which ROCM version to use: 6.4 (default).')
+ string(
+ name: 'COMPILER_VERSION',
+ defaultValue: '',
+ description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).')
+ string(
+ name: 'COMPILER_COMMIT',
+ defaultValue: '',
+ description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit (default), or use some specific commit of llvm-project branch.')
+ string(
+ name: 'BUILD_COMPILER',
+ defaultValue: '/opt/rocm/llvm/bin/clang++',
+ description: 'Build CK with /opt/rocm/bin/hipcc, /llvm-project/build/bin/clang++, or with /opt/rocm/llvm/bin/clang++ (default).')
+ booleanParam(
+ name: "RUN_FULL_QA",
+ defaultValue: false,
+ description: "Select whether to run small set of performance tests (default) or full QA")
+ booleanParam(
+ name: "DISABLE_DL_KERNELS",
+ defaultValue: false,
+ description: "Select whether to build DL kernels (default: OFF)")
+ booleanParam(
+ name: "hipTensor_test",
+ defaultValue: false,
+ description: "Use the CK build to verify hipTensor build and tests (default: OFF)")
+ string(
+ name: 'hipTensor_branch',
+ defaultValue: 'mainline',
+ description: 'Specify which branch of hipTensor to use (default: mainline)')
+ booleanParam(
+ name: "USE_SCCACHE",
+ defaultValue: true,
+ description: "Use the sccache for building CK (default: ON)")
+ booleanParam(
+ name: "RUN_CPPCHECK",
+ defaultValue: false,
+ description: "Run the cppcheck static analysis (default: OFF)")
+ booleanParam(
+ name: "RUN_PERFORMANCE_TESTS",
+ defaultValue: true,
+ description: "Run the performance tests (default: ON)")
+ booleanParam(
+ name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
+ defaultValue: false,
+ description: "Run the grouped conv large cases tests (default: OFF)")
+ booleanParam(
+ name: "RUN_CODEGEN_TESTS",
+ defaultValue: false,
+ description: "Run codegen tests (default: OFF)")
+ booleanParam(
+ name: "RUN_CK_TILE_FMHA_TESTS",
+ defaultValue: false,
+ description: "Run the ck_tile FMHA tests (default: OFF)")
+ booleanParam(
+ name: "RUN_CK_TILE_GEMM_TESTS",
+ defaultValue: false,
+ description: "Run the ck_tile GEMM tests (default: OFF)")
+ booleanParam(
+ name: "BUILD_INSTANCES_ONLY",
+ defaultValue: false,
+ description: "Test building instances for various architectures simultaneously (default: OFF)")
+ booleanParam(
+ name: "BUILD_GFX908",
+ defaultValue: false,
+ description: "Build CK and run tests on gfx908 (default: OFF)")
+ booleanParam(
+ name: "BUILD_GFX12",
+ defaultValue: true,
+ description: "Build CK and run tests on gfx12 (default: ON)")
+ booleanParam(
+ name: "NINJA_BUILD_TRACE",
+ defaultValue: false,
+ description: "Generate a ninja build trace (default: OFF)")
+ booleanParam(
+ name: "BUILD_LEGACY_OS",
+ defaultValue: false,
+ description: "Try building CK with legacy OS dockers: RHEL8 and SLES15 (default: OFF)")
+ booleanParam(
+ name: "RUN_INDUCTOR_TESTS",
+ defaultValue: false,
+ description: "Run inductor codegen tests (default: OFF)")
+ }
+ environment{
+ dbuser = "${dbuser}"
+ dbpassword = "${dbpassword}"
+ dbsship = "${dbsship}"
+ dbsshport = "${dbsshport}"
+ dbsshuser = "${dbsshuser}"
+ dbsshpassword = "${dbsshpassword}"
+ ck_git_creds = "${ck_git_creds}"
+ gerrit_cred="${gerrit_cred}"
+ DOCKER_BUILDKIT = "1"
+ }
+ stages{
+ stage("Build Docker"){
+ parallel{
+ stage('Docker /opt/rocm'){
+ agent{ label rocmnode("nogpu") }
+ steps{
+ buildDocker('/opt/rocm')
+ cleanWs()
+ }
+ }
+ }
+ }
+ stage("Static checks") {
+ parallel{
+ stage('Clang Format and Cppcheck') {
+ when {
+ beforeAgent true
+ expression { params.RUN_CPPCHECK.toBoolean() }
+ }
+ agent{ label rocmnode("nogpu") }
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
+ -o -not -path \'*.git*\' -iname \'*.hpp\' \
+ -o -not -path \'*.git*\' -iname \'*.cpp\' \
+ -o -iname \'*.h.in\' \
+ -o -iname \'*.hpp.in\' \
+ -o -iname \'*.cpp.in\' \
+ -o -iname \'*.cl\' \
+ | grep -v 'build/' \
+ | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\' && \
+ /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \
+ -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 \
+ -D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \
+ -U __gfx803__ -U __gfx900__ -U __gfx906__ -U CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 \
+ --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+ archiveArtifacts "build/ck_cppcheck.log"
+ cleanWs()
+ }
+ }
+ stage('Clang Format') {
+ when {
+ beforeAgent true
+ expression { !params.RUN_CPPCHECK.toBoolean() }
+ }
+ agent{ label rocmnode("nogpu") }
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
+ -o -not -path \'*.git*\' -iname \'*.hpp\' \
+ -o -not -path \'*.git*\' -iname \'*.cpp\' \
+ -o -iname \'*.h.in\' \
+ -o -iname \'*.hpp.in\' \
+ -o -iname \'*.cpp.in\' \
+ -o -iname \'*.cl\' \
+ | grep -v 'build/' \
+ | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+ cleanWs()
+ }
+ }
+ }
+ }
+ stage("Run Grouped Conv Large Case Tests")
+ {
+ parallel
+ {
+ stage("Run Grouped Conv Large Case Tests on gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_GROUPED_CONV_LARGE_CASES_TESTS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a")}
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
+ make -j64 test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_bwd_data_xdl_large_cases && \
+ ./bin/test_grouped_convnd_fwd_large_cases_xdl && ./bin/test_grouped_convnd_bwd_data_xdl_large_cases"""
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ }
+ }
+ stage("Run Codegen Tests")
+ {
+ parallel
+ {
+ stage("Run Codegen Tests on gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_CODEGEN_TESTS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a")}
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+ make -j64 check"""
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ }
+ }
+ stage("Run CK_TILE_FMHA Tests")
+ {
+ parallel
+ {
+ stage("Run CK_TILE_FMHA Tests on gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a") }
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
+ make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
+ cd ../ &&
+ example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ stage("Run CK_TILE_FMHA Tests on gfx942")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_CK_TILE_FMHA_TESTS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx942") }
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
+ make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
+ cd ../ &&
+ example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ }
+ }
+ stage("Run CK_TILE_GEMM Tests")
+ {
+ parallel
+ {
+ stage("Run CK_TILE_GEMM Tests on gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a") }
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
+ make -j64 tile_example_gemm_universal && \
+ cd ../ &&
+ example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ stage("Run CK_TILE_GEMM Tests on gfx942")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx942") }
+ environment{
+ setup_args = "NO_CK_BUILD"
+ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
+ make -j64 tile_example_gemm_universal && \
+ cd ../ &&
+ example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ }
+ }
+
+ stage("Build CK and run Tests")
+ {
+ parallel
+ {
+ stage("Build CK with RHEL8")
+ {
+ when {
+ beforeAgent true
+ expression { params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a") }
+ environment{
+ def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_rhel8_rocm6.3"
+ setup_args = """ -DGPU_TARGETS="gfx942" \
+ -DCMAKE_CXX_FLAGS=" -O3 " \
+ -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
+ execute_args = " "
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
+ cleanWs()
+ }
+ }
+ stage("Build CK with SLES15")
+ {
+ when {
+ beforeAgent true
+ expression { params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a") }
+ environment{
+ def docker_name = "${env.CK_DOCKERHUB_PRIVATE}:ck_sles15_rocm6.3"
+ setup_args = """ -DGPU_TARGETS="gfx942" \
+ -DCMAKE_CXX_FLAGS=" -O3 " \
+ -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
+ execute_args = " "
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
+ cleanWs()
+ }
+ }
+ stage("Build CK for all gfx9 targets")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx942") }
+ environment{
+ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
+ -DGPU_TARGETS="gfx90a;gfx942" \
+ -DCMAKE_CXX_FLAGS=" -O3 " """
+ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+ -DGPU_TARGETS="gfx90a;gfx942" \
+ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ cleanWs()
+ }
+ }
+ stage("Build CK and run Tests on gfx908")
+ {
+ when {
+ beforeAgent true
+ expression { params.BUILD_GFX908.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx908") }
+ environment{
+ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908" -DCMAKE_CXX_FLAGS=" -O3 " """
+ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+ -DGPU_TARGETS="gfx908" \
+ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ cleanWs()
+ }
+ }
+ stage("Build CK and run Tests on gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx90a") }
+ environment{
+ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
+ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+ -DGPU_TARGETS="gfx90a" \
+ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ cleanWs()
+ }
+ }
+ stage("Build CK instances for different targets")
+ {
+ when {
+ beforeAgent true
+ expression { params.BUILD_INSTANCES_ONLY.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx942") }
+ environment{
+ execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
+ -D CMAKE_CXX_COMPILER="${build_compiler()}" \
+ -D CMAKE_BUILD_TYPE=Release \
+ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1151;gfx1201" \
+ -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+ cleanWs()
+ }
+ }
+ stage("Build CK and run Tests on gfx1030")
+ {
+ when {
+ beforeAgent true
+ expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx1030") }
+ environment{
+ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DCMAKE_CXX_FLAGS=" -O3 " """
+ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+ -DGPU_TARGETS="gfx1030" \
+ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ cleanWs()
+ }
+ }
+ stage("Build CK and run Tests on gfx1101")
+ {
+ when {
+ beforeAgent true
+ expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx1101") }
+ environment{
+ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DCMAKE_CXX_FLAGS=" -O3 " """
+ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+ -DGPU_TARGETS="gfx1101" \
+ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ cleanWs()
+ }
+ }
+ stage("Build CK and run Tests on gfx1201")
+ {
+ when {
+ beforeAgent true
+ expression { params.BUILD_GFX12.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent{ label rocmnode("gfx1201") }
+ environment{
+ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DCMAKE_CXX_FLAGS=" -O3 " """
+ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
+ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
+ -DGPU_TARGETS="gfx1201" \
+ -DCMAKE_CXX_COMPILER="${build_compiler()}" \
+ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ cleanWs()
+ }
+ }
+ }
+ }
+ stage("Process Performance Test Results")
+ {
+ parallel
+ {
+ stage("Process results"){
+ when {
+ beforeAgent true
+ expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+ }
+ agent { label 'mici' }
+ steps{
+ process_results()
+ cleanWs()
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000..68f6ae5746
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
+Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
+Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
+Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
+Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
+Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
+Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
+
+SPDX-License-Identifier: MIT
+Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..29d3d4e85a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,216 @@
+# Composable Kernel
+
+> [!NOTE]
+> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
+The Composable Kernel (CK) library provides a programming model for writing performance-critical
+kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
+uses general purpose kernel languages, such as HIP C++.
+
+CK uses two concepts to achieve performance portability and code maintainability:
+
+* A tile-based programming model
+* Algorithm complexity reduction for complex machine learning (ML) operators. This uses an innovative
+ technique called *Tensor Coordinate Transformation*.
+
+
+
+The current CK library is structured into four layers:
+
+* Templated Tile Operators
+* Templated Kernel and Invoker
+* Instantiated Kernel and Invoker
+* Client API
+
+
+
+## General information
+
+* [CK supported operations](include/ck/README.md)
+* [CK Tile supported operations](include/ck_tile/README.md)
+* [CK wrapper](client_example/25_wrapper/README.md)
+* [CK codegen](codegen/README.md)
+* [CK profiler](profiler/README.md)
+* [Examples (Custom use of CK supported operations)](example/README.md)
+* [Client examples (Use of CK supported operations with instance factory)](client_example/README.md)
+* [Terminology](/TERMINOLOGY.md)
+* [Contributors](/CONTRIBUTORS.md)
+
+CK is released under the **[MIT license](/LICENSE)**.
+
+## Building CK
+
+We recommend building CK inside Docker containers, which include all necessary packages. Pre-built
+Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composable_kernel/tags).
+
+1. To build a new Docker image, use the Dockerfile provided with the source code:
+
+ ```bash
+ DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
+ ```
+
+2. Launch the Docker container:
+
+ ```bash
+ docker run \
+ -it \
+ --privileged \
+ --group-add sudo \
+ -w /root/workspace \
+ -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
+ ck:latest \
+ /bin/bash
+ ```
+
+3. Clone CK source code from the GitHub repository and start the build:
+
+ ```bash
+ git clone https://github.com/ROCm/composable_kernel.git && \
+ cd composable_kernel && \
+ mkdir build && \
+ cd build
+ ```
+
+ You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s) you want
+ to run CK on. You can specify single or multiple architectures. If you specify multiple architectures,
+ use a semicolon between each; for example, `gfx908;gfx90a;gfx942`.
+
+ ```bash
+ cmake \
+ -D CMAKE_PREFIX_PATH=/opt/rocm \
+ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+ -D CMAKE_BUILD_TYPE=Release \
+ -D GPU_TARGETS="gfx908;gfx90a" \
+ ..
+ ```
+
+ If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets
+ supported by the current compiler (this may take a long time).
+ Tests and examples will only get built if the GPU_TARGETS is set by the user on the cmake command line.
+
+ NOTE: If you try setting `GPU_TARGETS` to a list of architectures, the build will only work if the
+ architectures are similar, e.g., `gfx908;gfx90a`, or `gfx1100;gfx1101;gfx1102`. Otherwise, if you
+ want to build the library for a list of different architectures,
+ you should use the `GPU_ARCHS` build argument, for example `GPU_ARCHS=gfx908;gfx1030;gfx1100;gfx942`.
+
+4. Build the entire CK library:
+
+ ```bash
+ make -j
+ ```
+
+5. Install CK:
+
+ ```bash
+ make -j install
+ ```
+ **[See Note on -j](#notes)**
+
+## Optional post-install steps
+
+* Build examples and tests:
+
+ ```bash
+ make -j examples tests
+ ```
+
+* Build and run all examples and tests:
+
+ ```bash
+ make -j check
+ ```
+
+ You can find instructions for running each individual example in [example](/example).
+
+* Build and run smoke/regression examples and tests:
+
+ ```bash
+ make -j smoke # tests and examples that run for < 30 seconds each
+ ```
+ ```bash
+ make -j regression # tests and examples that run for >= 30 seconds each
+ ```
+
+* Build ckProfiler:
+
+ ```bash
+ make -j ckProfiler
+ ```
+
+ You can find instructions for running ckProfiler in [profiler](/profiler).
+
+* Build our documentation locally:
+
+ ``` bash
+ cd docs
+ pip3 install -r sphinx/requirements.txt
+ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+ ```
+
+### Notes
+The `-j` option enables building with multiple threads in parallel, which speeds up the build significantly.
+However, `-j` launches an unlimited number of threads, which can cause the build to run out of memory and
+crash. On average, you should expect each thread to use ~2Gb of RAM.
+Depending on the number of CPU cores and the amount of RAM on your system, you may want to
+limit the number of threads. For example, if you have a 128-core CPU and 128 Gb of RAM it's advisable to use `-j32`.
+
+Additional cmake flags can be used to significantly speed-up the build:
+
+* `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build
+ instances of select data types only. The main default data types are fp32 and fp16; you can safely skip
+ other data types.
+
+* `DISABLE_DL_KERNELS` (default is OFF) must be set to ON in order not to build instances, such as `gemm_dl` or
+ `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most
+ other platforms have faster instances, such as `xdl` or `wmma`, available.
+
+* `DISABLE_DPP_KERNELS` (default is OFF) must be set to ON in order not to build instances, such as `gemm_dpp`.
+ These instances offer a slightly better performance of fp16 gemms on NAVI2x. But on other architectures faster alternatives are available.
+
+* `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances,
+ such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on
+ architectures like the MI100/MI200 for the functional support only.
+
+## Using sccache for building
+
+The default CK Docker images come with a pre-installed version of sccache, which supports clang
+being used as hip-compiler (" -x hip"). Using sccache can help reduce the time to re-build code from
+hours to 1-2 minutes. In order to invoke sccache, you need to run:
+
+```bash
+ sccache --start-server
+```
+
+then add the following flags to the cmake command line:
+
+```bash
+ -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache
+```
+
+You may need to clean up the build folder and repeat the cmake and make steps in order to take
+advantage of the sccache during subsequent builds.
+
+## Using CK as pre-built kernel library
+
+You can find instructions for using CK as a pre-built kernel library in [client_example](/client_example).
+
+## Contributing to CK
+
+When you contribute to CK, make sure you run `clang-format` on all changed files. We highly
+recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
+
+```bash
+sudo script/install_precommit.sh
+```
+
+With this approach, `pre-commit` adds the appropriate hooks to your local repository and
+automatically runs `clang-format` (and possibly additional checks) before any commit is created.
+
+If you need to uninstall hooks from the repository, you can do so by running the following command:
+
+```bash
+script/uninstall_precommit.sh
+```
+
+If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
+`git commit` command.
\ No newline at end of file
diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md
new file mode 100644
index 0000000000..e8833efb89
--- /dev/null
+++ b/TERMINOLOGY.md
@@ -0,0 +1,2 @@
+[Back to the main page](./README.md)
+# Composable Kernel terminology
\ No newline at end of file
diff --git a/client_example/01_gemm/CMakeLists.txt b/client_example/01_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..6c4103cda8
--- /dev/null
+++ b/client_example/01_gemm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_gemm gemm.cpp)
+target_link_libraries(client_gemm PRIVATE composable_kernel::device_other_operations composable_kernel::device_gemm_operations)
diff --git a/client_example/01_gemm/README.md b/client_example/01_gemm/README.md
new file mode 100644
index 0000000000..6dcd1e2959
--- /dev/null
+++ b/client_example/01_gemm/README.md
@@ -0,0 +1,126 @@
+[Back to supported operations](../../../include/ck/README.md)
+# Composable Kernel GEMM
+
+## GEMM
+General matrix multiplication operation. In CK, the GEMM operation is called `DeviceGemm` and requires the following types as template parameters:
+
+* **ALayout** - A matrix layout (RowMajor/ColumnMajor).
+* **BLayout** - B matrix layout (RowMajor/ColumnMajor).
+* **CLayout** - C matrix layout (RowMajor/ColumnMajor).
+* **ADataType** - A matrix data type.
+* **BDataType** - B matrix data type.
+* **CDataType** - C matrix data type.
+* **AElementwiseOperation** - Fused operation on tensor A before GEMM.
+* **BElementwiseOperation** - Fused operation on tensor B before GEMM.
+* **CElementwiseOperation** - Fused operation on tensor C after GEMM.
+
+For matrices with a large K dimension, the `DeviceGemmSplitK` implementation is available. This implementation allows the user to split the K dimension between work groups. It uses the `AtomicAdd` operation on global memory, so the output buffer must be zeroed out for correct results.
+
+For fused operations with additional tensor there are `DeviceGemmMultipleABD` or `DeviceGemmMultipleD` operation which require following parameters:
+* **DsLayout** - layouts for additional tensors for fused operations.
+* **DsDataType** - data types for additional tensors for fused operations.
+
+For `DeviceGemmMultipleABD` **ALayout**, **BLayout**, **ADataType** and **BDataType** user should pass a tuple.
+
+List of the device operations in CK:
+
+* **DeviceGemmDl** - Device operation with DL instructions.
+* **DeviceGemmDpp** - Device operation with DL instructions with DPP instructions during data load.
+* **DeviceGemmWmma_CShuffle** - Device operation with WMMA instructions with CShuffle optimization for more optimized data store.
+* **DeviceGemm_Xdl_CShuffle_LdsDirectLoad** - Device operation with XDL instructions and CShuffle optimization for more optimized data store and direct load from global memory to shared memory.
+* **DeviceGemm_Xdl_CShuffle** - Device operation with XDL instructions with CShuffle optimization for more optimized data store.
+* **DeviceGemm_Xdl_CShuffleV2** - Device operation with XDL instructions with CShuffle optimization for more optimized data store. GEMM pipeline has been optimized compared to **DeviceGemm_Xdl_CShuffle**.
+* **DeviceGemmXdlSkipBLds** - Device operation with XDL instructions. Load to shared memory has been skipped for B matrix.
+* **DeviceGemm_Xdl_WaveletModel_CShuffle** - Device operation with XDL instructions with CShuffle optimization for more optimized data store. Producer and consumer scheme cooperation between waves in workgroup.
+* **DeviceGemmXdl** - Device operation with XDL instructions.
+
+Table of supported cases by instance factory with XDL instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
+
+| |Is supported|
+|-------|---|
+|bf16|✓|
+|fp16|✓|
+|fp32|✓|
+|int8|✓|
+|fp8 |✓|
+
+Table of supported cases by instance factory with WMMA instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
+
+| |Is supported|
+|-------|---|
+|bf16|✓|
+|fp16|✓|
+|fp32|✗|
+|int8|✓|
+|fp8 |✗|
+
+Table of supported cases by instance factory with DL instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
+
+| |Is supported|
+|-------|---|
+|bf16|✗|
+|fp16|✓|
+|fp32|✓|
+|int8|✓|
+|fp8 |✗|
+
+Table of supported cases by instance factory with fused output elementwise operation:
+
+* **B Matrix Multiply + Add + Gelu** - bf16 (int8 for B matrix)
+* **B Matrix Multiply + Add** - bf16 (int8 for B matrix)
+* **B Matrix Multiply + Gelu** - bf16 (int8 for B matrix)
+* **B Matrix Multiply** - bf16 (int8 for B matrix)
+
+* **Add + Add + Gelu** - fp16
+* **Add + Gelu** - fp16, bf16 (int8 for B matrix) for Row/Column/Row
+* **Multiply** - fp16
+* **Add + Multiply** - fp16
+* **Add + Relu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+* **Add + Silu** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+* **Add** - fp16 (int8 for B matrix) for Row/Column/Row, bf16 (int8 for B matrix) for Row/Column/Row
+* **Bilinear** - fp16, int8
+* **Gelu** - fp16
+* **Multiply + Add** - fp16 for Row/Column/Row and Row/Row/Row, fp16 (int8 for B matrix, fp32 for Bias) for Row/Column/Row and Row/Row/Row,
+* **Quantization** - int8
+
+## GEMM V2 (Universal GEMM)
+General matrix multiplication operation optimized for the MI300 series. The operation is called `DeviceGemmV2` and requires the following types as template parameters:
+
+* **ALayout** - A matrix layout (RowMajor/ColumnMajor).
+* **BLayout** - B matrix layout (RowMajor/ColumnMajor).
+* **CLayout** - C matrix layout (RowMajor/ColumnMajor).
+* **ADataType** - A matrix data type.
+* **BDataType** - B matrix data type.
+* **CDataType** - C matrix data type.
+* **AElementwiseOperation** - Fused operation on tensor A before GEMM.
+* **BElementwiseOperation** - Fused operation on tensor B before GEMM.
+* **CElementwiseOperation** - Fused operation on tensor C after GEMM.
+
+This implementation allows user to split K dimension between work groups. This implementation requires AtomicAdd operation on global memory (output buffer must be set to zeroes if splitK parameter is larger than one).
+
+List of the device operations in CK:
+
+* **DeviceGemm_Xdl_CShuffleV3** - Device operation with XDL instructions with CShuffle optimization for more optimized data store.
+* **DeviceGemm_Xdl_CShuffleV3R1** - Device operation with XDL instructions with CShuffle optimization for more optimized data store. This implementation performs a reduction on the split K dimension after GEMM instead of using an AtomicAdd instruction.
+
+Table of supported cases by instance factory with XDL instruction for Row/Row/Row, Row/Column/Row, Column/Row/Row or Column/Column/Row:
+
+| |Is supported|
+|-------|---|
+|bf16|✓|
+|fp16|✓|
+|fp32|✗|
+|int8|✗|
+|fp8 (C bf16)|✓|
+|fp16 (A fp8)|✓|
+|fp16 (B fp8)|✓|
+
+## Others
+
+* **DeviceGemm_dequantB** - GEMM with dequantization (implemented with WMMA instructions).
+* **DeviceGemmMultipleD_ABScale** - GEMM with scale for A and B matrix.
+* **DeviceGemmMultipleDLayernorm** - GEMM fused with layernorm.
+* **DeviceGemmMultipleDMultipleR** - GEMM fused with reductions and custom global reductions operators.
+* **DeviceGemmReduce** - GEMM fused with reduction.
+* **DeviceGemm_Streamk_V2** - GEMM stream K implementation. Implementation allows to use reduction instead of AtomicAdd.
+* **DeviceGemmStreamK** - GEMM stream K implementation using AtomicAdd.
diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp
new file mode 100644
index 0000000000..e63cda6162
--- /dev/null
+++ b/client_example/01_gemm/gemm.cpp
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+using ADataType = F16;
+using BDataType = F16;
+using CDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 7)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideC = std::stoi(argv[6]);
+ }
+ else
+ {
+ printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+
+ using DeviceOp =
+        ck::tensor_operation::device::DeviceGemm<ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto c_element_op = CElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+ if(found)
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
new file mode 100644
index 0000000000..4ba86026b2
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
@@ -0,0 +1,29 @@
+if(GPU_TARGETS MATCHES "gfx9")
+ add_custom_target(client_gemm_fastgelu_examples)
+
+ add_executable(client_gemm_add_add_fastgelu gemm_add_add_fastgelu.cpp)
+ target_link_libraries(client_gemm_add_add_fastgelu PRIVATE composable_kernel::device_gemm_operations)
+
+ add_executable(client_gemm_add_fastgelu gemm_add_fastgelu.cpp)
+ target_link_libraries(client_gemm_add_fastgelu PRIVATE composable_kernel::device_gemm_operations)
+
+ add_executable(client_gemm_fastgelu gemm_fastgelu.cpp)
+ target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_gemm_operations)
+
+ add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu
+ client_gemm_fastgelu)
+
+ add_custom_target(client_gemm_fastgelu_generic_examples)
+
+ add_executable(client_gemm_add_add_fastgelu_generic gemm_add_add_fastgelu_generic.cpp)
+ target_link_libraries(client_gemm_add_add_fastgelu_generic PRIVATE composable_kernel::device_gemm_operations)
+
+ add_executable(client_gemm_add_fastgelu_generic gemm_add_fastgelu_generic.cpp)
+ target_link_libraries(client_gemm_add_fastgelu_generic PRIVATE composable_kernel::device_gemm_operations)
+
+ add_executable(client_gemm_fastgelu_generic gemm_fastgelu_generic.cpp)
+ target_link_libraries(client_gemm_fastgelu_generic PRIVATE composable_kernel::device_gemm_operations)
+
+ add_dependencies(client_gemm_fastgelu_generic_examples client_gemm_add_add_fastgelu_generic
+ client_gemm_add_fastgelu_generic client_gemm_fastgelu_generic)
+endif()
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
new file mode 100644
index 0000000000..5809681661
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideD1 = 4096;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 9)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideD0 = std::stoi(argv[6]);
+ StrideD1 = std::stoi(argv[7]);
+ StrideE = std::stoi(argv[8]);
+ }
+ else
+ {
+ printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+ SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+ f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+ ck::Tuple<D0Layout, D1Layout>,
+ ELayout,
+ ADataType,
+ BDataType,
+ ck::Tuple<D0DataType, D1DataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddAddFastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+ d1_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 2>{StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+ float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ if(found)
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+ d1_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 2>{StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
new file mode 100644
index 0000000000..3cc4313aab
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideD1 = 4096;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 9)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideD0 = std::stoi(argv[6]);
+ StrideD1 = std::stoi(argv[7]);
+ StrideE = std::stoi(argv[8]);
+ }
+ else
+ {
+ printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+ SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+ f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+ ck::Tuple<D0Layout, D1Layout>,
+ ELayout,
+ ADataType,
+ BDataType,
+ ck::Tuple<D0DataType, D1DataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddAddFastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ // get generic instance
+ auto& op_ptr = op_ptrs[0];
+
+ std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ // run the generic instance
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+ d1_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 2>{StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+ else
+ {
+ throw std::runtime_error(
+ "Generic instance should be suitable for various input lengths/strides");
+ }
+
+ std::cout << "Done" << std::endl;
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
new file mode 100644
index 0000000000..1fd80d10c7
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 8)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideD0 = std::stoi(argv[6]);
+ StrideE = std::stoi(argv[7]);
+ }
+ else
+ {
+ printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+ ck::Tuple<D0Layout>,
+ ELayout,
+ ADataType,
+ BDataType,
+ ck::Tuple<D0DataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddFastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 1>{StrideD0},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+ float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ if(found)
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 1>{StrideD0},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
new file mode 100644
index 0000000000..e54bcfd989
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 8)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideD0 = std::stoi(argv[6]);
+ StrideE = std::stoi(argv[7]);
+ }
+ else
+ {
+ printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+ ck::Tuple<D0Layout>,
+ ELayout,
+ ADataType,
+ BDataType,
+ ck::Tuple<D0DataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddFastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ // get generic instance
+ auto& op_ptr = op_ptrs[0];
+
+ std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ // run the generic instance
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 1>{StrideD0},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+ else
+ {
+ throw std::runtime_error(
+ "Generic instance should be suitable for various input lengths/strides");
+ }
+
+ std::cout << "Done" << std::endl;
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
new file mode 100644
index 0000000000..47fd58f691
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu = ck::tensor_operation::element_wise::FastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = FastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 7)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideE = std::stoi(argv[6]);
+ }
+ else
+ {
+ printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+ ck::Tuple<>,
+ ELayout,
+ ADataType,
+ BDataType,
+ ck::Tuple<>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::FastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ {},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ {},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+ float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ if(found)
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ {},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ {},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
new file mode 100644
index 0000000000..f43554f2bd
--- /dev/null
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu = ck::tensor_operation::element_wise::FastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = FastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 7)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideE = std::stoi(argv[6]);
+ }
+ else
+ {
+ printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+ ck::Tuple<>,
+ ELayout,
+ ADataType,
+ BDataType,
+ ck::Tuple<>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::FastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ // get generic instance
+ auto& op_ptr = op_ptrs[0];
+
+ std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ // run the generic instance
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ {},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ {},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+ else
+ {
+ throw std::runtime_error(
+ "Generic instance should be suitable for various input lengths/strides");
+ }
+
+ std::cout << "Done" << std::endl;
+
+ return 0;
+}
diff --git a/client_example/03_gemm_layernorm/CMakeLists.txt b/client_example/03_gemm_layernorm/CMakeLists.txt
new file mode 100644
index 0000000000..8fedc84635
--- /dev/null
+++ b/client_example/03_gemm_layernorm/CMakeLists.txt
@@ -0,0 +1,7 @@
+# GEMM + LayerNorm client examples; CK device instances exist for gfx9 only.
+if(GPU_TARGETS MATCHES "gfx9")
+ add_executable(client_gemm_add_add_layernorm_naive gemm_add_add_layernorm_naive.cpp)
+ target_link_libraries(client_gemm_add_add_layernorm_naive PRIVATE composable_kernel::device_gemm_operations composable_kernel::device_other_operations)
+
+ add_executable(client_gemm_add_relu_add_layernorm_welford gemm_add_relu_add_layernorm_welford.cpp)
+ target_link_libraries(client_gemm_add_relu_add_layernorm_welford PRIVATE composable_kernel::device_gemm_operations composable_kernel::device_other_operations)
+endif()
diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
new file mode 100644
index 0000000000..020f047d1a
--- /dev/null
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+using ADataType = F16;
+using BDataType = F16;
+using BiasDataType = F32;
+using CDataType = F16;
+using D0DataType = F16;
+using ReduceDataType = F32;
+using GammaDataType = F16;
+using BetaDataType = F16;
+using LayerNormOutDataType = F16;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+// RAII owner of a raw HIP device allocation: hipMalloc on construction,
+// hipFree on destruction. HIP return codes are deliberately discarded --
+// these client examples use the buffers only as uninitialized kernel scratch.
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ // Copying would double-free the owned device pointer, so forbid it.
+ SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+ SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+ // Allocate mem_size bytes of device memory.
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ // Restored `<void**>`: the template argument was lost in extraction.
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_; // owned device pointer (not null-checked here)
+};
+
+// Stage 1 of the naive layernorm pipeline: one fused kernel computing
+// C = elementwise(A*B + bias, D0) together with per-row reductions that are
+// divided by N to yield mean(C) and mean(C^2).
+// Returns true when p_op accepts the argument and the kernel was launched;
+// false when this instance rejects the problem.
+// NOTE(review): the template parameter list was stripped during extraction
+// (presumably `template <typename gemm_reduce_op_ptr>`) -- restore upstream.
+template
+bool RunDeviceGemmMeanSquareMean(gemm_reduce_op_ptr& p_op,
+ const void* p_a,
+ const void* p_b,
+ const void* p_bias,
+ const void* p_d0,
+ void* p_c,
+ void* p_mean,
+ void* p_square_mean,
+ int M,
+ int N,
+ int K,
+ int StrideA,
+ int StrideB,
+ int StrideC,
+ int StrideD0,
+ bool time_kernel)
+{
+ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+ using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
+ using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
+
+ auto passOp = PassThrough{};
+ auto squareOp = UnarySquareElementOp{};
+ // Dividing the reduced sums by N converts them into means.
+ auto divOp = UnaryDivElementOp{N};
+
+ auto argument_ptr =
+ p_op->MakeArgumentPointer(p_a,
+ p_b,
+ p_bias,
+ {p_d0},
+ p_c,
+ {p_mean, p_square_mean},
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ {StrideD0},
+ {&passOp, &passOp, &passOp}, // functor for a, b, c
+ {&passOp}, // functor for d0
+ {&passOp, &squareOp}, // functor for inputs of reduction
+ {&divOp, &divOp}); // functor for outputs of reduction
+
+ if(p_op->IsSupportedArgument(argument_ptr.get()))
+ {
+ auto invoker_ptr = p_op->MakeInvokerPointer();
+
+ // When timing, the reported output may be wrong: the reduction tensors
+ // must be re-initialized before every launch, but the timed loop
+ // relaunches the kernel without re-initializing them.
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+ if(time_kernel)
+ std::cout << "Gemm + reduce Perf: " << std::setw(10) << ave_time << " ms" << std::endl;
+
+ return true;
+ }
+
+ return false;
+}
+
+// Stage 2: 5-input elementwise layernorm application,
+// y = Normalize(x, mean, square_mean, gamma, beta), over an M x N matrix.
+// mean/square_mean broadcast along rows (strides {1, 0}); gamma/beta
+// broadcast along columns (strides {0, 1}).
+// NOTE(review): the template parameter list and the std::array element types
+// were stripped during extraction; restore from the upstream example.
+template
+bool RunDeviceNormalize2D(normalize_op_ptr& p_op,
+ const void* p_x,
+ const void* p_mean,
+ const void* p_square_mean,
+ const void* p_gamma,
+ const void* p_beta,
+ void* p_y,
+ int M,
+ int N,
+ int StrideX,
+ bool time_kernel)
+{
+ std::array input = {p_x, p_mean, p_square_mean, p_gamma, p_beta};
+ std::array output = {p_y};
+ // NOTE(review): normalize_functor is unused -- the functor is constructed
+ // again inline at the MakeArgumentPointer call below.
+ auto normalize_functor = ck::tensor_operation::element_wise::Normalize{};
+
+ std::array xyLengths = {M, N};
+ std::array xyStrides = {StrideX, 1};
+
+ auto argument_ptr = p_op->MakeArgumentPointer(xyLengths,
+ {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
+ {xyStrides},
+ input,
+ output,
+ ck::tensor_operation::element_wise::Normalize{});
+
+ if(p_op->IsSupportedArgument(argument_ptr.get()))
+ {
+ auto invoker_ptr = p_op->MakeInvokerPointer();
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+ if(time_kernel)
+ std::cout << "Normalize Perf: " << std::setw(10) << ave_time << " ms" << std::endl;
+
+ return true;
+ }
+
+ return false;
+}
+
+// Two-stage "naive" GEMM + layernorm driver:
+// (1) fused GEMM producing C plus row-wise mean and mean-of-squares,
+// (2) elementwise normalize consuming C, mean, square-mean, gamma, beta.
+// The first instance of each stage that accepts the problem is run.
+int main()
+{
+ ck::index_t M = 1024;
+ ck::index_t N = 1024;
+ ck::index_t K = 1024;
+
+ ck::index_t StrideA = 1024;
+ ck::index_t StrideB = 1024;
+ ck::index_t StrideC = 1024;
+ ck::index_t StrideD0 = 1024;
+
+ const auto gemm_reduce_ptrs =
+ ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances<
+ ADataType,
+ BDataType,
+ CDataType,
+ ALayout,
+ BLayout,
+ CLayout>();
+
+ std::cout << "found " << gemm_reduce_ptrs.size()
+ << " gemm_reduceMean_reduceSquareMean instances" << std::endl;
+
+ // NOTE(review): the ck::Tuple input/output type lists were stripped during
+ // extraction; restore from the upstream example before building.
+ using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
+ ck::Tuple,
+ ck::Tuple,
+ ck::tensor_operation::element_wise::Normalize,
+ 2>;
+
+ const auto normalize_ptrs =
+ ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ NormalizeDeviceOp>::GetInstances();
+
+ std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
+
+ // Number of elements spanned by an nRow x nCol matrix with the given
+ // leading-dimension stride, for either row- or column-major layout.
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem bias_device_buf(sizeof(BiasDataType) * N);
+ SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+ SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, CLayout{}));
+ SimpleDeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * M);
+ SimpleDeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) * M);
+ SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+ SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+ SimpleDeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * M * N);
+
+ bool b_time_kernel = true;
+ bool b_only_run_first_kernel = true;
+
+ // layernorm => (1) + (2)
+ // (1). c = gemm(a, b), reduce_mean(c), reduce_square_mean(c)
+ // (2). normalize(c, mean, square_mean, gamma, beta)
+ for(auto& gemm_reduce_ptr : gemm_reduce_ptrs)
+ {
+ // run first available kernel
+ if(RunDeviceGemmMeanSquareMean(gemm_reduce_ptr,
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ bias_device_buf.GetDeviceBuffer(),
+ d0_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ reduceMean_device_buf.GetDeviceBuffer(),
+ reduceMeanSquare_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ StrideD0,
+ b_time_kernel))
+ {
+ if(b_only_run_first_kernel)
+ break;
+ }
+ else
+ {
+ std::cout << gemm_reduce_ptr->GetTypeString() << " does not support this problem"
+ << std::endl;
+ }
+ }
+
+ // run first normalize instance that supports the problem
+ for(auto& normalize_ptr : normalize_ptrs)
+ {
+ if(RunDeviceNormalize2D(normalize_ptr,
+ c_device_buf.GetDeviceBuffer(),
+ reduceMean_device_buf.GetDeviceBuffer(),
+ reduceMeanSquare_device_buf.GetDeviceBuffer(),
+ gamma_device_buf.GetDeviceBuffer(),
+ beta_device_buf.GetDeviceBuffer(),
+ layerNorm_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ StrideC,
+ b_time_kernel))
+ {
+ if(b_only_run_first_kernel)
+ break;
+ }
+ else
+ {
+ std::cout << normalize_ptr->GetTypeString() << " does not support this problem"
+ << std::endl;
+ }
+ }
+}
diff --git a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
new file mode 100644
index 0000000000..7d5ef5f9bf
--- /dev/null
+++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using F16 = ck::half_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd;
+
+// DataType
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using GammaDataType = F16;
+using BetaDataType = F16;
+using HDataType = F16;
+
+// Layout
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using HLayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddReluAdd;
+using HElementOp = PassThrough;
+
+// RAII owner of a raw HIP device allocation with a helper to zero it
+// (hipMemset) before kernel runs. HIP return codes are deliberately
+// discarded, matching the other client examples.
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ // Copying would double-free the owned device pointer, so forbid it.
+ SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+ SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+ // Allocate mem_size bytes of device memory.
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
+ {
+ // Restored `<void**>`: the template argument was lost in extraction.
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ // Zero the whole buffer; used to reset the output between kernel runs.
+ void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_; // owned device pointer
+ std::size_t mMemSize_; // allocation size in bytes (needed by SetZero)
+};
+
+// Single-kernel GEMM + AddReluAdd + layernorm (Welford) driver: profiles
+// every available instance and then re-runs the fastest one untimed.
+// NOTE(review): argc/argv are accepted but unused.
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 1024;
+ ck::index_t N = 1024;
+ ck::index_t K = 1024;
+
+ ck::index_t StrideA = K;
+ ck::index_t StrideB = K;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideD1 = N;
+ ck::index_t StrideH = N;
+
+ float epsilon = 1e-5;
+
+ // Number of elements spanned by an nRow x nCol matrix with the given
+ // leading-dimension stride, for either row- or column-major layout.
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+ if constexpr(std::is_same::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+ SimpleDeviceMem d1_device_buf(sizeof(D1DataType) *
+ f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+ SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+ SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+ SimpleDeviceMem h_device_buf(sizeof(HDataType) * f_matrix_space_size(M, N, StrideH, HLayout{}));
+
+ // NOTE(review): the ck::Tuple type arguments were stripped during
+ // extraction (upstream lists D0Layout/D1Layout and D0DataType/D1DataType).
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleDLayernorm<
+ ALayout,
+ BLayout,
+ ck::Tuple,
+ HLayout,
+ ADataType,
+ BDataType,
+ ck::Tuple,
+ GammaDataType,
+ BetaDataType,
+ HDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddReluAdd,
+ ck::tensor_operation::element_wise::PassThrough>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+ const auto h_element_op = HElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ // NOTE(review): numeric_limits template argument (likely <float>) was
+ // stripped during extraction.
+ float best_ave_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ // NOTE(review): int index vs unsigned op_ptrs.size() trips sign-compare
+ // warnings; harmless at these sizes.
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+ gamma_device_buf.GetDeviceBuffer(),
+ beta_device_buf.GetDeviceBuffer(),
+ h_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ {StrideD0, StrideD1},
+ StrideH,
+ epsilon,
+ a_element_op,
+ b_element_op,
+ cde_element_op,
+ h_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+ SimpleDeviceMem workspace_dev(workspace_sz);
+ op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+ h_device_buf.SetZero();
+
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ // bytes moved: A + B + (D0, D1, H) + (gamma, beta)
+ std::size_t num_byte =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ (sizeof(D0DataType) + sizeof(D1DataType) + sizeof(HDataType)) * M * N +
+ (sizeof(GammaDataType) + sizeof(BetaDataType)) * N;
+
+ float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+ << op_name << std::endl;
+
+ if(ave_time < best_ave_time)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+ << best_op_name << std::endl;
+
+ // run the best instance
+ if(found)
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+ gamma_device_buf.GetDeviceBuffer(),
+ beta_device_buf.GetDeviceBuffer(),
+ h_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ {StrideD0, StrideD1},
+ StrideH,
+ epsilon,
+ a_element_op,
+ b_element_op,
+ cde_element_op,
+ h_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+ SimpleDeviceMem workspace_dev(workspace_sz);
+ op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+ h_device_buf.SetZero();
+
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/client_example/04_contraction/CMakeLists.txt b/client_example/04_contraction/CMakeLists.txt
new file mode 100644
index 0000000000..13c0375846
--- /dev/null
+++ b/client_example/04_contraction/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Contraction client examples; CK device instances exist for gfx9 only.
+if(GPU_TARGETS MATCHES "gfx9")
+ add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp)
+ target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
+
+ add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp)
+ target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
+
+ add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp)
+ target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
+
+ add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp)
+ target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
+
+ # NOTE(review): target below lacks the client_ prefix used by its siblings;
+ # not renamed here since CI scripts may reference the existing target name.
+ add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp)
+ target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_other_operations composable_kernel::device_contraction_operations composable_kernel::device_gemm_operations)
+endif()
diff --git a/client_example/04_contraction/contraction_bilinear_fp32.cpp b/client_example/04_contraction/contraction_bilinear_fp32.cpp
new file mode 100644
index 0000000000..f1881e60a0
--- /dev/null
+++ b/client_example/04_contraction/contraction_bilinear_fp32.cpp
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Bilinear;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DDataType = F32;
+using DsDataType = ck::Tuple;
+using EDataType = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// RAII owner of a raw HIP device allocation: hipMalloc on construction,
+// hipFree on destruction. HIP return codes are deliberately discarded --
+// these client examples use the buffers only as uninitialized kernel scratch.
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ // Copying would double-free the owned device pointer, so forbid it.
+ SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+ SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+ // Allocate mem_size bytes of device memory.
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ // Restored `<void**>`: the template argument was lost in extraction.
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_; // owned device pointer (not null-checked here)
+};
+
+// FP32 bilinear contraction client example over 2M x 2N x 2K dims:
+// E[m0,m1,n0,n1] = alpha * contract(A[m0,m1,k0,k1], B[n0,n1,k0,k1])
+// + beta * D[m0,m1,n0,n1].
+// Profiles all instances and reports the best by TFlops.
+// NOTE(review): extraction stripped template arguments in this function
+// (std::vector<ck::index_t>, std::array element types, ck::accumulate_n<...>,
+// static_cast<float>); restore from upstream before building.
+int main(int argc, char* argv[])
+{
+ // A[M0, M1, K0, K1]
+ std::vector a_ms_ks_lengths{30, 128, 32, 64};
+ std::vector a_ms_ks_strides{524288, 4096, 128, 1};
+ // B[N0, N1, K0, K1]
+ std::vector b_ns_ks_lengths{32, 64, 32, 64};
+ std::vector b_ns_ks_strides{524288, 4096, 128, 1};
+ // D[M0, M1, N0, N1]
+ std::vector d_ms_ns_lengths{30, 128, 32, 64};
+ std::vector d_ms_ns_strides{524288, 4096, 128, 1};
+ // E[M0, M1, N0, N1]
+ std::vector e_ms_ns_lengths{30, 128, 32, 64};
+ std::vector e_ms_ns_strides{524288, 4096, 128, 1};
+
+ float alpha = 1.f;
+ float beta = 1.f;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 25)
+ {
+ const ck::index_t M0 = std::stoi(argv[1]);
+ const ck::index_t M1 = std::stoi(argv[2]);
+
+ const ck::index_t N0 = std::stoi(argv[3]);
+ const ck::index_t N1 = std::stoi(argv[4]);
+
+ const ck::index_t K0 = std::stoi(argv[5]);
+ const ck::index_t K1 = std::stoi(argv[6]);
+
+ a_ms_ks_lengths = {M0, M1, K0, K1};
+ a_ms_ks_strides = {
+ std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+ b_ns_ks_lengths = {N0, N1, K0, K1};
+ b_ns_ks_strides = {
+ std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+ d_ms_ns_lengths = {M0, M1, N0, N1};
+ d_ms_ns_strides = {
+ std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+ e_ms_ns_lengths = {M0, M1, N0, N1};
+ e_ms_ns_strides = {
+ std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
+
+ alpha = std::stof(argv[23]);
+ beta = std::stof(argv[24]);
+ }
+ else
+ {
+ printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+ printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+ printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+ printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+ printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+ printf("arg23 to 24: alpha, beta\n");
+ exit(0);
+ }
+
+ // Upper bound of addressed elements: 1 + sum_i (length_i - 1) * stride_i.
+ auto f_tensor_space_size = [](auto lengths, auto strides) {
+ std::size_t space_size = 1;
+ for(std::size_t i = 0; i < lengths.size(); ++i)
+ {
+ space_size += (lengths[i] - 1) * strides[i];
+ }
+ return space_size;
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+ f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+ f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+ SimpleDeviceMem d_device_buf(sizeof(DDataType) *
+ f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+ f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+ // NOTE(review): the ck::Tuple type argument (likely DDataType) was
+ // stripped during extraction.
+ using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+ NumDimM,
+ NumDimN,
+ NumDimK,
+ ADataType,
+ BDataType,
+ ck::Tuple,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::Bilinear>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{alpha, beta};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array{d_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ a_ms_ks_lengths,
+ a_ms_ks_strides,
+ b_ns_ks_lengths,
+ b_ns_ks_strides,
+ std::array, 1>{d_ms_ns_lengths},
+ std::array, 1>{d_ms_ns_strides},
+ e_ms_ns_lengths,
+ e_ms_ns_strides,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ // Collapse the multi-index dims into flat M, N, K for FLOP counting.
+ ck::index_t M = ck::accumulate_n(
+ e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+ ck::index_t N = ck::accumulate_n(
+ e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+ ck::index_t K = ck::accumulate_n(
+ a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ return 0;
+}
diff --git a/client_example/04_contraction/contraction_bilinear_fp64.cpp b/client_example/04_contraction/contraction_bilinear_fp64.cpp
new file mode 100644
index 0000000000..8b499eee21
--- /dev/null
+++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F64 = double;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Bilinear;
+
+using ADataType = F64;
+using BDataType = F64;
+using AccDataType = F64;
+using CShuffleDataType = F64;
+using DDataType = F64;
+using DsDataType = ck::Tuple;
+using EDataType = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// RAII owner of a raw HIP device allocation: hipMalloc on construction,
+// hipFree on destruction. HIP return codes are deliberately discarded --
+// these client examples use the buffers only as uninitialized kernel scratch.
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ // Copying would double-free the owned device pointer, so forbid it.
+ SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+ SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+ // Allocate mem_size bytes of device memory.
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ // Restored `<void**>`: the template argument was lost in extraction.
+ (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_; // owned device pointer (not null-checked here)
+};
+
+// FP64 bilinear contraction client example; same structure as the fp32
+// variant. The preprocessor tree below selects one of four A/B layout
+// variants (kknn / knnn / mknn / mnnn) via hard-coded stride sets.
+// NOTE(review): extraction stripped template arguments in this function
+// (std::vector<ck::index_t>, std::array element types, ck::accumulate_n<...>,
+// static_cast<float>); restore from upstream before building.
+int main(int argc, char* argv[])
+{
+// kknn
+#if 1
+ // A[M0, M1, K0, K1]
+ std::vector a_ms_ks_lengths{30, 128, 32, 64};
+ std::vector a_ms_ks_strides{524288, 4096, 128, 1};
+ // B[N0, N1, K0, K1]
+ std::vector b_ns_ks_lengths{32, 64, 32, 64};
+ std::vector b_ns_ks_strides{524288, 4096, 128, 1};
+ // D[M0, M1, N0, N1]
+ std::vector d_ms_ns_lengths{30, 128, 32, 64};
+ std::vector d_ms_ns_strides{524288, 4096, 128, 1};
+ // E[M0, M1, N0, N1]
+ std::vector e_ms_ns_lengths{30, 128, 32, 64};
+ std::vector e_ms_ns_strides{524288, 4096, 128, 1};
+// knnn
+#elif 0
+ // A[M0, M1, K0, K1]
+ std::vector a_ms_ks_lengths{30, 128, 32, 64};
+ std::vector a_ms_ks_strides{524288, 4096, 128, 1};
+ // B[N0, N1, K0, K1]
+ std::vector b_ns_ks_lengths{32, 64, 32, 64};
+ std::vector b_ns_ks_strides{64, 1, 131072, 2048};
+ // D[M0, M1, N0, N1]
+ std::vector d_ms_ns_lengths{30, 128, 32, 64};
+ std::vector d_ms_ns_strides{524288, 4096, 128, 1};
+ // E[M0, M1, N0, N1]
+ std::vector e_ms_ns_lengths{30, 128, 32, 64};
+ std::vector e_ms_ns_strides{524288, 4096, 128, 1};
+// mknn
+#elif 0
+ // A[M0, M1, K0, K1]
+ std::vector a_ms_ks_lengths{30, 128, 32, 64};
+ std::vector a_ms_ks_strides{128, 1, 245760, 3840};
+ // B[N0, N1, K0, K1]
+ std::vector b_ns_ks_lengths{32, 64, 32, 64};
+ std::vector b_ns_ks_strides{524288, 4096, 128, 1};
+ // D[M0, M1, N0, N1]
+ std::vector d_ms_ns_lengths{30, 128, 32, 64};
+ std::vector d_ms_ns_strides{524288, 4096, 128, 1};
+ // E[M0, M1, N0, N1]
+ std::vector e_ms_ns_lengths{30, 128, 32, 64};
+ std::vector e_ms_ns_strides{524288, 4096, 128, 1};
+// mnnn
+#elif 0
+ // A[M0, M1, K0, K1]
+ std::vector a_ms_ks_lengths{30, 128, 32, 64};
+ std::vector a_ms_ks_strides{128, 1, 245760, 3840};
+ // B[N0, N1, K0, K1]
+ std::vector b_ns_ks_lengths{32, 64, 32, 64};
+ std::vector b_ns_ks_strides{64, 1, 131072, 2048};
+ // D[M0, M1, N0, N1]
+ std::vector d_ms_ns_lengths{30, 128, 32, 64};
+ std::vector d_ms_ns_strides{524288, 4096, 128, 1};
+ // E[M0, M1, N0, N1]
+ std::vector e_ms_ns_lengths{30, 128, 32, 64};
+ std::vector e_ms_ns_strides{524288, 4096, 128, 1};
+#endif
+
+ float alpha = 1.f;
+ float beta = 1.f;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 25)
+ {
+ const ck::index_t M0 = std::stoi(argv[1]);
+ const ck::index_t M1 = std::stoi(argv[2]);
+
+ const ck::index_t N0 = std::stoi(argv[3]);
+ const ck::index_t N1 = std::stoi(argv[4]);
+
+ const ck::index_t K0 = std::stoi(argv[5]);
+ const ck::index_t K1 = std::stoi(argv[6]);
+
+ a_ms_ks_lengths = {M0, M1, K0, K1};
+ a_ms_ks_strides = {
+ std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+ b_ns_ks_lengths = {N0, N1, K0, K1};
+ b_ns_ks_strides = {
+ std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+ d_ms_ns_lengths = {M0, M1, N0, N1};
+ d_ms_ns_strides = {
+ std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+ e_ms_ns_lengths = {M0, M1, N0, N1};
+ e_ms_ns_strides = {
+ std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
+
+ alpha = std::stof(argv[23]);
+ beta = std::stof(argv[24]);
+ }
+ else
+ {
+ printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+ printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+ printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+ printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+ printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+ printf("arg23 to 24: alpha, beta\n");
+ exit(0);
+ }
+
+ // Upper bound of addressed elements: 1 + sum_i (length_i - 1) * stride_i.
+ auto f_tensor_space_size = [](auto lengths, auto strides) {
+ std::size_t space_size = 1;
+ for(std::size_t i = 0; i < lengths.size(); ++i)
+ {
+ space_size += (lengths[i] - 1) * strides[i];
+ }
+ return space_size;
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+ f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+ f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+ SimpleDeviceMem d_device_buf(sizeof(DDataType) *
+ f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+ f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+ // NOTE(review): the ck::Tuple type argument (likely DDataType) was
+ // stripped during extraction.
+ using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+ NumDimM,
+ NumDimN,
+ NumDimK,
+ ADataType,
+ BDataType,
+ ck::Tuple,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::Bilinear>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{alpha, beta};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array{d_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ a_ms_ks_lengths,
+ a_ms_ks_strides,
+ b_ns_ks_lengths,
+ b_ns_ks_strides,
+ std::array, 1>{d_ms_ns_lengths},
+ std::array, 1>{d_ms_ns_strides},
+ e_ms_ns_lengths,
+ e_ms_ns_strides,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ // Collapse the multi-index dims into flat M, N, K for FLOP counting.
+ ck::index_t M = ck::accumulate_n(
+ e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+ ck::index_t N = ck::accumulate_n(
+ e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+ ck::index_t K = ck::accumulate_n(
+ a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ return 0;
+}
diff --git a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
new file mode 100644
index 0000000000..a5ef40a2dc
--- /dev/null
+++ b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Add = ck::tensor_operation::element_wise::Add;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Add;
+
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F16;
+using DDataType = F16;
+using DsDataType = ck::Tuple<DDataType>;
+using EDataType = F16;
+
+static constexpr ck::index_t NumDimG = 1;
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 3;
+static constexpr ck::index_t NumDimK = 1;
+
+// RAII owner of a raw HIP device allocation; freed automatically on scope exit.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Non-copyable: copying would cause a double hipFree of the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Allocate mem_size bytes on the device. Allocation failure is ignored
+    // (p_mem_ stays null) -- acceptable for example code only.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // Batched contraction with bias add:
+    //   E[g, m0, m1, n0, n1, n2] = A[g, m0, m1, k0] * B[g, n0, n1, n2, k0] + D
+    // where D is broadcast over the M dimensions.
+    (void)argc; // command-line arguments are unused in this example
+    (void)argv;
+
+    ck::index_t G0 = 1;
+
+    ck::index_t M0 = 64;
+    ck::index_t M1 = 256;
+
+    ck::index_t N0 = 3;
+    ck::index_t N1 = 12;
+    ck::index_t N2 = 64;
+
+    ck::index_t K0 = 768;
+
+    // A[G0, M0, M1, K0], packed row-major
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, M0, M1, K0};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{M0 * M1 * K0, M1 * K0, K0, 1};
+    // B[G0, N0, N1, N2, K0], packed row-major
+    std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, N0, N1, N2, K0};
+    std::vector<ck::index_t> b_gs_ns_ks_strides{
+        N0 * N1 * N2 * K0, N1 * N2 * K0, N2 * K0, K0, 1};
+
+    // D[G0, M0, M1, N0, N1, N2]: bias, broadcast along M0/M1 (zero strides)
+    std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
+    std::vector<ck::index_t> d_gs_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
+    // E logically [G0, M0, M1, N0, N1, N2], stored in memory as [N0, M0, N1, N2, M1]
+    std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
+    std::vector<ck::index_t> e_gs_ms_ns_strides{
+        M0 * M1 * N0 * N1 * N2, N1 * N2 * M1, 1, M0 * N1 * N2 * M1, M1 * N2, M1};
+
+    // Smallest element count covering every addressable offset of a strided
+    // tensor: 1 + sum((len_i - 1) * stride_i).
+    auto f_tensor_space_size = [](auto lengths, auto strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += (lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_gs_ms_ks_lengths, a_gs_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_gs_ns_ks_lengths, b_gs_ns_ks_strides));
+    SimpleDeviceMem d_device_buf(sizeof(DDataType) *
+                                 f_tensor_space_size(d_gs_ms_ns_lengths, d_gs_ms_ns_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_gs_ms_ns_lengths, e_gs_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD<
+        NumDimG,
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        DsDataType,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Add>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                        b_device_buf.GetDeviceBuffer(),
+                                        std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                        e_device_buf.GetDeviceBuffer(),
+                                        a_gs_ms_ks_lengths,
+                                        a_gs_ms_ks_strides,
+                                        b_gs_ns_ks_lengths,
+                                        b_gs_ns_ks_strides,
+                                        std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+                                        std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+                                        e_gs_ms_ns_lengths,
+                                        e_gs_ms_ns_strides,
+                                        a_element_op,
+                                        b_element_op,
+                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // time_kernel = true: Run returns averaged kernel time in ms
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            // Flatten the M/N/K dims (skipping the leading G dims) for FLOP/byte counts.
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop      = std::size_t(2) * M * N * K;
+            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                    sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/client_example/04_contraction/contraction_scale_fp32.cpp b/client_example/04_contraction/contraction_scale_fp32.cpp
new file mode 100644
index 0000000000..5c06d31488
--- /dev/null
+++ b/client_example/04_contraction/contraction_scale_fp32.cpp
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale = ck::tensor_operation::element_wise::Scale;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Scale;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DsDataType = ck::Tuple<>;
+using EDataType = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// RAII owner of a raw HIP device allocation; freed automatically on scope exit.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Non-copyable: copying would cause a double hipFree of the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Allocate mem_size bytes on the device. Allocation failure is ignored
+    // (p_mem_ stays null) -- acceptable for example code only.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // FP32 contraction with scale:
+    //   E[m0, m1, n0, n1] = scale * sum_{k0, k1} A[m0, m1, k0, k1] * B[n0, n1, k0, k1]
+    // Default problem below uses packed row-major tensors; all sizes/strides
+    // can be overridden from the command line (see usage message).
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    float scale = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 20)
+    {
+        const ck::index_t M0 = std::stoi(argv[1]);
+        const ck::index_t M1 = std::stoi(argv[2]);
+
+        const ck::index_t N0 = std::stoi(argv[3]);
+        const ck::index_t N1 = std::stoi(argv[4]);
+
+        const ck::index_t K0 = std::stoi(argv[5]);
+        const ck::index_t K1 = std::stoi(argv[6]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+        scale = std::stof(argv[19]);
+    }
+    else
+    {
+        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg19: scale\n");
+        exit(0);
+    }
+
+    // Smallest element count covering every addressable offset of a strided
+    // tensor: 1 + sum((len_i - 1) * stride_i).
+    auto f_tensor_space_size = [](auto lengths, auto strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += (lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Scale>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{scale};
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        // No D tensors for the scale op: pass empty arrays for Ds pointers/descriptors.
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        std::array<const void*, 0>{},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        a_ms_ks_lengths,
+                                                        a_ms_ks_strides,
+                                                        b_ns_ks_lengths,
+                                                        b_ns_ks_strides,
+                                                        std::array<std::vector<ck::index_t>, 0>{},
+                                                        std::array<std::vector<ck::index_t>, 0>{},
+                                                        e_ms_ns_lengths,
+                                                        e_ms_ns_strides,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // time_kernel = true: Run returns averaged kernel time in ms
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/client_example/04_contraction/contraction_scale_fp64.cpp b/client_example/04_contraction/contraction_scale_fp64.cpp
new file mode 100644
index 0000000000..14fb8741e7
--- /dev/null
+++ b/client_example/04_contraction/contraction_scale_fp64.cpp
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F64 = double;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale = ck::tensor_operation::element_wise::Scale;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Scale;
+
+using ADataType = F64;
+using BDataType = F64;
+using AccDataType = F64;
+using CShuffleDataType = F64;
+using DsDataType = ck::Tuple<>;
+using EDataType = F64;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// RAII owner of a raw HIP device allocation; freed automatically on scope exit.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Non-copyable: copying would cause a double hipFree of the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Allocate mem_size bytes on the device. Allocation failure is ignored
+    // (p_mem_ stays null) -- acceptable for example code only.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // FP64 contraction with scale:
+    //   E[m0, m1, n0, n1] = scale * sum_{k0, k1} A[m0, m1, k0, k1] * B[n0, n1, k0, k1]
+    // The #if branches below select one of four default layout combinations
+    // (kkn / knn / mkn / mnn) for quick experimentation. The d_ms_ns_* vectors
+    // are kept for reference only; the scale op takes no D tensors.
+// kkn
+#if 1
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// knn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// mkn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+// mnn
+#elif 0
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+#endif
+
+    float scale = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 20)
+    {
+        const ck::index_t M0 = std::stoi(argv[1]);
+        const ck::index_t M1 = std::stoi(argv[2]);
+
+        const ck::index_t N0 = std::stoi(argv[3]);
+        const ck::index_t N1 = std::stoi(argv[4]);
+
+        const ck::index_t K0 = std::stoi(argv[5]);
+        const ck::index_t K1 = std::stoi(argv[6]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+        scale = std::stof(argv[19]);
+    }
+    else
+    {
+        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg19: scale\n");
+        exit(0);
+    }
+
+    // Smallest element count covering every addressable offset of a strided
+    // tensor: 1 + sum((len_i - 1) * stride_i).
+    auto f_tensor_space_size = [](auto lengths, auto strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += (lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Scale>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{scale};
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        // No D tensors for the scale op: pass empty arrays for Ds pointers/descriptors.
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        std::array<const void*, 0>{},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        a_ms_ks_lengths,
+                                                        a_ms_ks_strides,
+                                                        b_ns_ks_lengths,
+                                                        b_ns_ks_strides,
+                                                        std::array<std::vector<ck::index_t>, 0>{},
+                                                        std::array<std::vector<ck::index_t>, 0>{},
+                                                        e_ms_ns_lengths,
+                                                        e_ms_ns_strides,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // time_kernel = true: Run returns averaged kernel time in ms
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt
new file mode 100644
index 0000000000..b7b3c830ed
--- /dev/null
+++ b/client_example/05_layernorm/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Layernorm client examples: each target builds one example executable and
+# links the prebuilt composable_kernel device-operation instance library.
+add_executable(client_layernorm2d_bwd_data layernorm2d_bwd_data.cpp)
+target_link_libraries(client_layernorm2d_bwd_data PRIVATE composable_kernel::device_other_operations)
+
+add_executable(client_layernorm2d_bwd_gamma_beta layernorm2d_bwd_gamma_beta.cpp)
+target_link_libraries(client_layernorm2d_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations)
+
+add_executable(client_layernorm2d_fwd layernorm2d_fwd.cpp)
+target_link_libraries(client_layernorm2d_fwd PRIVATE composable_kernel::device_other_operations)
+
+add_executable(client_layernorm4d_fwd layernorm4d_fwd.cpp)
+target_link_libraries(client_layernorm4d_fwd PRIVATE composable_kernel::device_other_operations)
diff --git a/client_example/05_layernorm/layernorm2d_bwd_data.cpp b/client_example/05_layernorm/layernorm2d_bwd_data.cpp
new file mode 100644
index 0000000000..ec02cb2c4e
--- /dev/null
+++ b/client_example/05_layernorm/layernorm2d_bwd_data.cpp
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_data.hpp"
+
+using DYDataType = float;
+using XDataType = float;
+using GammaDataType = float;
+using MeanInvStdDataType = float;
+using DXDataType = float;
+
+constexpr int Rank = 2;
+constexpr int NumReduceDim = 1;
+
+// RAII owner of a raw HIP device allocation; freed automatically on scope exit.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Non-copyable: copying would cause a double hipFree of the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Allocate mem_size bytes on the device. Allocation failure is ignored
+    // (p_mem_ stays null) -- acceptable for example code only.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // Layernorm 2D backward (data gradient): given dy, x, gamma and the saved
+    // mean / inverse standard deviation from the forward pass, compute dx for
+    // an [M, N] problem reduced over the innermost (N) dimension.
+    (void)argc; // command-line arguments are unused in this example
+    (void)argv;
+
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+
+    SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
+    SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
+    SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * N);
+    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
+    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
+    SimpleDeviceMem dx_dev(sizeof(DXDataType) * M * N);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
+                                                                              XDataType,
+                                                                              GammaDataType,
+                                                                              MeanInvStdDataType,
+                                                                              DXDataType,
+                                                                              Rank,
+                                                                              NumReduceDim>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found     = false;
+    int best_op_id = -1;
+    // Best == lowest average time; start at +inf so the first supported
+    // instance always wins.
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        // gamma is broadcast along M ({0, 1}); mean/inv-std are per-row ({1, 0}).
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
+                                                        {N, 1}, // dyStrides
+                                                        {N, 1}, // xStrides
+                                                        {0, 1}, // gammaStrides
+                                                        {1, 0}, // meanStrides
+                                                        {1, 0}, // invStdStrides
+                                                        {N, 1}, // dxStrides
+                                                        {1},    // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        gamma_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dx_dev.GetDeviceBuffer());
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // Some instances need scratch memory; allocate what they request.
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_byte = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
+                                   sizeof(GammaDataType) * N + sizeof(MeanInvStdDataType) * M * 2 +
+                                   sizeof(DXDataType) * M * N;
+
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+
+    // run the best instance
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
+                                                        {N, 1}, // dyStrides
+                                                        {N, 1}, // xStrides
+                                                        {0, 1}, // gammaStrides
+                                                        {1, 0}, // meanStrides
+                                                        {1, 0}, // invStdStrides
+                                                        {N, 1}, // dxStrides
+                                                        {1},    // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        gamma_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dx_dev.GetDeviceBuffer());
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp b/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
new file mode 100644
index 0000000000..1d1ebefd5b
--- /dev/null
+++ b/client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
+
+using DYDataType = float;
+using XDataType = float;
+using GammaDataType = float;
+using MeanInvStdDataType = float;
+using DGammaDataType = float;
+using DBetaDataType = float;
+
+constexpr int Rank = 2;
+constexpr int NumReduceDim = 1;
+
+// RAII owner of a raw HIP device allocation; freed automatically on scope exit.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Non-copyable: copying would cause a double hipFree of the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+
+    // Allocate mem_size bytes on the device. Allocation failure is ignored
+    // (p_mem_ stays null) -- acceptable for example code only.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // Layernorm 2D backward (parameter gradients): given dy, x and the saved
+    // mean / inverse standard deviation, reduce over the M rows to produce
+    // dgamma[N] and dbeta[N].
+    (void)argc; // command-line arguments are unused in this example
+    (void)argv;
+
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+
+    SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
+    SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
+    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
+    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
+    SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * N);
+    SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * N);
+
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
+                                                                      XDataType,
+                                                                      MeanInvStdDataType,
+                                                                      DGammaDataType,
+                                                                      DBetaDataType,
+                                                                      Rank,
+                                                                      NumReduceDim>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found     = false;
+    int best_op_id = -1;
+    // Best == lowest average time; start at +inf so the first supported
+    // instance always wins.
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    // Total traffic is instance-independent; compute it once outside the loop.
+    std::size_t num_bytes = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
+                            sizeof(MeanInvStdDataType) * M * 2 + sizeof(DGammaDataType) * N +
+                            sizeof(DBetaDataType) * N;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        // mean/inv-std are per-row ({1, 0}); reduction runs over dim 0 (M).
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
+                                                        {N, 1}, // dyStrides
+                                                        {N, 1}, // xStrides
+                                                        {1, 0}, // meanStrides
+                                                        {1, 0}, // invStdStrides
+                                                        {N},    // outLengths
+                                                        {1},    // dgammaStrides
+                                                        {1},    // dbetaStrides
+                                                        {0},    // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dgamma_dev.GetDeviceBuffer(),
+                                                        dbeta_dev.GetDeviceBuffer());
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // Some instances need scratch memory; allocate what they request.
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+
+    // run the best instance
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // inLengths
+                                                        {N, 1}, // dyStrides
+                                                        {N, 1}, // xStrides
+                                                        {1, 0}, // meanStrides
+                                                        {1, 0}, // invStdStrides
+                                                        {N},    // outLengths
+                                                        {1},    // dgammaStrides
+                                                        {1},    // dbetaStrides
+                                                        {0},    // reduceDims
+                                                        dy_dev.GetDeviceBuffer(),
+                                                        x_dev.GetDeviceBuffer(),
+                                                        mean_dev.GetDeviceBuffer(),
+                                                        inv_std_dev.GetDeviceBuffer(),
+                                                        dgamma_dev.GetDeviceBuffer(),
+                                                        dbeta_dev.GetDeviceBuffer());
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/client_example/05_layernorm/layernorm2d_fwd.cpp b/client_example/05_layernorm/layernorm2d_fwd.cpp
new file mode 100644
index 0000000000..22599f43ca
--- /dev/null
+++ b/client_example/05_layernorm/layernorm2d_fwd.cpp
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/normalization_fwd.hpp"
+
+using XDataType = ck::half_t;
+using GammaDataType = ck::half_t;
+using BetaDataType = ck::half_t;
+using YDataType = ck::half_t;
+using SaveMeanInvStdDataType = ck::half_t;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+#define SAVE_MEAN_INV_STD
+
+constexpr int Rank = 2;
+constexpr int NumReduceDim = 1;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ ck::index_t M = 1024;
+ ck::index_t N = 1024;
+ ck::index_t Stride = 1024;
+
+ auto xy_size = (M - 1) * Stride + N;
+
+ SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
+ SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+ SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+ SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+#ifdef SAVE_MEAN_INV_STD
+ SimpleDeviceMem save_mean_device_buf(sizeof(SaveMeanInvStdDataType) * M);
+ SimpleDeviceMem save_inv_std_device_buf(sizeof(SaveMeanInvStdDataType) * M);
+#endif
+
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationFwd<XDataType, GammaDataType, BetaDataType, YDataType, SaveMeanInvStdDataType, PassThrough, Rank, NumReduceDim>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+    float best_ave_time = std::numeric_limits<float>::max();