From f60cd9d7a6911f30b412a6405f0041221bc64ea9 Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Tue, 5 Dec 2023 11:05:55 -0700 Subject: [PATCH 01/18] Standardize documentation for ReadtheDocs (#1057) Relates to https://github.com/RadeonOpenCompute/rocm-docs-core/issues/330 --- .github/dependabot.yml | 6 ++++++ .gitignore | 1 - .readthedocs.yaml | 10 +++++----- docs/conf.py | 27 +++++++++++++++++++-------- docs/doxygen/Doxyfile | 2 +- docs/sphinx/_toc.yml.in | 6 +++--- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 6 ++---- 8 files changed, 37 insertions(+), 23 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 276690bd4f..0e0a252eb6 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,3 +10,9 @@ updates: open-pull-requests-limit: 10 schedule: interval: "daily" + labels: + - "documentation" + - "dependencies" + - "ci:docs-only" + reviewers: + - "samjwu" diff --git a/.gitignore b/.gitignore index 7af066c82d..340f11cbd2 100644 --- a/.gitignore +++ b/.gitignore @@ -54,5 +54,4 @@ _images/ _static/ _templates/ _toc.yml -docBin/ _doxygen/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 5f50df2525..9e6678abe5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -3,11 +3,6 @@ version: 2 -build: - os: ubuntu-22.04 - tools: - python: "3.8" - sphinx: configuration: docs/conf.py @@ -16,3 +11,8 @@ formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt + +build: + os: ubuntu-22.04 + tools: + python: "3.8" diff --git a/docs/conf.py b/docs/conf.py index 0de590da1a..e441ff1ced 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,23 +4,34 @@ # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html -import subprocess +import re from rocm_docs import ROCmDocs +html_theme_options = {"flavor": "list"} -name = "Composable Kernel" -get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt' -version = subprocess.getoutput(get_version) -if len(version) > 0: - name = f"{name} {version}" +with open('../CMakeLists.txt', encoding='utf-8') as f: + match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] +left_nav_title = f"Composable Kernel {version_number} Documentation" + +# for PDF output on Read the Docs +project = "Composable Kernel Documentation" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number external_toc_path = "./sphinx/_toc.yml" -docs_core = ROCmDocs(f"{name} Documentation") -docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml") +docs_core = ROCmDocs(left_nav_title) +docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() +external_projects_current_project = "composable_kernel" + mathjax3_config = { 'tex': { 'macros': { diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 1084f94c81..2594422095 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = docBin +OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 83dd1e7b1a..c37ba29cec 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -5,6 +5,6 @@ defaults: maxdepth: 6 root: index subtrees: - - caption: About - entries: - - file: license +- caption: About + entries: + - file: license diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index c4ce8be79a..f5ee431e7d 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core>=0.20.0 +rocm-docs-core==0.29.0 sphinxcontrib-bibtex==2.6.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 5852315958..0442ae9a2b 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -96,9 +96,7 @@ pygments==2.14.0 # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 - # via - # pygithub - # pyjwt + # via pygithub pynacl==1.5.0 # via pygithub pytz==2023.3.post1 @@ -113,7 +111,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core==0.27.0 +rocm-docs-core==0.29.0 # via -r requirements.in six==1.16.0 # via From 836b7e557d028cc2d7c6b341352253fd81003e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 6 Dec 2023 11:58:59 +0100 Subject: [PATCH 02/18] Introduce wrapper library (#1071) * Introduce wrapper library * Update cmake files * Revert "Update cmake files" This reverts commit c27f88b56590c11a88e26d5d0df7aca51a08133d. * Fix comments --- CHANGELOG.md | 1 + .../25_tensor_transforms/CMakeLists.txt | 4 + .../tensor_transform.cpp | 0 .../tensor_transform_using_wrapper.cpp | 31 +- docs/doxygen/Doxyfile | 4 +- docs/index.rst | 2 + docs/wrapper.rst | 54 ++ example/64_tensor_transforms/CMakeLists.txt | 2 - include/ck/utility/tuple_helper.hpp | 12 + .../ck/wrapper/layout.hpp | 181 ++----- include/ck/wrapper/layout_utils.hpp | 321 ++++++++++++ test/CMakeLists.txt | 1 + test/wrapper/CMakeLists.txt | 2 + test/wrapper/test_layout.cpp | 481 ++++++++++++++++++ 14 files changed, 945 insertions(+), 151 deletions(-) create mode 100644 client_example/25_tensor_transforms/CMakeLists.txt rename {example/64_tensor_transforms => client_example/25_tensor_transforms}/tensor_transform.cpp (100%) rename {example/64_tensor_transforms => client_example/25_tensor_transforms}/tensor_transform_using_wrapper.cpp (74%) create mode 100644 docs/wrapper.rst delete mode 100644 example/64_tensor_transforms/CMakeLists.txt rename example/64_tensor_transforms/tensor_transform_wrapper.hpp => include/ck/wrapper/layout.hpp (68%) create mode 100644 include/ck/wrapper/layout_utils.hpp create mode 100644 test/wrapper/CMakeLists.txt create mode 100644 test/wrapper/test_layout.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e46a4ab4b..3da22fc790 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ None - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for Batched Gemm DL (#732) +- Introduce wrapper sublibrary (limited functionality) (#1071) ### Changes - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) diff --git a/client_example/25_tensor_transforms/CMakeLists.txt b/client_example/25_tensor_transforms/CMakeLists.txt new file mode 100644 index 0000000000..d1543fb0ef --- /dev/null +++ b/client_example/25_tensor_transforms/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(client_tensor_transform tensor_transform.cpp) +target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations) +add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp) +target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations) diff --git a/example/64_tensor_transforms/tensor_transform.cpp b/client_example/25_tensor_transforms/tensor_transform.cpp similarity index 100% rename from example/64_tensor_transforms/tensor_transform.cpp rename to client_example/25_tensor_transforms/tensor_transform.cpp diff --git a/example/64_tensor_transforms/tensor_transform_using_wrapper.cpp b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp similarity index 74% rename from example/64_tensor_transforms/tensor_transform_using_wrapper.cpp rename to client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp index df2449e99d..de9fcde0b4 100644 --- a/example/64_tensor_transforms/tensor_transform_using_wrapper.cpp +++ b/client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp @@ -9,7 +9,7 @@ #include "ck/utility/tuple.hpp" #include "ck/utility/sequence.hpp" -#include "tensor_transform_wrapper.hpp" +#include "ck/wrapper/layout.hpp" using DataType = int; @@ -17,7 +17,7 @@ template void Print1d(const Layout& layout) { std::cout << "Print1d" << std::endl; - for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++) { std::cout << layout(ck::make_tuple(w)) << " "; } @@ -28,9 +28,9 @@ template void Print2d(const Layout& layout) { std::cout << "Print2d" << std::endl; - for(ck::index_t h = 0; h < ck::tensor_transform_wrapper::size<0>(layout); h++) + for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++) { - for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) { std::cout << layout(ck::make_tuple(h, w)) << " "; } @@ -43,15 +43,11 @@ template void Print3dCustom(const Layout& layout) { std::cout << "Print3dCustom" << std::endl; - for(ck::index_t d = 0; - d < ck::tensor_transform_wrapper::size<0>(ck::tensor_transform_wrapper::get<0>(layout)); - d++) + for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++) { - for(ck::index_t h = 0; - h < ck::tensor_transform_wrapper::size<1>(ck::tensor_transform_wrapper::get<0>(layout)); - h++) + for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++) { - for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) { std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " "; } @@ -68,7 +64,7 @@ int main() // Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor) // (dims:4,8 strides:1,4) const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}); - const auto layout_4x8_s1x4 = ck::tensor_transform_wrapper::make_layout(shape_4x8); + const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8); std::cout << "dims:4,8 strides:1,4" << std::endl; Print2d(layout_4x8_s1x4); using Cord1x1Type = ck::Tuple, ck::Number<1>>; @@ -77,10 +73,9 @@ int main() // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor) // dims:4,(2,4) strides:2,(1,8) - const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); - const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); - const auto layout_4x2x4_s2x1x8 = - ck::tensor_transform_wrapper::make_layout(shape_4x2x4, strides_s2x1x8); + const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); + const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); + const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8); std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl; Print2d(layout_4x2x4_s2x1x8); @@ -92,7 +87,7 @@ int main() const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::make_tuple(ck::Number<2>{}, ck::Number<8>{})); static const auto layout_2x2x2x4_s1x4x2x8 = - ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8); + ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8); std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl; Print2d(layout_2x2x2x4_s1x4x2x8); @@ -108,7 +103,7 @@ int main() ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}), ck::Number<8>{}); static const auto layout_2x2x2x4_s1x4x2x8_nested = - ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested); + ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested); std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl; Print1d(layout_2x2x2x4_s1x4x2x8_nested); diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 2594422095..fac9e138e1 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -778,7 +778,9 @@ WARN_LOGFILE = INPUT = ../../include/ck/tensor_operation/gpu/grid \ ../../include/ck/tensor_operation/gpu/block \ ../../include/ck/tensor_operation/gpu/thread \ - ../../library/include/ck/library/utility + ../../library/include/ck/library/utility \ + ../../include/ck/wrapper + # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/docs/index.rst b/docs/index.rst index 51c0c862ae..8c4aaa2b3d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,6 +34,7 @@ Current CK library are structured into 4 layers: * "Templated Tile Operators" layer * "Templated Kernel and Invoker" layer * "Instantiated Kernel and Invoker" layer +* "Wrapper for tensor transform operations" * "Client API" layer .. image:: data/ck_layer.png @@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order: tutorial_hello_world dockerhub + wrapper Supported_Primitives_Guide API_Reference_Guide Contributors_Guide diff --git a/docs/wrapper.rst b/docs/wrapper.rst new file mode 100644 index 0000000000..64fb6a4031 --- /dev/null +++ b/docs/wrapper.rst @@ -0,0 +1,54 @@ +=============== +Wrapper +=============== + +------------------------------------- +Description +------------------------------------- + +.. note:: + + The wrapper is under development and its functionality is limited. + + +CK provides a lightweight wrapper for more complex operations implemented in +the library. It allows indexing of nested layouts using a simple interface +(avoiding complex descriptor transformations). + +Example: + +.. code-block:: c + + const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); + const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); + const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8); + + std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl; + for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++) + { + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) + { + std::cout << layout(ck::make_tuple(h, w)) << " "; + } + std::cout << std::endl; + } + +Output:: + + dims:4,(2,4) strides:2,(1,8) + 0 1 8 9 16 17 24 25 + 2 3 10 11 18 19 26 27 + 4 5 12 13 20 21 28 29 + 6 7 14 15 22 23 30 31 + +------------------------------------- +Layout +------------------------------------- + +.. doxygenstruct:: ck::wrapper::Layout + +------------------------------------- +Layout helpers +------------------------------------- + +.. doxygenfile:: layout_utils.hpp diff --git a/example/64_tensor_transforms/CMakeLists.txt b/example/64_tensor_transforms/CMakeLists.txt deleted file mode 100644 index 9d14a410e3..0000000000 --- a/example/64_tensor_transforms/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_example_executable(example_tensor_transform tensor_transform.cpp) -add_example_executable(example_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp) diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index d7b492fe66..75f2693f20 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple&) return (is_detected::value || ...); } +template +__host__ __device__ constexpr auto TupleDepth(const T&) +{ + return depth; +} + +template +__host__ __device__ constexpr auto TupleDepth(const Tuple&) +{ + return math::max(TupleDepth(Ts{})...); +} + } // namespace ck diff --git a/example/64_tensor_transforms/tensor_transform_wrapper.hpp b/include/ck/wrapper/layout.hpp similarity index 68% rename from example/64_tensor_transforms/tensor_transform_wrapper.hpp rename to include/ck/wrapper/layout.hpp index 71cd6091f8..b337d88a1a 100644 --- a/example/64_tensor_transforms/tensor_transform_wrapper.hpp +++ b/include/ck/wrapper/layout.hpp @@ -3,27 +3,13 @@ #pragma once -#include "ck/ck.hpp" - -#include "ck/utility/number.hpp" -#include "ck/utility/tuple.hpp" -#include "ck/utility/tuple_helper.hpp" -#include "ck/utility/sequence.hpp" -#include "ck/utility/sequence_helper.hpp" -#include "ck/utility/is_detected.hpp" - -#include "ck/tensor_description/tensor_descriptor.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/wrapper/layout_utils.hpp" namespace ck { -namespace tensor_transform_wrapper { +namespace wrapper { /** - * \brief Layout wrapper - * - * \details - * Layout wrapper that performs the tensor descriptor logic. + * \brief Layout wrapper that performs the tensor descriptor logic. * * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * (dynamic layout). It is possible to pass nested shapes @@ -32,21 +18,19 @@ namespace tensor_transform_wrapper { * (dynamic layout). Stride tuple should be nested if shape tuple is * nested. */ -template > +template struct Layout { private: static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; - template - using is_tuple = decltype(std::declval().IsTuple()); - // Generate packed (column-major) strides if not passed template __host__ __device__ constexpr static auto - GenerateColumnMajorPackedStrides(const Tuple& tuple) + GenerateColumnMajorPackedStrides(const Tuple& shape) { + const auto unrolled_shape = UnrollNestedTuple(shape); return generate_tuple( [&](auto i) { if constexpr(i.value == 0) @@ -56,10 +40,10 @@ struct Layout else { return TupleReduce([](auto x, auto y) { return x * y; }, - tuple); + unrolled_shape); } }, - Number::Size()>{}); + Number{}); } // Generate LowerDims in Compile-time for MergeTrasform using passed Type @@ -112,8 +96,8 @@ struct Layout // Example shape: (2, (2, 2)), 2, (2, 2) // Unrolled shape: 2, (2, 2), 2, (2, 2) template - __host__ __device__ constexpr static auto UnrollShapeViaIdx(const Tuple& shape, - const Tuple& idx) + __host__ __device__ constexpr static auto AlignShapeToIdx(const Tuple& shape, + const Tuple& idx) { if constexpr(!IsNestedTuple(Tuple{})) { @@ -125,7 +109,7 @@ struct Layout // Iterate over shape tuple elements: // 1. If corresponding idx element is tuple then return (will be unrolled) // 2. If no, pack in tuple. It will be restored during unroll. - auto unrolled_shape_via_idx = generate_tuple( + auto aligned_shape = generate_tuple( [&](auto i) { if constexpr(is_detected>>::value) @@ -140,8 +124,8 @@ struct Layout Number::Size()>{}); // Unroll and process next step - return UnrollShapeViaIdx(UnrollNestedTuple<0, 1>(unrolled_shape_via_idx), - UnrollNestedTuple<0, 1>(idx)); + return AlignShapeToIdx(UnrollNestedTuple<0, 1>(aligned_shape), + UnrollNestedTuple<0, 1>(idx)); } } @@ -150,27 +134,24 @@ struct Layout DescriptorToMerge& desc) { // Reverse each element in tuple - using ReversedUnrolledShape = decltype(TupleReverse(UnrollNestedTuple(shape))); - const auto merge_elems = ReversedUnrolledShape{}; - + const auto merge_elems = TupleReverse(UnrollNestedTuple(shape)); // Generate reverted indexes (column major traverse) - using MergeElemsSequence = - typename arithmetic_sequence_gen<0, ReversedUnrolledShape::Size(), 1>::type; - const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); - const auto upper_dims = make_tuple(Sequence<0>{}); + using MergeElemsSequence = typename arithmetic_sequence_gen<0, merge_elems.Size(), 1>::type; + const auto lower_dims = make_tuple(MergeElemsSequence::Reverse()); + const auto upper_dims = make_tuple(Sequence<0>{}); // Merge to 1d return transform_tensor_descriptor( desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); } - // Merge nested shape dims + // Merge nested shape dims. Merge nested shape dims when idx is also nested. // Input desc shape: 2, 2, 2, 2, 2, 2 // Example idx: 1, 1, 1, 1 // Example shape: 2, (2, 2), 2, (2, 2) // Merged shape: 2, 4, 2, 4 template - __host__ __device__ constexpr static auto - MakeMerges(const Tuple& shape, const Tuple&, DescriptorToMerge& desc) + __host__ __device__ constexpr static auto CreateMergedDescriptor( + const Tuple& shape, const Tuple&, DescriptorToMerge& desc) { const auto transforms = generate_tuple( [&](auto i) { @@ -224,9 +205,9 @@ struct Layout static_assert(Tuple::Size() == Tuple::Size(), "Idx rank and Shape rank must be the same (except 1d)."); // Unroll while IdxDims is nested - const auto unrolled_shape_via_idx = UnrollShapeViaIdx(shape, idx); + const auto aligned_shape = AlignShapeToIdx(shape, idx); // Transform correct form of shape - return MakeMerges(unrolled_shape_via_idx, UnrollNestedTuple(idx), descriptor_); + return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), descriptor_); } } @@ -234,26 +215,21 @@ struct Layout __host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape, const LayoutStrides& strides) { - const auto unrolled_shape = UnrollNestedTuple(shape); - - if constexpr(ck::is_same_v>) - { - // If shape is packed - const auto column_major_packed_strides = - GenerateColumnMajorPackedStrides(unrolled_shape); - return make_naive_tensor_descriptor(unrolled_shape, column_major_packed_strides); - } - else - { - const auto unrolled_strides = UnrollNestedTuple(strides); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); - } + const auto unrolled_shape = UnrollNestedTuple(shape); + const auto unrolled_strides = UnrollNestedTuple(strides); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); } public: - using NaiveDescriptorType = remove_cvref_t; + // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`. + using DeducedStrides = + std::conditional_t>, + remove_cvref_t, + Strides>; + using NaiveDescriptorType = + remove_cvref_t; /** * \brief Layout constructor. @@ -268,9 +244,9 @@ struct Layout // Construct if runtime mode if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) { - // Keep only shape, strides are not need for transforms shape_ = shape; - descriptor_ = MakeNaiveDescriptor(shape, strides); + strides_ = strides; + descriptor_ = MakeNaiveDescriptor(shape_, strides_); } } @@ -279,7 +255,8 @@ struct Layout if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) { shape_ = shape; - descriptor_ = MakeNaiveDescriptor(shape, Strides{}); + strides_ = GenerateColumnMajorPackedStrides(shape_); + descriptor_ = MakeNaiveDescriptor(shape_, strides_); } } @@ -338,7 +315,7 @@ struct Layout * * \return Calculated size. */ - __host__ __device__ constexpr index_t GetLength() const + __host__ __device__ constexpr index_t GetLengths() const { const auto unrolled_shape = UnrollNestedTuple(shape_); return TupleReduce([](auto x, auto y) { return x * y; }, @@ -346,80 +323,24 @@ struct Layout } /** - * \brief Dimension getter. + * \brief Shape getter. * - * \tparam IDim Dimension idx. - * \return Calculated size. + * \return Shape. */ - template - __host__ __device__ constexpr auto Get() const - { - const auto elem = shape_.At(Number{}); - return elem; - } + __host__ __device__ constexpr Shape GetShape() const { return shape_; } + + /** + * \brief Strides getter. + * + * \return Strides. + */ + __host__ __device__ constexpr DeducedStrides GetStrides() const { return strides_; } private: NaiveDescriptorType descriptor_; Shape shape_; + DeducedStrides strides_; }; -// Layout helpers -// Length getter (product if tuple) -template -__host__ __device__ constexpr index_t size(const Layout& layout) -{ - return layout.template GetLength(); -} - -// Get shape size (product of dims if tuple) -template -__host__ __device__ constexpr index_t size(const Tuple& shape) -{ - using UnrolledShape = decltype(UnrollNestedTuple(shape)); - return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; }, - UnrolledShape{}); -} - -// Get dim size (could be returned from get function) -template -__host__ __device__ T constexpr size(const T& dim) -{ - return dim; -} - -// Get layout size (product of shapes) -template -__host__ __device__ constexpr index_t size(const Layout& layout) -{ - return layout.GetLength(); -} - -// Get shape element size -template -__host__ __device__ constexpr index_t size(const Tuple& shape) -{ - return size(shape.At(Number{})); -} - -// Dim getter (tuple if tuple) -template -__host__ __device__ constexpr auto get(const Layout& layout) -{ - return layout.template Get(); -} - -template -__host__ __device__ constexpr Layout make_layout(const Shape& shape, - const Strides& strides) -{ - return Layout(shape, strides); -} - -template -__host__ __device__ constexpr Layout make_layout(const Shape& shape) -{ - return Layout(shape); -} - -} // namespace tensor_transform_wrapper +} // namespace wrapper } // namespace ck diff --git a/include/ck/wrapper/layout_utils.hpp b/include/ck/wrapper/layout_utils.hpp new file mode 100644 index 0000000000..fac8f33854 --- /dev/null +++ b/include/ck/wrapper/layout_utils.hpp @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +#include "ck/utility/number.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/tuple_helper.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/sequence_helper.hpp" +#include "ck/utility/is_detected.hpp" + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" + +namespace ck { +namespace wrapper { + +// Disable from doxygen docs generation +/// @cond +// forward declaration +template > +struct Layout; + +template +using is_tuple = decltype(std::declval().IsTuple()); +/// @endcond + +// make_* +/** + * \brief Make layout function. + * + * \tparam Shape Shape for layout. + * \tparam Strides Strides for layout. + * \return Constructed layout. + */ +template +__host__ __device__ constexpr Layout make_layout(const Shape& shape, + const Strides& strides) +{ + return Layout(shape, strides); +} + +/** + * \brief Make layout function with packed strides + * (column-major). + * + * \tparam Shape Shape for layout. + * \return Constructed layout. + */ +template +__host__ __device__ constexpr Layout make_layout(const Shape& shape) +{ + return Layout(shape); +} + +// Layout helpers +// get +/** + * \brief Get element from tuple (Shape/Strides/Idxs). + * + * \tparam idx Index to lookup. + * \param tuple Tuple to lookup. + * \return Requsted element. + */ +template +__host__ __device__ constexpr auto get(const Tuple& tuple) +{ + return tuple.At(Number{}); +} + +/** + * \brief Get sub layout. + * + * \tparam idx Index to lookup. + * \param layout Layout to create sub layout. + * \return Requsted sub layout. + */ +template +__host__ __device__ constexpr auto get(const Layout& layout) +{ + const auto new_shape = get(layout.GetShape()); + static_assert(is_detected::value, + "Shape of sub layout must be tuple"); + if constexpr(is_same_v>) + { + // If stride not passed, create without strides + return make_layout(new_shape); + } + else + { + const auto new_strides = get(layout.GetStrides()); + static_assert(is_detected::value, + "Strides of sub layout must be tuple"); + return make_layout(new_shape, new_strides); + } +} + +/** + * \brief Hierarchical get. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted element. + */ +template +__host__ __device__ constexpr auto get(const T& elem) +{ + return get(get(elem)); +} + +// size +/** + * \brief Length get (product if tuple). + * + * \tparam idx Index to lookup. + * \param layout Layout to get Shape. + * \return Requsted length. + */ +template +__host__ __device__ constexpr index_t size(const Layout& layout) +{ + return layout.template GetLength(); +} + +/** + * \brief Shape size (product of dims). + * + * \param shape Shape to lookup. + * \return Requsted size. + */ +template +__host__ __device__ constexpr index_t size(const Tuple& shape) +{ + const auto unrolled_shape = UnrollNestedTuple(shape); + return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; }, + unrolled_shape); +} + +// Get dim size (could be returned from get function) +/** + * \private + */ +template +__host__ __device__ T constexpr size(const T& dim) +{ + return dim; +} + +/** + * \brief Layout size (product of dims). + * + * \param layout Layout to calculate shape size. + * \return Requsted size. + */ +template +__host__ __device__ constexpr index_t size(const Layout& layout) +{ + return layout.GetLengths(); +} + +/** + * \brief Length get from tuple (product if tuple). + * + * \tparam idx Index to lookup. + * \param tuple Tuple to lookup. + * \return Requsted length. + */ +template +__host__ __device__ constexpr index_t size(const Tuple& tuple) +{ + return size(tuple.At(Number{})); +} + +/** + * \brief Hierarchical size. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted element. + */ +template +__host__ __device__ constexpr auto size(const T& elem) +{ + return size(get(elem)); +} + +// rank +/** + * \brief Get layout rank (num elements in shape). + * + * \param layout Layout to calculate rank. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout& layout) +{ + return Shape::Size(); +} + +/** + * \brief Get tuple rank (num elements in tuple). + * Return 1 if scalar passed. + * + * \param tuple Tuple to calculate rank. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple& tuple) +{ + return Tuple::Size(); +} + +/** + * \private + */ +template +__host__ __device__ constexpr index_t rank(const Number&) +{ + return 1; +} + +/** + * \private + */ +__host__ __device__ constexpr index_t rank(const index_t&) { return 1; } + +/** + * \brief Hierarchical rank. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr auto rank(const T& elem) +{ + return rank(get(elem)); +} + +// depth +/** + * \brief Get depth of the layout shape (return 0 if scalar). + * + * \param layout Layout to calculate depth. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr auto depth(const Layout& layout) +{ + return TupleDepth(layout.GetShape()); +} + +/** + * \brief Get depth of the tuple. (return 0 if scalar) + * + * \param tuple Tuple to calculate depth. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr auto depth(const Tuple& tuple) +{ + return TupleDepth(tuple); +} + +/** + * \private + */ +template +__host__ __device__ constexpr index_t depth(const Number&) +{ + return 0; +} + +/** + * \private + */ +__host__ __device__ constexpr index_t depth(const index_t&) { return 0; } + +/** + * \brief Hierarchical depth. + * + * \tparam Idxs Indexes to lookup. + * \param elem Element to lookup. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr auto depth(const T& elem) +{ + return depth(get(elem)); +} + +/** + * \brief Get Layout strides. + * + * \param layout Layout to get strides. + * \return Requsted strides. + */ +template +__host__ __device__ constexpr auto stride(const Layout& layout) +{ + return layout.GetStrides(); +} + +/** + * \brief Get Layout shape. + * + * \param layout Layout to get shape. + * \return Requsted shape. + */ +template +__host__ __device__ constexpr auto shape(const Layout& layout) +{ + return layout.GetShape(); +} + +} // namespace wrapper +} // namespace ck diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4aaa5fcfa5..b325a3a7f8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -149,6 +149,7 @@ add_subdirectory(batched_gemm_multi_d) add_subdirectory(grouped_convnd_bwd_data) add_subdirectory(conv_tensor_rearrange) add_subdirectory(transpose) +add_subdirectory(wrapper) if(GPU_TARGETS MATCHES "gfx11") add_subdirectory(wmma_op) endif() diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt new file mode 100644 index 0000000000..e25ef176dd --- /dev/null +++ b/test/wrapper/CMakeLists.txt @@ -0,0 +1,2 @@ +add_gtest_executable(test_layout test_layout.cpp) +target_link_libraries(test_layout PRIVATE utility) diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp new file mode 100644 index 0000000000..7d09696fbb --- /dev/null +++ b/test/wrapper/test_layout.cpp @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" + +#include "ck/wrapper/layout.hpp" + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" + +class TestWrapperLayout : public ::testing::Test +{ + protected: + static constexpr auto I0 = ck::Number<0>{}; + static constexpr auto I1 = ck::Number<1>{}; + + template + void Run(Desc& desc, + Desc1d& desc_1d, + LayoutRuntime& layout_runtime, + LayoutCompiletime& layout_compiletime, + const std::vector& idxs) + { + // 1d check + EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime)); + // Check layout compiletime and runtime result consistency + EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime)); + + for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++) + { + const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i)); + const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i)); + const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i)); + EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d); + EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d); + } + // size(layout)-d check, don't check if access is hierarchical + if constexpr(!IsNestedTuple(Idxs{})) + { + ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) { + EXPECT_EQ(desc.GetLength(ck::Number{}), ck::wrapper::size(layout_runtime)); + EXPECT_EQ(ck::wrapper::size(layout_runtime), + ck::wrapper::size(layout_compiletime)); + }); + } + for(const auto idx : idxs) + { + const ck::index_t layout_runtime_offset = layout_runtime(idx); + const ck::index_t layout_compiletime_offset = layout_compiletime(idx); + const ck::index_t desc_offset = + desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested + EXPECT_EQ(layout_runtime_offset, desc_offset); + EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset); + } + } +}; + +TEST_F(TestWrapperLayout, 2d) +{ + // dims:(4, 3) strides:(1, 4) + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s1 = 1; + constexpr ck::index_t s0 = 4; + const auto desc = + ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))), + ck::make_tuple(ck::Sequence<1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0)); + const auto layout_compiletime = + ck::wrapper::make_layout(ck::make_tuple(ck::Number{}, ck::Number{})); + std::vector> idxs; + + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs.emplace_back(h, w); + } + } + + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs); +} + +TEST_F(TestWrapperLayout, 3d_nested) +{ + // dims:((2, 3), 4, 3) strides:((2, 4), 12, 48) + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s3 = 2; + constexpr ck::index_t s2 = 4; + constexpr ck::index_t s1 = 12; + constexpr ck::index_t s0 = 48; + const auto desc = ck::make_naive_tensor_descriptor( + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))), + ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto desc_3d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)), + ck::make_pass_through_transform(d1), + ck::make_pass_through_transform(d2)), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{})); + const auto layout_runtime = + ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0), + ck::make_tuple(ck::make_tuple(s3, s2), s1, s0)); + const auto layout_compiletime = ck::wrapper::make_layout( + ck::make_tuple( + ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}, ck::Number{}), + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::Number{}, + ck::Number{})); + std::vector> idxs_3d; + + for(ck::index_t d = 0; d < d2 * d3; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_3d.emplace_back(d, h, w); + } + } + } + this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d); + + // Check also 4d iteration + std::vector, ck::index_t, ck::index_t>> idxs_4d; + + for(ck::index_t e = 0; e < d3; e++) + { + for(ck::index_t d = 0; d < d2; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_4d.emplace_back(ck::make_tuple(e, d), h, w); + } + } + } + } + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d); +} + +TEST_F(TestWrapperLayout, 2d_nested) +{ + // dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12)) + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s3 = 2; + constexpr ck::index_t s2 = 4; + constexpr ck::index_t s1 = 48; + constexpr ck::index_t s0 = 12; + const auto desc = ck::make_naive_tensor_descriptor( + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}, ck::Number{}, ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))), + ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto desc_2d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)), + ck::make_merge_transform(ck::make_tuple(d0, d1))), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); + const auto layout_runtime = + ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)), + ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0))); + const auto layout_compiletime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})), + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + std::vector> idxs_2d; + + for(ck::index_t h = 0; h < d2 * d3; h++) + { + for(ck::index_t w = 0; w < d0 * d1; w++) + { + idxs_2d.emplace_back(h, w); + } + } + this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d); + // Check also 4d iteration + std::vector, ck::Tuple>> + idxs_4d; + + for(ck::index_t e = 0; e < d3; e++) + { + for(ck::index_t d = 0; d < d2; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w)); + } + } + } + } + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d); +} + +TEST_F(TestWrapperLayout, 3d_double_nested) +{ + // dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s4 = 2; + constexpr ck::index_t s3 = 4; + constexpr ck::index_t s2 = 8; + constexpr ck::index_t s1 = 96; + constexpr ck::index_t s0 = 24; + const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{}), + ck::make_tuple(ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{}, + ck::Number{})); + // Reverse due to column major + const auto desc_1d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))), + ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}), + ck::make_tuple(ck::Sequence<0>{})); + const auto desc_3d = transform_tensor_descriptor( + desc, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)), + ck::make_pass_through_transform(d2), + ck::make_merge_transform(ck::make_tuple(d0, d1))), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{})); + const auto desc_2d = transform_tensor_descriptor( + desc_3d, + ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)), + ck::make_pass_through_transform(d1 * d0)), + ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}), + ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{})); + const auto layout_runtime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)), + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, s3), s2), ck::make_tuple(s1, s0))); + const auto layout_compiletime = ck::wrapper::make_layout( + ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})), + ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + std::vector> idxs_2d; + + for(ck::index_t h = 0; h < d2 * d3 * d4; h++) + { + for(ck::index_t w = 0; w < d0 * d1; w++) + { + idxs_2d.emplace_back(h, w); + } + } + this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d); + // Check also 3d iteration + std::vector, ck::index_t>> idxs_3d; + + for(ck::index_t d = 0; d < d3 * d4; d++) + { + for(ck::index_t h = 0; h < d2; h++) + { + for(ck::index_t w = 0; w < d1 * d0; w++) + { + idxs_3d.emplace_back(ck::make_tuple(d, h), w); + } + } + } + this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d); + // Check also 5d iteration + std::vector, ck::index_t>, + ck::Tuple>> + idxs_5d; + + for(ck::index_t f = 0; f < d4; f++) + { + for(ck::index_t e = 0; e < d3; e++) + { + for(ck::index_t d = 0; d < d2; d++) + { + for(ck::index_t h = 0; h < d1; h++) + { + for(ck::index_t w = 0; w < d0; w++) + { + idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d), + ck::make_tuple(h, w)); + } + } + } + } + } + this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d); +} + +TEST(TestLayoutHelpers, SizeAndGet) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + const auto layout_runtime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0))); + const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + + // Size of layout + EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0); + EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0); + + // Size of dims + EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2); + EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2); + EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0); + EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0); + + // Access through new layout (using get with layout object) + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3); + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2); + + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4); + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))), + d4); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))), + d3); + + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2); + + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1); + EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0); + EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0); +} + +TEST(TestLayoutHelpers, DepthAndRank) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + const auto layout_runtime = ck::wrapper::make_layout( + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0))); + const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + + EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3); + EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3); + EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2); + // Check for integer + EXPECT_EQ(ck::wrapper::depth(d0), 0); + + EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2); + EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2); + EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2); + // Check for integer + EXPECT_EQ(ck::wrapper::rank(d0), 1); +} + +TEST(TestLayoutHelpers, ShapeAndStrides) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + constexpr ck::index_t s4 = 2; + constexpr ck::index_t s3 = 4; + constexpr ck::index_t s2 = 8; + constexpr ck::index_t s1 = 96; + constexpr ck::index_t s0 = 24; + const auto shape_compiletime = ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); + const auto strides_compiletime = ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); + const auto shape_runtime = + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)); + const auto strides_runtime = + ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0)); + const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime); + const auto layout_compiletime = + ck::wrapper::make_layout(shape_compiletime, strides_compiletime); + + constexpr bool check_compiletime_shape = + std::is_same_v::type, + decltype(shape(layout_compiletime))>; + constexpr bool check_compiletime_strides = + std::is_same_v::type, + decltype(stride(layout_compiletime))>; + constexpr bool check_runtime_shape = + std::is_same_v::type, + decltype(shape(layout_runtime))>; + constexpr bool check_runtime_strides = + std::is_same_v::type, + decltype(stride(layout_runtime))>; + EXPECT_TRUE(check_compiletime_shape); + EXPECT_TRUE(check_compiletime_strides); + EXPECT_TRUE(check_runtime_shape); + EXPECT_TRUE(check_runtime_strides); +} + +TEST(TestLayoutHelpers, Hierarchical) +{ + // dims:(((2, 2), 3), (4, 3)) + constexpr ck::index_t d4 = 2; + constexpr ck::index_t d3 = 2; + constexpr ck::index_t d2 = 3; + constexpr ck::index_t d1 = 4; + constexpr ck::index_t d0 = 3; + const auto runtime_shape = + ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)); + const auto layout_runtime = ck::wrapper::make_layout(runtime_shape); + const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple( + ck::make_tuple(ck::make_tuple(ck::Number{}, ck::Number{}), ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{}))); + + EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2); + EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2); + EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2); + + EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1); + EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1); + EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1); + + EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3); + EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3); + EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3); + + EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4); +} From 6896c3b0ae3da9adfa3cd4979621cee642257fc3 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:48:10 -0800 Subject: [PATCH 03/18] Fix the CI builds using clang++ directly. (#1087) * turn on -O3 compiler flag explicitly * change cmake syntax for CI * modify cmake line breaks in jenkinsfile --- Jenkinsfile | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8e67f9cc39..d5fbff288f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -768,8 +768,15 @@ pipeline { } agent{ label rocmnode("gfx908 || gfx90a") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \ + -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ + -DCMAKE_EXE_LINKER_FLAGS=" -L ${env.WORKSPACE}/script -T hip_fatbin_insert " \ + -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') @@ -784,8 +791,12 @@ pipeline { } agent{ label rocmnode("gfx908 || gfx90a") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx908;gfx90a" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') @@ -800,8 +811,12 @@ pipeline { } agent{ label rocmnode("navi21") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx1030" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') @@ -816,8 +831,12 @@ pipeline { } agent{ label rocmnode("navi32") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON """ - execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ + cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ + -DGPU_TARGETS="gfx1101" \ + -DCMAKE_CXX_COMPILER="${build_compiler()}" \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') From 957281ce45025f674c75ee3e318257d9df3a52d7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 10:32:04 -0700 Subject: [PATCH 04/18] Bump rocm-docs-core from 0.29.0 to 0.30.1 in /docs/sphinx (#1090) Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.29.0 to 0.30.1. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.29.0...v0.30.1) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index f5ee431e7d..0a65ffc81a 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==0.29.0 +rocm-docs-core==0.30.1 sphinxcontrib-bibtex==2.6.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 0442ae9a2b..01cb32e714 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -96,7 +96,9 @@ pygments==2.14.0 # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 - # via pygithub + # via + # pygithub + # pyjwt pynacl==1.5.0 # via pygithub pytz==2023.3.post1 @@ -111,7 +113,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core==0.29.0 +rocm-docs-core==0.30.1 # via -r requirements.in six==1.16.0 # via From 33600202c644f64d3596d6340466982895772822 Mon Sep 17 00:00:00 2001 From: zjing14 Date: Thu, 7 Dec 2023 13:39:40 -0600 Subject: [PATCH 05/18] remove imcomplete transpose profiler (#1088) Co-authored-by: Jing Zhang Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- profiler/src/profile_transpose.cpp | 85 ------------------------------ 1 file changed, 85 deletions(-) delete mode 100644 profiler/src/profile_transpose.cpp diff --git a/profiler/src/profile_transpose.cpp b/profiler/src/profile_transpose.cpp deleted file mode 100644 index c239a520d1..0000000000 --- a/profiler/src/profile_transpose.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "profiler/profile_transpose_impl.hpp" -#include "profiler_operation_registry.hpp" - -enum struct MatrixLayout -{ - NCDHW, // 0 - NCHWD, // 1 -}; - -enum struct DataType -{ - F32_F32_F32_F32_F32, // 0 - F16_F16_F16_F16_F16, // 1 -}; - -#define OP_NAME "transpose" -#define OP_DESC "Transpose" - -int profile_transpose(int argc, char* argv[]) -{ - if(argc != 15) - { - printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); - printf("arg2: data type (0: fp32; 1: fp16)\n"); - // printf("arg3: matrix layout (NCDHW -> NDCHW);\n"); - printf("arg4: verification (0: no; 1: yes)\n"); - printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg6: print tensor value (0: no; 1: yes)\n"); - printf("arg7: time kernel (0=no, 1=yes)\n"); - printf("arg8 to 13: N, C, D, H, W\n"); - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - // const auto layout = static_cast(std::stoi(argv[3])); - const bool do_verification = std::stoi(argv[3]); - const int init_method = std::stoi(argv[4]); - const bool do_log = std::stoi(argv[5]); - const bool time_kernel = std::stoi(argv[6]); - std::vector lengths = std::stoi(argv[7]); - - /**const int N = std::stoi(argv[7]); - const int C = std::stoi(argv[8]); - const int D = std::stoi(argv[9]); - const int H = std::stoi(argv[10]); - const int W = std::stoi(argv[11]);**/ - - using F32 = float; - using F16 = ck::half_t; - - auto profile = [&](auto a_type, auto b_type) { - using ADataType = decltype(a_type); - using BDataType = decltype(b_type); - - bool pass = ck::profiler::profile_transpose_impl( - do_verification, init_method, do_log, time_kernel, lengths); - - return pass ? 0 : 1; - }; - - if(data_type == GemmDataType::F32_F32_F32_F32_F32) - { - return profile(F32{}, F32{}); - } - else if(data_type == GemmDataType::F16_F16_F16_F16_F16) - { - return profile(F16{}, F16{}); - } - else - { - std::cout << "this data_type & layout is not implemented" << std::endl; - - return 1; - } -} - -REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_transpose); From d939411dae1aa0e09fecb466cfdc1e3044085720 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 7 Dec 2023 15:59:34 -0800 Subject: [PATCH 06/18] Switch from ROCmSoftwarePlatform to ROCm org (#1091) * switch from ROCmSoftwarePlatform to ROCm org * replace ROCmSoftwarePlatform with ROCm in few more places --- CITATION.cff | 4 ++-- Jenkinsfile | 10 +++++----- README.md | 2 +- dev-requirements.txt | 4 ++-- include/ck/host_utility/device_prop.hpp | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index d35fe9e587..3813d63812 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -59,9 +59,9 @@ authors: family-names: Zhou - given-names: Jianfeng family-names: Yan -repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel' +repository-code: 'https://github.com/ROCm/composable_kernel' abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel progarmming languages, like HIP C++. keywords: - 'CK, Composable Kernel, Tensor Coordinate Transformation' license: MIT -license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE +license-url: https://github.com/ROCm/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE diff --git a/Jenkinsfile b/Jenkinsfile index d5fbff288f..8f661e4780 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -302,7 +302,7 @@ def buildHipClangJob(Map conf=[:]){ def retimage (retimage, image) = getDockerImage(conf) - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { timeout(time: 5, unit: 'HOURS') { @@ -355,7 +355,7 @@ def runCKProfiler(Map conf=[:]){ def variant = env.STAGE_NAME def retimage - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: dockerOpts) { @@ -487,7 +487,7 @@ def Build_CK(Map conf=[:]){ def retimage def navi_node = 0 - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) withDockerContainer(image: image, args: dockerOpts) { @@ -553,7 +553,7 @@ def Build_CK(Map conf=[:]){ sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip rm -rf hipTensor-"${params.hipTensor_branch}" - wget https://github.com/ROCmSoftwarePlatform/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip + wget https://github.com/ROCm/hipTensor/archive/refs/heads/"${params.hipTensor_branch}".zip unzip -o "${params.hipTensor_branch}".zip """ dir("hipTensor-${params.hipTensor_branch}"){ @@ -605,7 +605,7 @@ def process_results(Map conf=[:]){ def variant = env.STAGE_NAME def retimage - gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { try { (retimage, image) = getDockerImage(conf) } diff --git a/README.md b/README.md index e5a20f143f..7679607e69 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa 3. Clone CK source code from the GitHub repository and start the build: ```bash - git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git && \ + git clone https://github.com/ROCm/composable_kernel.git && \ cd composable_kernel && \ mkdir build && \ cd build diff --git a/dev-requirements.txt b/dev-requirements.txt index 9e7b9f01e1..d5d91f8c27 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,3 +1,3 @@ -ROCmSoftwarePlatform/rocm-recipes +ROCm/rocm-recipes RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build -danmar/cppcheck@2.9 \ No newline at end of file +danmar/cppcheck@2.9 diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp index be2c2395fc..e8dabc9973 100644 --- a/include/ck/host_utility/device_prop.hpp +++ b/include/ck/host_utility/device_prop.hpp @@ -26,7 +26,7 @@ inline std::string get_device_name() } const std::string raw_name(props.gcnArchName); - // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 + // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 static std::map device_name_map = { {"Ellesmere", "gfx803"}, {"Baffin", "gfx803"}, From f83698489109205dfe1780ce63c032b2a27e7434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 8 Dec 2023 11:07:42 +0100 Subject: [PATCH 07/18] Support broadcast for bias in grouped conv fwd (#1081) * Support broadcast for bias in grouped conv fwd * Fix comment * Comment fixes * Remove GK layout --- ...rouped_conv_fwd_scaleadd_scaleadd_relu.inc | 16 +- example/62_conv_fwd_activ/CMakeLists.txt | 2 + ...aleadd_scaleadd_relu_bcasted_bias_fp16.cpp | 294 ++++++++++++++++++ .../run_convnd_fwd_activ_example.inc | 2 +- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 32 +- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 3 +- .../gpu/device/tensor_layout.hpp | 6 - .../transform_conv_fwd_to_gemm.hpp | 15 +- .../device_operation_instance_factory.hpp | 6 +- ...olution_forward_scaleadd_scaleadd_relu.hpp | 12 +- ...elu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp | 8 +- ...relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp | 8 +- ...relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp | 8 +- ...elu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp | 8 +- .../conv2d_fwd/conv2d_quantization_common.hpp | 6 +- 15 files changed, 371 insertions(+), 55 deletions(-) create mode 100644 example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp diff --git a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc index c72c72971d..e8f5529520 100644 --- a/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc +++ b/client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc @@ -16,6 +16,7 @@ using InLayout = ck::tensor_layout::convolution::NDHWGC; using WeiLayout = ck::tensor_layout::convolution::GKZYXC; using OutLayout = ck::tensor_layout::convolution::NDHWGK; +using BiasLayout = ck::tensor_layout::convolution::G_K; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu; @@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() std::array out_lengths{G, N, K, Do, Ho, Wo}; std::array out_strides{ K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; + // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW) + std::array bias_lengths{G, 1, K, 1, 1, 1}; + std::array bias_strides{K, 0, 1, 0, 0, 0}; std::array filter_strides{1, 1, 1}; std::array filter_dilations{1, 1, 1}; @@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C); SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K); SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K); - SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K); + SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K); using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< NumDimSpatial, InLayout, WeiLayout, - ck::Tuple, + ck::Tuple, OutLayout, InDataType, WeiDataType, @@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() in_strides, wei_lengths, wei_strides, - {out_lengths, out_lengths}, - {out_strides, out_strides}, + {out_lengths, bias_lengths}, + {out_strides, bias_strides}, out_lengths, out_strides, filter_strides, @@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() in_strides, wei_lengths, wei_strides, - {out_lengths, out_lengths}, - {out_strides, out_strides}, + {out_lengths, bias_lengths}, + {out_strides, bias_strides}, out_lengths, out_strides, filter_strides, diff --git a/example/62_conv_fwd_activ/CMakeLists.txt b/example/62_conv_fwd_activ/CMakeLists.txt index bb95602416..d1f26bbfe1 100644 --- a/example/62_conv_fwd_activ/CMakeLists.txt +++ b/example/62_conv_fwd_activ/CMakeLists.txt @@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS) # ScaleAdd ScaleAdd Relu add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp) add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16) + add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp) + add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16) set(target 1) endif() endforeach() diff --git a/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp new file mode 100644 index 0000000000..196636f8b5 --- /dev/null +++ b/example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +constexpr ck::index_t NDimSpatial = 3; +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using OutDataType = ck::half_t; + +template +using S = ck::Sequence; + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +using BiasLayout = ck::tensor_layout::convolution::G_K; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + +using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +using DeviceGroupedConvNDFwdActivInstance = DeviceGroupedConvNDFwdInstance; + +namespace { +// Use custom implementation to pass two more tensors for post op +template +bool run_grouped_conv_fwd(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + constexpr ck::index_t NumDs = 2; + const ck::index_t G = out_g_n_k_wos_desc.GetLengths()[0]; + const ck::index_t K = out_g_n_k_wos_desc.GetLengths()[2]; + + // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW) + std::array bias_g_k_lengths; + std::array bias_g_k_strides; + // Fill other lenghts than G,K with 1 and strides with 0 + bias_g_k_lengths.fill(1); + bias_g_k_strides.fill(0); + bias_g_k_lengths[0] = G; + bias_g_k_lengths[2] = K; + bias_g_k_strides[0] = K; // stride to G + bias_g_k_strides[2] = 1; // stride to K + const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides); + + // y = relu ( alpha1 * conv(x) + alpha2 * z + bias ) + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + std::array, NumDs> d_tensors = {Tensor(out_g_n_k_wos_desc), + Tensor(broadcasted_bias_desc)}; + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl; + std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl; + + // Make sure that we allocated only G * K values for bias + assert(static_cast(d_tensors[1].mData.size()) == G * K); + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + wei.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d_tensors[0].GenerateTensorValue(GeneratorTensor_2{-2, 2}); + d_tensors[1].GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + d_tensors[0].GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + d_tensors[1].GenerateTensorValue(GeneratorTensor_3{-0.05, 0.05}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize()); + DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + z_buf.ToDevice(d_tensors[0].mData.data()); + bias_buf.ToDevice(d_tensors[1].mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + const std::array ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()}; + + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + ds, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, NumDs>{ + e_g_n_k_wos_lengths, bias_g_k_lengths}, + std::array, NumDs>{ + e_g_n_k_wos_strides, bias_g_k_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error("The device op with the specified compilation parameters does " + "not support this convolution problem."); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops() + G * K + + conv_param.GetOutputByte() / sizeof(OutDataType); + std::size_t num_btype = conv_param.GetByte() + + G * K * sizeof(OutDataType) + conv_param.GetOutputByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(do_verification) + { + auto ref_conv = + ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op, + {}, + {}, + d_tensors); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err(out_device, out_host, "Error: incorrect results!"); + } + + return true; +} + +} // namespace + +#include "run_convnd_fwd_activ_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_example(argc, argv); } diff --git a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc index 7c20c01066..aa547c870a 100644 --- a/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc +++ b/example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc @@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) // Following shapes are selected to avoid overflow. Expect inf in case of // size increase for some elementwise ops. ck::utils::conv::ConvParam conv_param{ - 3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; + 3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; if(argc == 1) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 26224b5dec..4afef85d8c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle return out_gemmm_gemmn_desc; } + // Shape of Ds and E must be aligned. Strides can be different. + // Pass e_g_n_k_wos_lengths for logical broadcast. static auto MakeDsGridDescriptor_M_N( - const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_lengths, const std::array, NumDTensor>& ds_g_n_k_wos_strides) { return generate_tuple( [&](auto i) { using DLayout = remove_cvref_t>; - return DeviceOp::MakeEGridDescriptor_M_N(ds_g_n_k_wos_lengths[i], + return DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]); }, Number{}); @@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle // D desc ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N( - ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]); + e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]); }); compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; @@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || - is_same_v || is_same_v || - is_same_v) + is_same_v || is_same_v) { const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; @@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { valid = false; } + + if constexpr(is_same_v) + { + // G and K must be the same + if(arg.ds_g_n_k_wos_lengths_[i][0] != arg.e_g_n_k_wos_lengths_[0] || + arg.ds_g_n_k_wos_lengths_[i][2] != arg.e_g_n_k_wos_lengths_[2]) + { + valid = false; + } + } + else + { + // E and D must have the same shape + for(index_t d = 0; d < NDimSpatial + 3; d++) + { + if(arg.ds_g_n_k_wos_lengths_[i][d] != arg.e_g_n_k_wos_lengths_[d]) + { + valid = false; + } + } + } } else { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 80a5d0e97a..0050a5b281 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || is_same_v || - is_same_v || is_same_v || - is_same_v) + is_same_v || is_same_v) { const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp index b2d141fd61..ecc71ba2f2 100644 --- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp +++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp @@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout static constexpr const char* name = "GNDHWK"; }; -// for output bias -struct GK : public BaseTensorLayout -{ - static constexpr const char* name = "GK"; -}; - // output tensor // packed NWGK/NHWGK/NDHWGK struct NWGK : public BaseTensorLayout diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index 6f546f1d6d..e2f75142d4 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -522,22 +522,21 @@ struct TransformConvFwdToGemm // for output bias template || - is_same_v, + typename std::enable_if, bool>::type = false> - static auto - MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, - const std::array& /* c_g_n_k_wos_strides */) + static auto MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides) { - const index_t N = c_g_n_k_wos_lengths[1]; - const index_t K = c_g_n_k_wos_lengths[2]; + const index_t N = c_g_n_k_wos_lengths[1]; + const index_t K = c_g_n_k_wos_lengths[2]; + const index_t KStride = c_g_n_k_wos_strides[2]; const index_t NHoWo = N * ck::accumulate_n( c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); const auto out_gemmm_gemmn_desc = - make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1)); + make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride)); return out_gemmm_gemmn_desc; } diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 89b8b9667f..dc47c7ec1a 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -86,9 +86,9 @@ using NHWGK = ck::tensor_layout::convolution::NHWGK; using NDHWGK = ck::tensor_layout::convolution::NDHWGK; // -using GK = ck::tensor_layout::convolution::G_K; -using GK_Tuple = ck::Tuple; -using GK_GK_Tuple = ck::Tuple; +using G_K = ck::tensor_layout::convolution::G_K; +using GK_Tuple = ck::Tuple; +using GK_GK_Tuple = ck::Tuple; // pointwise functor using PassThrough = ck::tensor_operation::element_wise::PassThrough; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp index dc9f44dc86..efb6266426 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp @@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, BF16, BF16, @@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F16, F16, @@ -59,7 +59,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F32, F32, @@ -75,7 +75,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, int8_t, int8_t, @@ -130,7 +130,9 @@ struct DeviceOperationInstanceFactory> op_ptrs; if constexpr(NumDimSpatial == 3 && is_same_v && - is_same_v && is_same_v) + is_same_v && is_same_v && + DLayouts::Size() == 2 && is_same_v, NDHWGK> && + is_same_v, G_K>) { #ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp index c6627a4825..7d2df94ad7 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp @@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, BF16, BF16, @@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp index 627af24d7b..8a09d03967 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp @@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F16, F16, @@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp index 1fd567e360..6966959639 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp @@ -13,7 +13,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, F32, F32, @@ -28,7 +28,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -36,7 +36,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -44,7 +44,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp index dae292891c..2606f69428 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_scaleadd_scaleadd_relu/xdl/device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp @@ -12,7 +12,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw std::vector, + ck::Tuple, NDHWGK, int8_t, int8_t, @@ -27,7 +27,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwdDefault>{}); add_device_operation_instances( @@ -35,7 +35,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1P0>{}); add_device_operation_instances( @@ -43,7 +43,7 @@ void add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhw device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances<3, NDHWGC, GKZYXC, - ck::Tuple, + ck::Tuple, NDHWGK, ConvFwd1x1S1P0>{}); } diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp index 711314985a..d46fe090b8 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp +++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp @@ -22,13 +22,13 @@ using S = ck::Sequence; using NHWGC = ck::tensor_layout::convolution::NHWGC; using GKYXC = ck::tensor_layout::convolution::GKYXC; using NHWGK = ck::tensor_layout::convolution::NHWGK; -using GK = ck::tensor_layout::convolution::G_K; +using G_K = ck::tensor_layout::convolution::G_K; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using Relu = ck::tensor_operation::element_wise::Relu; using TanH = ck::tensor_operation::element_wise::TanH; -using GK_Tuple = ck::Tuple; -using GK_GK_Tuple = ck::Tuple; +using GK_Tuple = ck::Tuple; +using GK_GK_Tuple = ck::Tuple; using I32_Tuple = ck::Tuple; using F32_Tuple = ck::Tuple; using I32_F32_Tuple = ck::Tuple; From b4dcd5803f1dae92467d39c31f176131ce796735 Mon Sep 17 00:00:00 2001 From: Nicolas Macchioni Date: Fri, 8 Dec 2023 11:30:01 -0800 Subject: [PATCH 08/18] Add F8 dtype definition in f16_f8_f16 gemm instances (#1092) --- .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp | 1 + .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp index 3c9e03b674..38667ad42b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp @@ -16,6 +16,7 @@ namespace tensor_operation { namespace device { namespace instance { +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp index aab0af990d..820404e064 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp @@ -16,6 +16,7 @@ namespace tensor_operation { namespace device { namespace instance { +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; From f199035b748331901b7e0d58cbcd88e108bdcadd Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:32:37 -0800 Subject: [PATCH 09/18] fix clang format (#1095) --- .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp | 2 +- .../device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp index 38667ad42b..b3d1e925df 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instance.cpp @@ -16,7 +16,7 @@ namespace tensor_operation { namespace device { namespace instance { -using F8 = ck::f8_t; +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp index 820404e064..9c80995949 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instance.cpp @@ -16,7 +16,7 @@ namespace tensor_operation { namespace device { namespace instance { -using F8 = ck::f8_t; +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; From 89ee47460bedd3e028a6240c2c395023cb233f4c Mon Sep 17 00:00:00 2001 From: Bartlomiej Wroblewski Date: Mon, 11 Dec 2023 17:12:32 +0100 Subject: [PATCH 10/18] Fix IsSupported check in the contraction op (#1066) Current implementation of IsSupported method in contraction ops does not cover a lot of possible cases in which ScalarPerVector cannot really be used to read A, B or D, or write E. This PR extends both the regular and multiABD contraction ops with improved checks and also adds new instances with smaller values of ScalarPerVector to support instances that are not supported by other instances. --- ..._contraction_multiple_abd_xdl_cshuffle.hpp | 153 ++++++++-------- ...ce_contraction_multiple_d_xdl_cshuffle.hpp | 163 ++++++++---------- .../device/impl/device_contraction_utils.hpp | 87 ++++++++++ .../device_contraction_instance.hpp | 24 ++- 4 files changed, 261 insertions(+), 166 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp index 29d7a2b949..0c8e11a17b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp @@ -14,6 +14,7 @@ #include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle // for sanity check of vector memory access for(index_t i = 0; i < NumATensor; ++i) { - a_mz_stride_[i] = a_ms_ks_strides[i][NumDimM - 1]; - a_kz_stride_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1]; + as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1; + as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1; + as_max_read_elems_[i] = + CalculateMaxRead(a_ms_ks_lengths[i], a_ms_ks_strides[i]); } for(index_t i = 0; i < NumBTensor; ++i) { - b_nz_stride_[i] = b_ns_ks_strides[i][NumDimN - 1]; - b_kz_stride_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1]; + bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1; + bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1; + bs_max_read_elems_[i] = + CalculateMaxRead(b_ns_ks_lengths[i], b_ns_ks_strides[i]); } for(index_t i = 0; i < NumDTensor; ++i) { - ds_nz_stride_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1]; + ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1; + ds_max_read_elems_[i] = + CalculateMaxRead(d_ms_ns_lengths[i], d_ms_ns_strides[i]); } - e_nz_stride_ = e_ms_ns_stride[NumDimM + NumDimN - 1]; + e_nz_consecutive_ = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1; + e_max_write_elems_ = CalculateMaxRead(e_ms_ns_length, e_ms_ns_stride); } // pointers @@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle BElementwiseOperation b_element_op_; CDEElementwiseOperation cde_element_op_; - // Strides for the last M/N/K dimensions of A/B/Ds/E - // for sanity check of vector load/store - std::array a_mz_stride_; - std::array a_kz_stride_; + // Describe whether the last part of a given dimension of A/B/D/E is consecutive + // in the memory or not. + std::array as_mz_consecutive_; + std::array as_kz_consecutive_; + std::array bs_nz_consecutive_; + std::array bs_kz_consecutive_; + std::array ds_nz_consecutive_; + bool e_nz_consecutive_; - std::array b_nz_stride_; - std::array b_kz_stride_; - - std::array ds_nz_stride_; - index_t e_nz_stride_; + std::array as_max_read_elems_; + std::array bs_max_read_elems_; + std::array ds_max_read_elems_; + index_t e_max_write_elems_; }; // Invoker @@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle // check vector load/store { - bool all_valid = true; - + bool valid_as_access = true; static_for<0, NumATensor, 1>{}([&](auto i) { - // vector memory access of A: could be on M or AK1 dimension - if constexpr(ABlockTransferSrcVectorDim == 1) + const bool valid_a_vector_size = + arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0; + const bool valid_a_access_dim_m = + ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i]; + const bool valid_a_access_dim_k = + ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i]; + const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k; + if(!(valid_a_vector_size && valid_a_access_dim)) { - if(!(arg.a_mz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) % - ABlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } - } - else - { - if(!(arg.a_kz_stride_[i] == 1 && arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) % - ABlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } + valid_as_access = false; } }); - - // vector memory access of B: could be on N or BK1 dimension - static_for<0, NumBTensor, 1>{}([&](auto i) { - if constexpr(BBlockTransferSrcVectorDim == 1) - { - if(!(arg.b_nz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) % - BBlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } - } - else - { - if(!(arg.b_kz_stride_[i] == 1 && arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) % - BBlockTransferSrcScalarPerVector == - 0)) - { - all_valid = false; - } - } - }); - - // check vector load of Ds - static_for<0, NumDTensor, 1>{}([&](auto i) { - if(!(arg.ds_nz_stride_[i] == 1 && - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) - { - all_valid = false; - } - }); - - // vector memory access of E: always on NPerBlock dimension - if(!(arg.e_nz_stride_ == 1 && - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) + if(!valid_as_access) { - all_valid = false; + return false; } - if(!all_valid) + bool valid_bs_access = true; + static_for<0, NumBTensor, 1>{}([&](auto i) { + const bool valid_b_vector_size = + arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0; + const bool valid_b_access_dim_n = + BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i]; + const bool valid_b_access_dim_k = + BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i]; + const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k; + if(!(valid_b_vector_size && valid_b_access_dim)) + { + valid_bs_access = false; + } + }); + if(!valid_bs_access) + { + return false; + } + + bool valid_ds_access = true; + static_for<0, NumDTensor, 1>{}([&](auto i) { + const bool valid_d_vector_size = + arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector read of Ds is always on N dimension. + const bool valid_d_access_dim = arg.ds_nz_consecutive_[i]; + if(!(valid_d_vector_size && valid_d_access_dim)) + { + valid_ds_access = false; + } + }); + if(!valid_ds_access) + { + return false; + } + + const bool valid_e_vector_size = + arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector write of E is always on N dimension. + const bool valid_e_access_dim = arg.e_nz_consecutive_; + if(!(valid_e_vector_size && valid_e_access_dim)) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp index 71ff2ba17d..290abe221a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp @@ -13,6 +13,7 @@ #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle return generate_tuple([&](auto i) { return vec[i]; }, num); }; - const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number{}); + const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number{}); const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number{}); // dimension Ids for M0, M1, ... @@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle typename arithmetic_sequence_gen::type{}; // lengths for M0, M1, ... - const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds); + const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds); // lengths for K0, K1, ... - const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds); + const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] const auto a_grid_desc_ms_ks = - make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides); + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( @@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle const void* p_b_grid, std::array p_ds_grid, void* p_e_grid, - const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_lengths, const std::vector& a_ms_ks_strides, const std::vector& b_ns_ks_lengths, const std::vector& b_ns_ks_strides, @@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle p_b_grid_{static_cast(p_b_grid)}, p_ds_grid_{}, p_e_grid_{static_cast(p_e_grid)}, - a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ks_lengths, a_ms_ks_strides)}, b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)}, ds_grid_desc_m_n_{}, e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)}, @@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, a_element_op_{a_element_op}, b_element_op_{b_element_op}, - cde_element_op_{cde_element_op}, - a_mz_stride_{}, - a_kz_stride_{}, - b_nz_stride_{}, - b_kz_stride_{}, - ds_nz_stride_{}, - e_nz_stride_{} + cde_element_op_{cde_element_op} { // populate pointer, batch stride, desc for Ds static_for<0, NumDTensor, 1>{}([&](auto i) { @@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle } // for sanity check of vector memory access - a_mz_stride_ = a_ms_ks_strides[NumDimM - 1]; - a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1]; + a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1; + a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1; + a_max_read_elems_ = + CalculateMaxRead(a_ms_ks_lengths, a_ms_ks_strides); - b_nz_stride_ = b_ns_ks_strides[NumDimN - 1]; - b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1]; + b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1; + b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1; + b_max_read_elems_ = + CalculateMaxRead(b_ns_ks_lengths, b_ns_ks_strides); for(index_t i = 0; i < NumDTensor; ++i) { - ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1]; + ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1; + ds_max_read_elems_[i] = + CalculateMaxRead(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]); } - e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1]; + e_nz_consecutive_ = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1; + e_max_write_elems_ = + CalculateMaxRead(e_ms_ns_lengths, e_ms_ns_strides); } void Print() const @@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle BElementwiseOperation b_element_op_; CDEElementwiseOperation cde_element_op_; - // Strides for the last M/N/K dimensions of A/B/Ds/E - // for sanity check of vector load/store - index_t a_mz_stride_; - index_t a_kz_stride_; - index_t b_nz_stride_; - index_t b_kz_stride_; - std::array ds_nz_stride_; - index_t e_mz_stride_; - index_t e_nz_stride_; + // Describe whether the last part of a given dimension of A/B/D/E is consecutive + // in the memory or not. + bool a_mz_consecutive_; + bool a_kz_consecutive_; + bool b_nz_consecutive_; + bool b_kz_consecutive_; + std::array ds_nz_consecutive_; + bool e_nz_consecutive_; + + index_t a_max_read_elems_; + index_t b_max_read_elems_; + std::array ds_max_read_elems_; + index_t e_max_write_elems_; }; // Invoker @@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), "wrong!"); - // vector memory access of A: could be on M or AK1 dimension - if constexpr(ABlockTransferSrcVectorDim == 1) - { - if(!(arg.a_mz_stride_ == 1 && - arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - else - { - if(!(arg.a_kz_stride_ == 1 && - arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - - // vector memory access of B: could be on N or BK1 dimension - if constexpr(BBlockTransferSrcVectorDim == 1) - { - if(!(arg.b_nz_stride_ == 1 && - arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - else - { - if(!(arg.b_kz_stride_ == 1 && - arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) - { - return false; - } - } - - // vector memory access of Ds: always on NPerBlock dimension - bool valid_d_access = true; - - static_for<0, NumDTensor, 1>{}([&](auto i) { - if(!(arg.ds_nz_stride_[i] == 1 && - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) - { - valid_d_access = false; - } - }); - - if(valid_d_access == false) + const bool valid_a_vector_size = + arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0; + const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_; + const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_; + const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k; + if(!(valid_a_vector_size && valid_a_access_dim)) { return false; } - // vector memory access of E: always on NPerBlock dimension - if(!(arg.e_nz_stride_ == 1 && - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % - CDEBlockTransferScalarPerVector_NPerBlock == - 0)) + const bool valid_b_vector_size = + arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0; + const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_; + const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_; + const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k; + if(!(valid_b_vector_size && valid_b_access_dim)) + { + return false; + } + + bool valid_ds_access = true; + static_for<0, NumDTensor, 1>{}([&](auto i) { + const bool valid_d_vector_size = + arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector read of Ds is always on N dimension. + const bool valid_d_access_dim = arg.ds_nz_consecutive_[i]; + if(!(valid_d_vector_size && valid_d_access_dim)) + { + valid_ds_access = false; + } + }); + if(!valid_ds_access) + { + return false; + } + + const bool valid_e_vector_size = + arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0; + // Vector write of E is always on N dimension. + const bool valid_e_access_dim = arg.e_nz_consecutive_; + if(!(valid_e_vector_size && valid_e_access_dim)) { return false; } @@ -692,7 +681,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle const void* p_b, std::array p_ds, void* p_e, - const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_lengths, const std::vector& a_ms_ks_strides, const std::vector& b_ns_ks_lengths, const std::vector& b_ns_ks_strides, @@ -708,7 +697,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle p_b, p_ds, p_e, - a_ms_ns_lengths, + a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, @@ -729,7 +718,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle const void* p_b, std::array p_ds, void* p_e, - const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_lengths, const std::vector& a_ms_ks_strides, const std::vector& b_ns_ks_lengths, const std::vector& b_ns_ks_strides, @@ -745,7 +734,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle p_b, p_ds, p_e, - a_ms_ns_lengths, + a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp new file mode 100644 index 0000000000..0e14b40942 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/** + * Calculates the maximum number of subsequent elements of the fast changing dimension + * that are consecutive in memory. + * + * Example: + * NumDimM = 2, NumDimK = 3 + * A shape = [ 2, 3, 4, 5, 6] + * A strides = [360, 120, 30, 6, 1] + * | M | | K | + * It follows from strides that K is FCD and all the subsequent elements of K are consecutive + * in memory. + * But if strides were [360, 120, 6, 24, 1], then only 6 subsequent elements of K would be + * consecutive in memory. + * + * Assumes that the dimensions are split into two groups of `NumDim1` and `NumDim2` dimensions. + */ +template +auto CalculateMaxRead(const std::vector& lengths, const std::vector& strides) +{ + if(lengths.size() != NumDim1 + NumDim2) + { + std::ostringstream err; + err << "Incorrect number of lengths in " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + if(strides.size() != NumDim1 + NumDim2) + { + std::ostringstream err; + err << "Incorrect number of strides in " << __FILE__ << ":" << __LINE__ + << ", in function: " << __func__; + throw std::runtime_error(err.str()); + } + + // Determine the beginning and end idx of the group representing the FCD. + index_t begin_idx, end_idx; + if(strides[NumDim1 - 1] == 1) + { + begin_idx = 0; + end_idx = NumDim1 - 1; + } + else if(strides[NumDim1 + NumDim2 - 1] == 1) + { + begin_idx = NumDim1; + end_idx = NumDim1 + NumDim2 - 1; + } + else + { + // The dimension consecutive in memory is not the last dimension of any group, so only + // one element can be read/written at once. + return 1; + } + + index_t consecutive_stride = 1; + for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx) + { + if(strides[dim_idx] == consecutive_stride) + { + consecutive_stride *= lengths[dim_idx]; + } + else + { + break; + } + } + const index_t max_subsequent_elems = consecutive_stride; + return max_subsequent_elems; +} + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp index b43d34d69a..b67119ad19 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp @@ -61,7 +61,11 @@ using device_contraction_kk_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; @@ -96,7 +100,11 @@ using device_contraction_kn_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; @@ -131,7 +139,11 @@ using device_contraction_mk_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; @@ -166,7 +178,11 @@ using device_contraction_mn_instance = std::tuple< DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType> + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, ComputeDataType>, + // Small scalar per vector + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 1, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 8>, 2, ComputeDataType>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementwiseOp, BElementwiseOp, CDEElementwiseOp, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 8, 1, 8>, 1, ComputeDataType> // clang-format on >; From c004e0d99048d76c40da81c4dd2a36921cee0293 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:49:27 -0800 Subject: [PATCH 11/18] disabling some fp8 gemm instances to reduce build time (#1084) * disabling some fp8 gemm instances to reduce build time * disable fp8 gemm instances to reduce build time * remove the unused variable * build fp8 gemm default and padded instances separately * fix include pathsc --- ...shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp} | 20 +++----------- .../tensor_operation_instance/gpu/gemm.hpp | 9 +++++-- .../gpu/gemm/CMakeLists.txt | 3 ++- ..._fp8_fp8_fp8_mk_kn_mn_default_instance.cpp | 26 +++++++++++++++++++ ...e_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp | 26 +++++++++++++++++++ 5 files changed, 64 insertions(+), 20 deletions(-) rename library/{src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp => include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp} (96%) create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp similarity index 96% rename from library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp rename to library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp index 82eae9f0a2..005cec94ec 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp @@ -25,10 +25,6 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // Compilation parameters for a[m, k] * b[k, n] = c[m, n] template using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< @@ -37,7 +33,7 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // pipeline v1, 1 wave + // pipeline v1, 1 wave DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>, DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 128, 256, 64, 16, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v1>, @@ -75,7 +71,8 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Interwave, PipelineVersion::v1> #endif -#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES +#if 0 + //CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES // pipeline v2, 1 wave , DeviceGemm_Xdl_CShuffle< Row, Row, Row, F8, F8, F8, F32, F8, PassThrough, PassThrough, PassThrough, GemmSpec, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, LoopScheduler::Default, PipelineVersion::v2>, @@ -98,17 +95,6 @@ using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances = std::tuple< // clang-format on >; -void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); - - add_device_operation_instances( - instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp index bbc70f1a5b..626dd7f00a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -345,7 +345,11 @@ void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances( std::vector>>& instances); -void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances( +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances( + std::vector>>& instances); + +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances( std::vector>>& instances); @@ -575,7 +579,8 @@ struct DeviceOperationInstanceFactory< if constexpr(is_same_v && is_same_v && is_same_v) { - add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index d0bcacbe3c..3532c3f4ba 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -101,7 +101,8 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp) list(APPEND GEMM_INSTANCES - device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp + device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp new file mode 100644 index 0000000000..baa76a74af --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp" + +#ifdef CK_ENABLE_FP8 +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_default_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp new file mode 100644 index 0000000000..f16809db28 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.hpp" + +#ifdef CK_ENABLE_FP8 +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_padded_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif From 6891e4d10965513657d531c3c8c2048aaba34b05 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:27:31 -0600 Subject: [PATCH 12/18] Fix the bugs (#1099) --- include/ck/utility/type_convert.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index 70bc6f278c..11db866152 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -182,7 +182,7 @@ inline __host__ __device__ bf8_t f8_convert_sr(half_t x) { #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // convert to float and use native converion - return f8_convert_sr(type_convert(x)); + return f8_convert_sr(type_convert(x)); #else constexpr bool negative_zero_nan = true; constexpr bool clip = true; @@ -295,7 +295,7 @@ inline __host__ __device__ bf8_t f8_convert_rne(half_t x) template <> inline __host__ __device__ f8_t type_convert(float x) { -#if defined CK_USE_SR_F8_CONVERSION +#if CK_USE_SR_F8_CONVERSION return f8_convert_sr(x); #else return f8_convert_rne(x); @@ -352,10 +352,10 @@ inline __host__ __device__ half2_t type_convert(float2_t x) template <> inline __host__ __device__ f8_t type_convert(half_t x) { -#if defined CK_USE_SR_F8_CONVERSION +#if CK_USE_SR_F8_CONVERSION return f8_convert_sr(x); #else - return f8_convert_nre(x); + return f8_convert_rne(x); #endif } @@ -376,7 +376,7 @@ inline __host__ __device__ half_t type_convert(f8_t x) template <> inline __host__ __device__ bf8_t type_convert(float x) { -#if defined CK_USE_SR_F8_CONVERSION +#if CK_USE_SR_F8_CONVERSION return f8_convert_sr(x); #else return f8_convert_rne(x); @@ -403,7 +403,7 @@ inline __host__ __device__ float type_convert(bf8_t x) template <> inline __host__ __device__ bf8_t type_convert(half_t x) { -#if defined CK_USE_SR_F8_CONVERSION +#if CK_USE_SR_F8_CONVERSION return f8_convert_sr(x); #else return f8_convert_rne(x); From 3a3b98ef79d967391840a202a8ddf7b3d05ba823 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Wed, 13 Dec 2023 12:50:15 -0800 Subject: [PATCH 13/18] [Doc][Werror] Fix security alerts and sync with MIOpen (#1085) * fix Werror unused-parameter * sync doc requirements * fix blank space format * fix dependency issue --- docs/sphinx/requirements.txt | 16 ++++++++-------- .../gpu/grid/gridwise_tensor_rearrange.hpp | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 01cb32e714..75863c214e 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -16,7 +16,7 @@ beautifulsoup4==4.11.2 # via pydata-sphinx-theme breathe==4.34.0 # via rocm-docs-core -certifi==2022.12.7 +certifi==2023.7.22 # via requests cffi==1.15.1 # via @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==40.0.2 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub @@ -42,7 +42,7 @@ fastjsonschema==2.18.0 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests @@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3 # via # rocm-docs-core # sphinx-book-theme -pygithub==1.58.2 +pygithub==1.58.1 # via rocm-docs-core -pygments==2.14.0 +pygments==2.15.0 # via # accessible-pygments # pydata-sphinx-theme @@ -109,7 +109,7 @@ pyyaml==6.0 # pybtex # rocm-docs-core # sphinx-external-toc -requests==2.28.2 +requests==2.31.0 # via # pygithub # sphinx @@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1 # via rocm-docs-core sphinx-copybutton==0.5.1 # via rocm-docs-core -sphinx-design==0.3.0 +sphinx-design==0.4.1 # via rocm-docs-core sphinx-external-toc==0.3.1 # via rocm-docs-core @@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme -urllib3==1.26.15 +urllib3==1.26.18 # via requests wrapt==1.15.0 # via deprecated diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp index f77ffff350..9535ca69a9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp @@ -50,7 +50,9 @@ __global__ void ignore = p_in_global; ignore = out_grid_desc; ignore = p_out_global; + ignore = batch_count; ignore = block_2_tile_map; + ignore = compute_ptr_offset_of_batch; #endif } From 281f8369033366669fbabe05ed9622c1370c4a71 Mon Sep 17 00:00:00 2001 From: Lisa Date: Thu, 14 Dec 2023 15:21:18 -0700 Subject: [PATCH 14/18] fix typo (#1067) Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 7679607e69..4889914691 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,6 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` You can find a list of our developers and contributors on our [Contributors](/CONTRIBUTORS.md) page. -page. ```note If you use CK, cite us as follows: From efaf31061a00a9c17a888ddbf2e273aafe977d5e Mon Sep 17 00:00:00 2001 From: trixirt Date: Thu, 14 Dec 2023 17:26:41 -0800 Subject: [PATCH 15/18] cmake: Add CK_PARALLEL_LINK_JOBS and CK_PARALLEL_COMPILE_JOBS options (#1063) Copied from the llvm-project LLVM_PARALLEL_*_JOBS Concurrent linking can break the build as well as having too many compile jobs for the avaiable memory. These options allow the user to fine tune the build to fit within their machines memory constraints. An example use on linux is COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'` if [ ${COMPILE_JOBS}x = x ]; then COMPILE_JOBS=1 fi BUILD_MEM=4 MEM_KB=0 MEM_KB=`cat /proc/meminfo | grep MemTotal | awk '{ print $2 }'` MEM_MB=`eval "expr ${MEM_KB} / 1024"` MEM_GB=`eval "expr ${MEM_MB} / 1024"` COMPILE_JOBS_MEM=`eval "expr 1 + ${MEM_GB} / ${BUILD_MEM}"` if [ "$COMPILE_JOBS_MEM" -lt "$COMPILE_JOBS" ]; then COMPILE_JOBS=$COMPILE_JOBS_MEM fi LINK_MEM=32 LINK_JOBS=`eval "expr 1 + ${MEM_GB} / ${LINK_MEM}"` cmake -G Ninja -DCK_PARALLEL_LINK_JOBS=$LINK_JOBS -DCK_PARALLEL_COMPILE_JOBS=$COMPILE_JOBS Signed-off-by: Tom Rix --- CMakeLists.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e780c15657..4e4b9d8d4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,6 +146,33 @@ if(${hip_VERSION_FLAT} GREATER 500723302) add_compile_options(-fno-offload-uniform-block) endif() +# +# Seperate linking jobs from compiling +# Too many concurrent linking jobs can break the build +# Copied from LLVM +set(CK_PARALLEL_LINK_JOBS "" CACHE STRING + "Define the maximum number of concurrent link jobs (Ninja only).") +if(CMAKE_GENERATOR MATCHES "Ninja") + if(CK_PARALLEL_LINK_JOBS) + set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${CK_PARALLEL_LINK_JOBS}) + set(CMAKE_JOB_POOL_LINK link_job_pool) + endif() +elseif(CK_PARALLEL_LINK_JOBS) + message(WARNING "Job pooling is only available with Ninja generators.") +endif() +# Similar for compiling +set(CK_PARALLEL_COMPILE_JOBS "" CACHE STRING + "Define the maximum number of concurrent compile jobs (Ninja only).") +if(CMAKE_GENERATOR MATCHES "Ninja") + if(CK_PARALLEL_COMPILE_JOBS) + set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${CK_PARALLEL_COMPILE_JOBS}) + set(CMAKE_JOB_POOL_COMPILE compile_job_pool) + endif() +elseif(CK_PARALLEL_COMPILE_JOBS) + message(WARNING "Job pooling is only available with Ninja generators.") +endif() + + option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF) From 07092d68f0b13560caf3cbe762a9a799d13cdc0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Fri, 15 Dec 2023 12:45:08 +0100 Subject: [PATCH 16/18] Add tensor structure to wrapper (#1098) * Add tensor structure to wrapper * update changelog * Fix names * Comment fixes --- CHANGELOG.md | 2 +- docs/wrapper.rst | 39 ++- include/ck/wrapper/layout.hpp | 178 +++++++--- include/ck/wrapper/tensor.hpp | 314 ++++++++++++++++++ .../ck/wrapper/{ => utils}/layout_utils.hpp | 62 ++-- include/ck/wrapper/utils/tensor_utils.hpp | 290 ++++++++++++++++ test/wrapper/CMakeLists.txt | 2 + test/wrapper/test_layout.cpp | 16 +- test/wrapper/test_tensor.cpp | 205 ++++++++++++ 9 files changed, 1020 insertions(+), 88 deletions(-) create mode 100644 include/ck/wrapper/tensor.hpp rename include/ck/wrapper/{ => utils}/layout_utils.hpp (86%) create mode 100644 include/ck/wrapper/utils/tensor_utils.hpp create mode 100644 test/wrapper/test_tensor.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 3da22fc790..2891b8585b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ None - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for Batched Gemm DL (#732) -- Introduce wrapper sublibrary (limited functionality) (#1071) +- Introduce wrapper sublibrary (limited functionality). (#1071, #1098) ### Changes - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) diff --git a/docs/wrapper.rst b/docs/wrapper.rst index 64fb6a4031..a2f60b97ae 100644 --- a/docs/wrapper.rst +++ b/docs/wrapper.rst @@ -13,7 +13,7 @@ Description CK provides a lightweight wrapper for more complex operations implemented in the library. It allows indexing of nested layouts using a simple interface -(avoiding complex descriptor transformations). +(avoiding complex descriptor transformations) and memory access (using Tensor). Example: @@ -22,24 +22,31 @@ Example: const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8); + + std::array data; + auto tensor = ck::wrapper::make_tensor(&data[0], layout); - std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl; - for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++) + for(ck::index_t w = 0; w < size(tensor); w++) { + tensor(w) = w; + } + + // slice() == slice(0, -1) (whole dimension) + auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice())); + std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl; + for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++) { - for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) + for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++) { - std::cout << layout(ck::make_tuple(h, w)) << " "; + std::cout << tensor_slice(h, w) << " "; } std::cout << std::endl; } Output:: - dims:4,(2,4) strides:2,(1,8) - 0 1 8 9 16 17 24 25 - 2 3 10 11 18 19 26 27 - 4 5 12 13 20 21 28 29 - 6 7 14 15 22 23 30 31 + dims:2,(2,4) strides:2,(1,8) + 1 5 9 13 17 21 25 29 + 2 6 10 14 18 22 26 30 ------------------------------------- Layout @@ -52,3 +59,15 @@ Layout helpers ------------------------------------- .. doxygenfile:: layout_utils.hpp + +------------------------------------- +Tensor +------------------------------------- + +.. doxygenstruct:: ck::wrapper::Tensor + +------------------------------------- +Tensor helpers +------------------------------------- + +.. doxygenfile:: tensor_utils.hpp diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index b337d88a1a..f20d985b49 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -3,7 +3,7 @@ #pragma once -#include "ck/wrapper/layout_utils.hpp" +#include "ck/wrapper/utils/layout_utils.hpp" namespace ck { namespace wrapper { @@ -25,6 +25,26 @@ struct Layout static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; + // Generate default idxs tuple (idx with all merged nested shapes) + template + __host__ __device__ constexpr static auto GenerateDefaultIdxsTuple(const Tuple&) + { + return generate_tuple( + [&](auto) { + if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) + { + // runtime layout + return index_t(0); + } + else + { + // compiletime layout + return I0; + } + }, + Number::Size()>{}); + } + // Generate packed (column-major) strides if not passed template __host__ __device__ constexpr static auto @@ -131,7 +151,7 @@ struct Layout template __host__ __device__ constexpr static auto MakeMerge1d(const Tuple& shape, - DescriptorToMerge& desc) + const DescriptorToMerge& desc) { // Reverse each element in tuple const auto merge_elems = TupleReverse(UnrollNestedTuple(shape)); @@ -144,7 +164,7 @@ struct Layout desc, make_tuple(make_merge_transform(merge_elems)), lower_dims, upper_dims); } - // Merge nested shape dims. Merge nested shape dims when idx is also nested. + // Merge nested shape dims when corresponding index is also nested. // Input desc shape: 2, 2, 2, 2, 2, 2 // Example idx: 1, 1, 1, 1 // Example shape: 2, (2, 2), 2, (2, 2) @@ -187,14 +207,38 @@ struct Layout return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims); } + template + __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape, + const LayoutStrides& strides) + { + const auto unrolled_shape = UnrollNestedTuple(shape); + const auto unrolled_strides = UnrollNestedTuple(strides); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } + + // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`. + using DeducedStrides = + std::conditional_t>, + remove_cvref_t, + Strides>; + using FlattenDescriptorType = + remove_cvref_t; + using Descriptor1dType = + remove_cvref_t; + using DefaultIdxsTupleType = remove_cvref_t; + template - __host__ __device__ constexpr auto TransformDesc(const Tuple& shape, - const Tuple& idx) const + __host__ __device__ constexpr static auto + TransformDesc(const Tuple& shape, + const Tuple& idx, + const FlattenDescriptorType& naive_descriptor) { if constexpr(Tuple::Size() == I1) { // 1d idx path - return MakeMerge1d(shape, descriptor_); + return MakeMerge1d(shape, naive_descriptor); } else { @@ -207,56 +251,53 @@ struct Layout // Unroll while IdxDims is nested const auto aligned_shape = AlignShapeToIdx(shape, idx); // Transform correct form of shape - return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), descriptor_); + return CreateMergedDescriptor(aligned_shape, UnrollNestedTuple(idx), naive_descriptor); } } - template - __host__ __device__ static auto MakeNaiveDescriptor(const LayoutShape& shape, - const LayoutStrides& strides) - { - const auto unrolled_shape = UnrollNestedTuple(shape); - const auto unrolled_strides = UnrollNestedTuple(strides); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); - } + using MergedNestsDescriptorType = remove_cvref_t; public: - // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`. - using DeducedStrides = - std::conditional_t>, - remove_cvref_t, - Strides>; - using NaiveDescriptorType = - remove_cvref_t; + __host__ __device__ constexpr auto GetElementSpaceSize() const + { + return flatten_descriptor_.GetElementSpaceSize(); + } + __host__ __device__ Layout() = delete; /** * \brief Layout constructor. * * \param shape Shape for layout. * \param strides Strides for layout (optional if tensor is packed). - * \return Layout object. */ - __host__ __device__ Layout() = delete; - __host__ __device__ Layout(const Shape& shape, const Strides& strides) : descriptor_{} + __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides) + : flatten_descriptor_{}, shape_(shape), strides_(strides) { // Construct if runtime mode - if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) + if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) { - shape_ = shape; - strides_ = strides; - descriptor_ = MakeNaiveDescriptor(shape_, strides_); + flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); + descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); + merged_nests_descriptor_ = + TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); } } - __host__ __device__ Layout(const Shape& shape) : descriptor_{} + /** + * \brief Layout constructor (with default packed column-major strides). + * + * \param shape Shape for layout. + */ + __host__ __device__ constexpr Layout(const Shape& shape) + : flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_)) { - if constexpr(!NaiveDescriptorType::IsKnownAtCompileTime()) + if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) { - shape_ = shape; - strides_ = GenerateColumnMajorPackedStrides(shape_); - descriptor_ = MakeNaiveDescriptor(shape_, strides_); + flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); + descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); + merged_nests_descriptor_ = + TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); } } @@ -269,7 +310,9 @@ struct Layout template __host__ __device__ constexpr index_t operator()() const { - using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{})); + static_assert(FlattenDescriptorType::IsKnownAtCompileTime(), + "Compiletime operator used on runtime layout."); + using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); } @@ -283,9 +326,22 @@ struct Layout template __host__ __device__ index_t operator()(const Tuple& Idx) const { - // Static to construct transformed_desc only once - static const auto transformed_desc = TransformDesc(shape_, Idx); - return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); + if constexpr(!IsNestedTuple(Tuple{}) && Tuple::Size() == 1) + { + // if 1d access + return descriptor_1d_.CalculateOffset(Idx); + } + else if constexpr(!IsNestedTuple(Tuple{}) && Tuple::Size() == Shape::Size()) + { + // if Shape::Size() access (merged nested shapes) + return merged_nests_descriptor_.CalculateOffset(UnrollNestedTuple(Idx)); + } + else + { + // Custom index, need to transform descriptor + const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_); + return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); + } } /** @@ -327,19 +383,51 @@ struct Layout * * \return Shape. */ - __host__ __device__ constexpr Shape GetShape() const { return shape_; } + __host__ __device__ constexpr const Shape& GetShape() const { return shape_; } /** * \brief Strides getter. * * \return Strides. */ - __host__ __device__ constexpr DeducedStrides GetStrides() const { return strides_; } + __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; } + + /** + * \brief Get default lengths (tuple filled with Shape length elements). + * + * \return Default lengths. + */ + __host__ __device__ constexpr auto GetDefaultLengthsTuple() const + { + return generate_tuple([&](auto i) { return GetLength(); }, Number{}); + } + + /** + * \brief Get default start idx (tuple filled with 0s of the same size as Shape). + * + * \return Default start idx. + */ + __host__ __device__ constexpr auto GetDefaultStartIdxs() const + { + return GenerateDefaultIdxsTuple(shape_); + } + + /** + * \brief Get default descriptor (with the same size as Shape) + * + * \return Default descriptor. + */ + __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor() + { + return merged_nests_descriptor_; + } private: - NaiveDescriptorType descriptor_; - Shape shape_; - DeducedStrides strides_; + FlattenDescriptorType flatten_descriptor_; + Descriptor1dType descriptor_1d_; + MergedNestsDescriptorType merged_nests_descriptor_; + const Shape shape_; + const DeducedStrides strides_; }; } // namespace wrapper diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp new file mode 100644 index 0000000000..4ec6498fbc --- /dev/null +++ b/include/ck/wrapper/tensor.hpp @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "utils/tensor_utils.hpp" +#include "utils/layout_utils.hpp" + +namespace ck { +namespace wrapper { + +/** + * \brief Tensor wrapper that performs static and dynamic buffer logic. + * + * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). + * \tparam ElementType Element data type. + * \tparam Shape Tensor shape (layout component). + * \tparam Strides Tensor strides (layout component). + * \tparam NumVectors Number of vectors (only for VGPR, SGPR). + * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). + */ +template +struct Tensor +{ + private: + // Check if Tuple contains Slice object + template + constexpr static bool IsSlicing(T&&) + { + return is_detected::value; + } + template + constexpr static bool IsSlicing(Tuple&&) + { + return (IsSlicing(Ts{}) || ...); + } + + // Calculate first index of new tensor after slice + // It is needed to calculate offset for new tensor + template + constexpr auto GetStartIdxForSlicedTensor(const Tuple& idx) const + { + const auto start_idx_for_sliced_tensor = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + return GetStartIdxForSlicedTensor(idx.At(num_i)); + } + else if constexpr(is_detected>>::value) + { + // if slice, return the beginning of the interval + return idx.At(num_i).from_; + } + else + { + // if one dim selected + return idx.At(num_i); + } + }, + Number::Size()>{}); + + return start_idx_for_sliced_tensor; + } + + // Calculate new tensor shape after slice + template + constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, + const ShapeTmpType& shape) const + { + // Pack each value in tuple to remove empty tuples after generation + auto new_shape = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + if constexpr(!IsSlicing(tuple_element_t>{})) + { + // if tuple does not have any slice then we can remove dimension + return Tuple<>{}; + } + else + { + // if tuple then recurrence + return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i))); + } + } + else if constexpr(is_detected>>::value) + { + // calculate new dimension + const auto& dim = size(shape.At(num_i)); + const auto val = idx.At(num_i).range(dim); + return make_tuple(val); + } + else + { + // remove dimension for just value + return Tuple<>{}; + } + }, + Number::Size()>{}); + // Remove empty tuples (deleted elements) and return + return UnrollNestedTuple<0, 1>(new_shape); + } + + template + constexpr auto GetStridesFromSlicedTensor(const Tuple& idx, + const StridesTmpType& strides) const + { + // Pack each value in tuple to remove empty tuples after generation + auto new_strides = generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + if constexpr(!IsSlicing(tuple_element_t>{})) + { + // if tuple does not have any slice then we can remove dimension + return Tuple<>{}; + } + else + { + // if tuple then recurrence + return make_tuple( + GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i))); + } + } + else if constexpr(is_detected>>::value) + { + // Stride will be the same + return make_tuple(strides.At(num_i)); + } + else + { + // remove dimension for just value + return Tuple<>{}; + } + }, + Number::Size()>{}); + // Remove empty tuples (deleted elements) and return + return UnrollNestedTuple<0, 1>(new_strides); + } + + public: + using ElementSpaceSize = decltype(Layout{ + Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer + using TensorElementType = ElementType; // DataType + + static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; + static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr || + BufferAddressSpace == MemoryTypeEnum ::Vgpr); + + __host__ __device__ Tensor() = delete; + __host__ __device__ Tensor(ElementType* pointer, const Layout& layout) + : layout_(layout), + buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())) + { + } + + __host__ __device__ Tensor(const Layout& layout) : layout_(layout) + { + static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); + } + + __host__ __device__ constexpr const Layout& GetLayout() const + { + return layout_; + } + + // Getter for new sliced tensor + template {}), bool> = false> + __host__ __device__ auto operator[](const Tuple& idx) const + { + static_assert(IsDynamicBuffer, "Register slice is not supported"); + // Calculate offset based on first idx for new tensor + const index_t offset = layout_(GetStartIdxForSlicedTensor(idx)); + + auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape()); + if constexpr(is_same_v>) + { + auto new_layout = make_layout(new_shape); + return make_tensor(buffer_.p_data_ + offset, new_layout); + } + else + { + auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides()); + auto new_layout = make_layout(new_shape, new_strides); + return make_tensor(buffer_.p_data_ + offset, new_layout); + } + } + + template {}), bool> = false> + __host__ __device__ auto operator()(const Tuple& idx) const + { + return this->operator[](idx); + } + + template {}), bool> = false> + __host__ __device__ auto operator()(Idxs... idxs) const + { + return this->operator[](make_tuple(idxs...)); + } + + // Getter for the const value + template {}), bool> = false> + __host__ __device__ const ElementType& operator[](const Tuple& idx) const + { + if constexpr(IsDynamicBuffer) + { + const index_t offset = layout_(idx); + return buffer_[offset]; + } + else + { + if constexpr(is_same_v>) + { + constexpr index_t offset = + Layout{Shape{}}.template operator()>(); + return buffer_[Number{}]; + } + else + { + constexpr index_t offset = + Layout{Shape{}, Strides{}}.template operator()>(); + return buffer_[Number{}]; + } + } + } + + template {}), bool> = false> + __host__ __device__ const ElementType& operator()(const Tuple& idx) const + { + return this->operator[](idx); + } + + template {}), bool> = false> + __host__ __device__ const ElementType& operator()(Idxs... idxs) const + { + return this->operator[](make_tuple(idxs...)); + } + + // Getter for the value reference + template {}), bool> = false> + __host__ __device__ ElementType& operator[](const Tuple& idx) + { + if constexpr(IsDynamicBuffer) + { + const index_t offset = layout_(idx); + return buffer_(offset); + } + else + { + if constexpr(is_same_v>) + { + constexpr index_t offset = + Layout{Shape{}}.template operator()>(); + return buffer_(Number{}); + } + else + { + constexpr index_t offset = + Layout{Shape{}, Strides{}}.template operator()>(); + return buffer_(Number{}); + } + } + } + + template {}), bool> = false> + __host__ __device__ ElementType& operator()(const Tuple& idx) + { + return this->operator[](idx); + } + + template {}), bool> = false> + __host__ __device__ ElementType& operator()(Idxs... idxs) + { + return this->operator[](make_tuple(idxs...)); + } + + __host__ __device__ constexpr auto GetDefaultDescriptor() + { + return layout_.GetDefaultDescriptor(); + } + + private: + using DynamicBufferType = DynamicBuffer; + using StaticBufferType = + StaticBufferTupleOfVector; + // If register use static buffer, else use dynamic buffer + using Buffer = std::conditional_t; + + const Layout layout_; + Buffer buffer_; +}; + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp similarity index 86% rename from include/ck/wrapper/layout_utils.hpp rename to include/ck/wrapper/utils/layout_utils.hpp index fac8f33854..5df9dd7dea 100644 --- a/include/ck/wrapper/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -22,7 +22,7 @@ namespace wrapper { // Disable from doxygen docs generation /// @cond // forward declaration -template > +template struct Layout; template @@ -52,13 +52,23 @@ __host__ __device__ constexpr Layout make_layout(const Shape& sh * \return Constructed layout. */ template -__host__ __device__ constexpr Layout make_layout(const Shape& shape) +__host__ __device__ constexpr Layout> make_layout(const Shape& shape) { - return Layout(shape); + return Layout>(shape); } // Layout helpers // get +// Get dim (could be returned from get with empty Idxs) +/** + * \private + */ +template +__host__ __device__ T constexpr get(const T& dim) +{ + return dim; +} + /** * \brief Get element from tuple (Shape/Strides/Idxs). * @@ -82,7 +92,8 @@ __host__ __device__ constexpr auto get(const Tuple& tuple) template __host__ __device__ constexpr auto get(const Layout& layout) { - const auto new_shape = get(layout.GetShape()); + const auto& shape = layout.GetShape(); + const auto& new_shape = get(shape); static_assert(is_detected::value, "Shape of sub layout must be tuple"); if constexpr(is_same_v>) @@ -92,7 +103,8 @@ __host__ __device__ constexpr auto get(const Layout& layout) } else { - const auto new_strides = get(layout.GetStrides()); + const auto& strides = layout.GetStrides(); + const auto& new_strides = get(strides); static_assert(is_detected::value, "Strides of sub layout must be tuple"); return make_layout(new_shape, new_strides); @@ -113,11 +125,21 @@ __host__ __device__ constexpr auto get(const T& elem) } // size +// Get dim size (could be returned from get function) +/** + * \private + */ +template +__host__ __device__ T constexpr size(const T& dim) +{ + return dim; +} + /** * \brief Length get (product if tuple). * * \tparam idx Index to lookup. - * \param layout Layout to get Shape. + * \param layout Layout to get Shape of. * \return Requsted length. */ template @@ -140,16 +162,6 @@ __host__ __device__ constexpr index_t size(const Tuple& shape) unrolled_shape); } -// Get dim size (could be returned from get function) -/** - * \private - */ -template -__host__ __device__ T constexpr size(const T& dim) -{ - return dim; -} - /** * \brief Layout size (product of dims). * @@ -178,14 +190,15 @@ __host__ __device__ constexpr index_t size(const Tuple& tuple) /** * \brief Hierarchical size. * - * \tparam Idxs Indexes to lookup. + * \tparam Idx First index to lookup (to avoid empty Idxs). + * \tparam Idxs Next indexes to lookup. * \param elem Element to lookup. * \return Requsted element. */ -template +template __host__ __device__ constexpr auto size(const T& elem) { - return size(get(elem)); + return size(get(elem)); } // rank @@ -251,7 +264,8 @@ __host__ __device__ constexpr auto rank(const T& elem) template __host__ __device__ constexpr auto depth(const Layout& layout) { - return TupleDepth(layout.GetShape()); + const auto& shape = layout.GetShape(); + return TupleDepth(shape); } /** @@ -296,11 +310,11 @@ __host__ __device__ constexpr auto depth(const T& elem) /** * \brief Get Layout strides. * - * \param layout Layout to get strides. + * \param layout Layout to get strides from. * \return Requsted strides. */ template -__host__ __device__ constexpr auto stride(const Layout& layout) +__host__ __device__ constexpr const auto& stride(const Layout& layout) { return layout.GetStrides(); } @@ -308,11 +322,11 @@ __host__ __device__ constexpr auto stride(const Layout& layout) /** * \brief Get Layout shape. * - * \param layout Layout to get shape. + * \param layout Layout to get shape from. * \return Requsted shape. */ template -__host__ __device__ constexpr auto shape(const Layout& layout) +__host__ __device__ constexpr const auto& shape(const Layout& layout) { return layout.GetShape(); } diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp new file mode 100644 index 0000000000..5f0dc3e500 --- /dev/null +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +#include "ck/utility/number.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/tuple_helper.hpp" +#include "ck/utility/dynamic_buffer.hpp" +#include "ck/utility/amd_address_space.hpp" + +namespace ck { +namespace wrapper { + +/** + * \brief Memory type, allowed members: + * - Generic, + * - Global, + * - LDS, + * - SGPR, + * - VGPR, + */ +using MemoryTypeEnum = AddressSpaceEnum; + +// Disable from doxygen docs generation +/// @cond +// forward declarations +template +struct Layout; +template + +struct Tensor; + +template +struct Slice +{ + __host__ __device__ constexpr Slice() : from_(), to_() {} + __host__ __device__ constexpr Slice(FromType from, ToType to) : from_(from), to_(to) {} + + template + __host__ __device__ constexpr auto range(const T& dim) const + { + if constexpr(is_same_v || is_same_v || + is_same_v) + { + assert(dim >= to_ && from_ >= 0 && (to_ < 0 || to_ > from_) && "Invalid range"); + if(to_ < 0) + { + return dim - from_ + to_ + 1; + } + else + { + // workaround if one end of the interval is index_t and the second one is Number + return static_cast(to_) - static_cast(from_); + } + } + else + { + static_assert(dim >= to_ && from_ >= Number<0>{} && (to_ < 0 || to_ > from_), + "Invalid range"); + if constexpr(to_ < 0) + { + return dim - from_ + to_ + Number<1>{}; + } + else + { + return to_ - from_; + } + } + } + + __host__ __device__ static constexpr bool IsSlice() { return true; } + + const FromType from_; + const ToType to_; +}; + +template +using is_slice = decltype(std::declval().IsSlice()); + +template +using is_tuple = decltype(std::declval().IsTuple()); +/// @endcond + +/** + * \brief Make tensor function. + * + * \tparam MemoryType Type of memory. + * \param pointer Pointer to the memory. + * \param layout Tensor layout. + * \return Constructed tensor. + */ +template +constexpr auto make_tensor(ElementType* pointer, const Layout& layout) +{ + return Tensor( + pointer, layout); +} + +/** + * \brief Make SGPR or VGPR tensor function. + * + * \tparam MemoryType Type of memory. + * \tparam NumVectors Number of vectors. + * \tparam ScalarPerVector Scalars per vector. + * \tparam ElementType Memory data type. + * \param layout Tensor layout. + * \return Constructed tensor. + */ +template +constexpr auto make_register_tensor(const Layout& layout) +{ + static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported"); + return Tensor(layout); +} + +/** + * \brief Get Tensor Layout. + * + * \param tensor Tensor to get layout of. + * \return Requsted layout. + */ +template +__host__ __device__ constexpr const auto& +layout(const Tensor& + tensor) +{ + return tensor.GetLayout(); +} + +/** + * \brief Product of tensor shape dims. + * + * \tparam Idxs Indexes to access specific shape dim (optional). + * \param tensor Tensor to get Shape of. + * \return Requsted size. + */ +template +__host__ __device__ constexpr index_t +size(const Tensor& + tensor) +{ + return size(tensor.GetLayout()); +} + +/** + * \brief Rank of Shape tuple. + * + * \tparam Idxs Indexes to access specific shape dim (optional). + * \param tensor Tensor to get rank of. + * \return Requsted rank. + */ +template +__host__ __device__ constexpr index_t +rank(const Tensor& + tensor) +{ + return rank(tensor.GetLayout()); +} + +/** + * \brief Depth of Shape tuple. + * + * \tparam Idxs Indexes to access specific shape dim (optional). + * \param tensor Tensor to get depth of. + * \return Requsted depth. + */ +template +__host__ __device__ constexpr index_t +depth(const Tensor& + tensor) +{ + return depth(tensor.GetLayout()); +} + +/** + * \brief Get Tensor strides. + * + * \param tensor Tensor to get strides from. + * \return Requsted strides. + */ +template +__host__ __device__ constexpr const auto& +stride(const Tensor& + tensor) +{ + return stride(tensor.GetLayout()); +} + +/** + * \brief Get Tensor shape. + * + * \param tensor Tensor to get shape from. + * \return Requsted shape. + */ +template +__host__ __device__ constexpr const auto& +shape(const Tensor& + tensor) +{ + return shape(tensor.GetLayout()); +} + +/** + * \brief Get dim slice. + * + * \param from Beginning of the interval. + * \param to End of the interval. (could be also negative to index from the end) + * \return Requested slice. Could be used to create sliced tensor from other tensor. + */ +template +constexpr auto slice(const FromType from, const ToType to) +{ + return Slice(from, to); +} + +/** + * \brief Get dim slice. (Assumed that from is equal to 1) + * + * \param to End of the interval. (could be also negative to index from the end) + * \return Requested slice. Could be used to create sliced tensor from other tensor. + */ +template +constexpr auto slice(const ToType to) +{ + if constexpr(is_same_v) + { + return Slice(0, to); + } + else + { + return Slice, ToType>(Number<0>{}, to); + } +} + +/** + * \brief Get whole dim slice (from = 0, to = -1). + * + * \return Requested slice. Could be used to create sliced tensor from other tensor. + */ +constexpr auto slice() { return Slice, Number<-1>>(Number<0>{}, Number<-1>{}); } + +} // namespace wrapper +} // namespace ck diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt index e25ef176dd..6b25c08a8a 100644 --- a/test/wrapper/CMakeLists.txt +++ b/test/wrapper/CMakeLists.txt @@ -1,2 +1,4 @@ add_gtest_executable(test_layout test_layout.cpp) target_link_libraries(test_layout PRIVATE utility) +add_gtest_executable(test_tensor test_tensor.cpp) +target_link_libraries(test_tensor PRIVATE utility) diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp index 7d09696fbb..14a8b96462 100644 --- a/test/wrapper/test_layout.cpp +++ b/test/wrapper/test_layout.cpp @@ -433,17 +433,17 @@ TEST(TestLayoutHelpers, ShapeAndStrides) ck::wrapper::make_layout(shape_compiletime, strides_compiletime); constexpr bool check_compiletime_shape = - std::is_same_v::type, - decltype(shape(layout_compiletime))>; + std::is_same_v>; constexpr bool check_compiletime_strides = - std::is_same_v::type, - decltype(stride(layout_compiletime))>; + std::is_same_v>; constexpr bool check_runtime_shape = - std::is_same_v::type, - decltype(shape(layout_runtime))>; + std::is_same_v>; constexpr bool check_runtime_strides = - std::is_same_v::type, - decltype(stride(layout_runtime))>; + std::is_same_v>; EXPECT_TRUE(check_compiletime_shape); EXPECT_TRUE(check_compiletime_strides); EXPECT_TRUE(check_runtime_shape); diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp new file mode 100644 index 0000000000..92f8e2e1bd --- /dev/null +++ b/test/wrapper/test_tensor.cpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/library/utility/device_memory.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +#include "ck/utility/common_header.hpp" + +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" + +// Compare data in tensor with offset from layout. +// Data and offset should match if physical memory has been initialized with +// sequentially increasing values from 0. +template +__host__ __device__ bool TestTensorCheck3d(TensorType& tensor) +{ + const auto& layout = ck::wrapper::layout(tensor); + for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++) + { + for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++) + { + for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++) + { + const auto idx = ck::make_tuple(ck::make_tuple(d, h), w); + if(tensor(idx) != layout(idx)) + { + return false; + } + } + } + } + return true; +} + +template +__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0) +{ + const auto& layout = ck::wrapper::layout(tensor); + for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++) + { + if(tensor(w) - start_offset != layout(ck::make_tuple(w))) + { + return false; + } + } + return true; +} + +template +__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor) +{ + const auto& layout = ck::wrapper::layout(tensor); + bool success = true; + ck::static_for<0, nelems, 1>{}([&](auto w) { + if(tensor(ck::Number{}) != layout(ck::make_tuple(w.value))) + { + success = false; + } + }); + return success; +} + +template +__host__ __device__ void InitTensor(TensorType& tensor) +{ + for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++) + { + tensor(i) = i; + } +} + +template +__host__ __device__ void StaticInitTensor(TensorType& tensor) +{ + + ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number{}) = i.value; }); +} + +// Tests +TEST(TestTensor, ReadWriteHostMemory) +{ + constexpr ck::index_t nelems = 8; + + std::array data; + const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); + auto tensor = ck::wrapper::make_tensor(&data[0], layout); + InitTensor(tensor); + + EXPECT_TRUE(TestTensorCheck1d(tensor)); + EXPECT_TRUE(TestTensorCheck3d(tensor)); +} + +__global__ void TestTensorReadWriteDevice(void* data, void* success) +{ + constexpr ck::index_t nelems = 8; + constexpr ck::index_t scalar_per_vector = 1; + __shared__ ck::index_t p_shared[nelems]; + + ck::index_t* casted_data_ptr = static_cast(data); + bool* casted_success_ptr = static_cast(success); + + const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); + constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{})); + + auto tensor_global = + ck::wrapper::make_tensor(casted_data_ptr, layout); + auto tensor_lds = ck::wrapper::make_tensor(p_shared, layout); + auto tensor_vgpr = ck::wrapper::make_register_tensor(register_layout); + auto tensor_sgpr = ck::wrapper::make_register_tensor(register_layout); + + InitTensor(tensor_global); + InitTensor(tensor_lds); + StaticInitTensor(tensor_vgpr); + StaticInitTensor(tensor_sgpr); + + *casted_success_ptr &= TestTensorCheck1d(tensor_global); + *casted_success_ptr &= TestTensorCheck3d(tensor_global); + + *casted_success_ptr &= TestTensorCheck1d(tensor_lds); + *casted_success_ptr &= TestTensorCheck3d(tensor_lds); + + *casted_success_ptr &= StaticTestTensorCheck1d(tensor_vgpr); + + *casted_success_ptr &= StaticTestTensorCheck1d(tensor_sgpr); +} + +TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory) +{ + constexpr ck::index_t nelems = 8; + std::array host_data; + + DeviceMem data_buf(nelems * sizeof(ck::index_t)); + data_buf.ToDevice(&host_data[0]); + DeviceMem success_buf(sizeof(bool)); + + launch_and_time_kernel(StreamConfig{}, + TestTensorReadWriteDevice, + dim3(1), + dim3(1), + nelems * sizeof(ck::index_t), + data_buf.GetDeviceBuffer(), + success_buf.GetDeviceBuffer()); + + bool success; + success_buf.FromDevice(&success); + EXPECT_TRUE(success); +} + +TEST(TestTensor, Slicing) +{ + constexpr ck::index_t nelems = 8; + + std::array data; + const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2); + const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4); + const auto layout = ck::wrapper::make_layout(shape, strides); + auto tensor = ck::wrapper::make_tensor(&data[0], layout); + InitTensor(tensor); + + auto tensor2x2x2 = + tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2); + EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2); + EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8); + EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2)); + + auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2); + EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2); + EXPECT_EQ(ck::wrapper::size(tensor2x2), 4); + EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); + + auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2)); + EXPECT_EQ(rank(tensor1x1), 2); + EXPECT_EQ(depth(tensor1x1), 2); + EXPECT_EQ(size(tensor1x1), 1); + EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1)))); + + auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2)); + EXPECT_EQ(ck::wrapper::rank(tensor2), 1); + EXPECT_EQ(ck::wrapper::depth(tensor2), 1); + EXPECT_EQ(ck::wrapper::size(tensor2), 2); + EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0)))); + + // negative indexing + auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice()); + EXPECT_EQ(rank(tensor1x2), 2); + EXPECT_EQ(depth(tensor1x2), 2); + EXPECT_EQ(size(tensor1x2), 2); + EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); +} From 3246d1f693035929562240d8c73611345692bbbc Mon Sep 17 00:00:00 2001 From: abhimeda <138710508+abhimeda@users.noreply.github.com> Date: Fri, 15 Dec 2023 12:41:35 -0500 Subject: [PATCH 17/18] Adding Issue Template (#1094) * Add files via upload * fixed extra space typo * add mi300 GPU architectures and rocm versions 5.6.1 and 6.0.0 --------- Co-authored-by: illsilin Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/config.yml | 1 + .github/ISSUE_TEMPLATE/issue_report.yml | 221 ++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/issue_report.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..0086358db1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/issue_report.yml b/.github/ISSUE_TEMPLATE/issue_report.yml new file mode 100644 index 0000000000..ef6e6faa1b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_report.yml @@ -0,0 +1,221 @@ +name: Issue Report +description: File a report for ROCm related issues on Linux and Windows. For issues pertaining to documentation or non-bug related, please open a blank issue located below. +title: "[Issue]: " + +body: +- type: markdown + attributes: + value: | + Thank you for taking the time to fill out this report! + + You can acquire your OS, CPU, GPU (for filling out this report) with the following commands: + + Linux: + echo "OS:" && cat /etc/os-release | grep -E "^(NAME=|VERSION=)"; + echo "CPU: " && cat /proc/cpuinfo | grep "model name" | sort --unique; + echo "GPU:" && /opt/rocm/bin/rocminfo | grep -E "^\s*(Name|Marketing Name)"; + + Windows: + (Get-WmiObject Win32_OperatingSystem).Version + (Get-WmiObject win32_Processor).Name + (Get-WmiObject win32_VideoController).Name +- type: textarea + attributes: + label: Problem Description + description: Describe the issue you encountered. + validations: + required: true +- type: input + attributes: + label: Operating System + description: What is the name and version number of the OS? + placeholder: "e.g. Ubuntu 22.04.3 LTS (Jammy Jellyfish)" + validations: + required: true +- type: input + attributes: + label: CPU + description: What CPU did you encounter the issue on? + placeholder: "e.g. AMD Ryzen 9 5900HX with Radeon Graphics" + validations: + required: true +- type: dropdown + attributes: + label: GPU + description: What GPU(s) did you encounter the issue on (you can select multiple GPUs from the list) + multiple: true + options: + - AMD Instinct MI300X + - AMD Instinct MI300A + - AMD Instinct MI300 + - AMD Instinct MI250X + - AMD Instinct MI250 + - AMD Instinct MI210 + - AMD Instinct MI100 + - AMD Instinct MI50 + - AMD Instinct MI25 + - AMD Radeon Pro V620 + - AMD Radeon Pro VII + - AMD Radeon RX 7900 XTX + - AMD Radeon VII + - AMD Radeon Pro W7900 + - AMD Radeon Pro W7800 + - AMD Radeon Pro W6800 + - AMD Radeon Pro W6600 + - AMD Radeon Pro W5500 + - AMD Radeon RX 7900 XT + - AMD Radeon RX 7600 + - AMD Radeon RX 6950 XT + - AMD Radeon RX 6900 XT + - AMD Radeon RX 6800 XT + - AMD Radeon RX 6800 + - AMD Radeon RX 6750 + - AMD Radeon RX 6700 XT + - AMD Radeon RX 6700 + - AMD Radeon RX 6650 XT + - AMD Radeon RX 6600 XT + - AMD Radeon RX 6600 + - Other + validations: + required: true +- type: input + attributes: + label: Other + description: If you selected Other, please specify +- type: dropdown + attributes: + label: ROCm Version + description: What version(s) of ROCm did you encounter the issue on? + multiple: true + options: + - ROCm 6.0.0 + - ROCm 5.7.1 + - ROCm 5.7.0 + - ROCm 5.6.1 + - ROCm 5.6.0 + - ROCm 5.5.1 + - ROCm 5.5.0 + validations: + required: true +- type: dropdown + attributes: + label: ROCm Component + description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here. + multiple: true + options: + - Other + - AMD Common Language Runtime + - AMD MIGraphX + - AMD System Management Interface + - amdgpu KCL/autoconf + - amdgpu Kernel-mode GPU Driver + - amdgpu-install + - AOMP + - AOMP Extras + - AqlProfile + - build-infra + - chelsio + - clang-ocl + - Composable Kernel + - dkms + - docker / ROCm-docker + - flang + - gpuburn + - half + - HIP + - HIP Examples + - hipBLAS + - hipBLASLt + - HIPCC + - hipCUB + - hip-examples-private + - hipFFT + - hipfort + - HIPIFY + - hipRAND + - hipSOLVER + - hipSPARSE + - hipSPARSELt + - hipTensor + - hip-tests + - HSA Runtime + - infrastructure + - jenkins-utils + - libdrm + - Linux BPI packaging framework + - llvm-project + - Mesa + - meta + - MIOpen + - MIVisionX + - ml-framework-ci + - MLSEQA_TestRepo + - OpenCL API C++ Bindings + - OpenCL API Headers + - OpenCL Conformance Test Suite + - OpenCL ICD Loader + - perftest-p2p + - prototype + - RCCL + - rccl-rdma-sharp-plugins + - rocALUTION + - rocBLAS + - ROCdbgapi + - ROCdebug-agent + - rocFFT + - ROCgdb + - ROCK + - ROCm Documentation/Website + - ROCm Data Center Tool + - ROCm Examples + - ROCm for Windows + - ROCm Performance Primitives + - ROCm System Management Interface Library + - ROCm Thrust + - ROCm Validation Suite + - rocm_bandwidth_test + - rocm-cmake + - rocm-core + - rocm-docs-core + - rocminfo + - rocMLIR + - rocmtools + - rocPRIM + - rocprofiler + - rocRAND + - ROCR-Runtime + - rocSOLVER + - rocSPARSE + - roctracer + - ROCT-Thunk-Interface + - rocWMMA + - Tensile + - umr + - ibv_rc_pingpong-amd + - mellanox + - mpitest + - Pytorch + - Tensorflow + - APEX + - torchvision + - Magma +- type: textarea + attributes: + label: Steps to Reproduce + description: (Optional) Detailed steps to reproduce the issue. + validations: + required: false + +- type: textarea + attributes: + label: (Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support + description: The output of rocminfo --support could help to better address the problem. + validations: + required: false + +- type: textarea + attributes: + label: Additional Information + description: (Optional) Any additional information that is relevant, e.g. relevant environment variables, dockerfiles, log files, dmesg output (on Linux), etc. + validations: + required: false From dcedf3632f0e066c1712add65cb440622416363e Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Sat, 16 Dec 2023 09:17:40 -0800 Subject: [PATCH 18/18] Upgrade the default compiler to ROCm6.0 release. (#1103) * upgrade to rocm6.0 compiler * move rocm6.0 from private to public repo * switch to testing hipTensor mainline in CI --- Dockerfile | 6 +++--- Jenkinsfile | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 87b4eb8e2b..b9339ec5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 ARG DEBIAN_FRONTEND=noninteractive -ARG ROCMVERSION=5.7 +ARG ROCMVERSION=6.0 ARG compiler_version="" ARG compiler_commit="" @@ -16,8 +16,8 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb +RUN wget https://repo.radeon.com/amdgpu-install/6.0/ubuntu/focal/amdgpu-install_6.0.60000-1_all.deb --no-check-certificate +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.0.60000-1_all.deb RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \ diff --git a/Jenkinsfile b/Jenkinsfile index 8f661e4780..2bb48b85ce 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -33,7 +33,7 @@ def runShell(String command){ def getDockerImageName(){ def img - if (params.ROCMVERSION != "6.0"){ + if (params.ROCMVERSION != "6.1"){ if (params.COMPILER_VERSION == "") { img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" } @@ -655,8 +655,8 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION= - 0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT= +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.0;COMPILER_VERSION= + 0 21 * * * % ROCMVERSION=6.0;COMPILER_VERSION=;COMPILER_COMMIT= 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=;USE_SCCACHE=false 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;COMPILER_COMMIT=;USE_SCCACHE=false''' : "" @@ -675,8 +675,8 @@ pipeline { description: "Force building docker image (default: false), set to true if docker image needs to be updated.") string( name: 'ROCMVERSION', - defaultValue: '5.7', - description: 'Specify which ROCM version to use: 5.7 (default).') + defaultValue: '6.0', + description: 'Specify which ROCM version to use: 6.0 (default).') string( name: 'COMPILER_VERSION', defaultValue: '', @@ -703,8 +703,8 @@ pipeline { description: "Use the CK build to verify hipTensor build and tests (default: ON)") string( name: 'hipTensor_branch', - defaultValue: 'develop', - description: 'Specify which branch of hipTensor to use (default: develop)') + defaultValue: 'mainline', + description: 'Specify which branch of hipTensor to use (default: mainline)') booleanParam( name: "USE_SCCACHE", defaultValue: true,